Added constants for token abbrev

Roberto Alsina 2024-08-06 20:13:23 -03:00
parent 82db232511
commit a9ff9bc8ac
3 changed files with 126 additions and 2 deletions

scripts/token_abbrevs.py (new file, +24)

@@ -0,0 +1,24 @@
import sys
import string

# Run it as: grep token lexers/* | python scripts/token_abbrevs.py


def abbr(line):
    # Keep only the capital letters of a token name, lowercased:
    # "LiteralNumberInteger" -> "lni".
    return "".join(c for c in line if c in string.ascii_uppercase).lower()


abbrevs = {}
tokens = set([])  # distinct token names seen (collected but not printed)
for line in sys.stdin:
    if "<token" not in line:
        continue
    # The token name is the first quoted string after "<token ".
    line = line.strip()
    line = line.split('<token ', 1)[-1]
    line = line.split('"')[1]
    abbrevs[line] = abbr(line)
    tokens.add(line)

# Emit the mapping as a Crystal hash literal, ready to paste into source.
print("Abbreviations: {")
for k, v in abbrevs.items():
    print(f'    "{k}" => "{v}",')
print("}")

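For reference, feeding the script hypothetical grep output lines such as lexers/foo.xml:<token type="Keyword"/> and lexers/foo.xml:<token type="LiteralNumberInteger"/> prints a Crystal hash literal (the format follows directly from the print calls above, and matches the shape of src/constants.cr below):

Abbreviations: {
    "Keyword" => "k",
    "LiteralNumberInteger" => "lni",
}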
src/constants.cr (new file, +91)

@@ -0,0 +1,91 @@
module Tartrazine
  Abbreviations = {
    "Text" => "t",
    "CommentSingle" => "cs",
    "CommentSpecial" => "cs",
    "NameVariable" => "nv",
    "Keyword" => "k",
    "NameFunction" => "nf",
    "Punctuation" => "p",
    "Operator" => "o",
    "LiteralNumberInteger" => "lni",
    "NameBuiltin" => "nb",
    "Name" => "n",
    "OperatorWord" => "ow",
    "LiteralStringSingle" => "lss",
    "Literal" => "l",
    "NameClass" => "nc",
    "CommentMultiline" => "cm",
    "LiteralStringRegex" => "lsr",
    "KeywordDeclaration" => "kd",
    "KeywordConstant" => "kc",
    "NameOther" => "no",
    "LiteralNumberFloat" => "lnf",
    "LiteralNumberHex" => "lnh",
    "LiteralStringDouble" => "lsd",
    "KeywordType" => "kt",
    "NameNamespace" => "nn",
    "NameAttribute" => "na",
    "KeywordReserved" => "kr",
    "CommentPreproc" => "cp",
    "KeywordNamespace" => "kn",
    "NameConstant" => "nc",
    "NameLabel" => "nl",
    "LiteralString" => "ls",
    "LiteralStringChar" => "lsc",
    "TextWhitespace" => "tw",
    "LiteralStringEscape" => "lse",
    "LiteralNumber" => "ln",
    "Other" => "o",
    "LiteralStringBoolean" => "lsb",
    "NameProperty" => "np",
    "Comment" => "c",
    "NameTag" => "nt",
    "LiteralStringOther" => "lso",
    "NameVariableGlobal" => "nvg",
    "NameBuiltinPseudo" => "nbp",
    "LiteralNumberBin" => "lnb",
    "KeywordPseudo" => "kp",
    "CommentPreprocFile" => "cpf",
    "LiteralStringAffix" => "lsa",
    "LiteralStringDelimiter" => "lsd",
    "LiteralNumberOct" => "lno",
    "Error" => "e",
    "Generic" => "g",
    "LiteralNumberIntegerLong" => "lnil",
    "NameDecorator" => "nd",
    "LiteralStringInterpol" => "lsi",
    "LiteralStringBacktick" => "lsb",
    "GenericPrompt" => "gp",
    "GenericOutput" => "go",
    "LiteralStringName" => "lsn",
    "LiteralStringHeredoc" => "lsh",
    "LiteralStringSymbol" => "lss",
    "NameVariableInstance" => "nvi",
    "LiteralOther" => "lo",
    "NameVariableClass" => "nvc",
    "NameOperator" => "no",
    "None" => "n",
    "LiteralStringDoc" => "lsd",
    "NameException" => "ne",
    "GenericSubheading" => "gs",
    "GenericStrong" => "gs",
    "GenericDeleted" => "gd",
    "GenericInserted" => "gi",
    "GenericHeading" => "gh",
    "NameEntity" => "ne",
    "NamePseudo" => "np",
    "CommentHashbang" => "ch",
    "TextPunctuation" => "tp",
    "NameVariableAnonymous" => "nva",
    "NameVariableMagic" => "nvm",
    "NameFunctionMagic" => "nfm",
    "GenericEmph" => "ge",
    "GenericUnderline" => "gu",
    "LiteralStringAtom" => "lsa",
    "LiteralDate" => "ld",
    "GenericError" => "ge",
    "TextSymbol" => "ts",
    "NameKeyword" => "nk",
  }
end

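The commit adds the hash without any call sites, and the mechanically generated scheme is not collision-free (for example, "CommentSingle" and "CommentSpecial" both map to "cs"). A plausible use is shrinking token-type names, for instance when emitting compact CSS classes; here is a minimal sketch under that assumption, where short_name is a hypothetical helper and not part of this commit:

require "./constants"

# Hypothetical helper: look up the abbreviation for a token type,
# falling back to the full name when none is defined.
def short_name(token_type : String) : String
  Tartrazine::Abbreviations.fetch(token_type, token_type)
end

puts short_name("LiteralNumberInteger") # => "lni"
puts short_name("SomethingElse")        # => "SomethingElse"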
(third changed file; name not shown in this view)

@@ -54,25 +54,34 @@ module Tartrazine
     property state_stack = ["root"]
-    # Turn the text into a list of tokens.
+    # Turn the text into a list of tokens. The `usingself` parameter
+    # is true when the lexer is being used to tokenize a string
+    # from a larger text that is already being tokenized.
+    # So, when it's true, we don't modify the text.
     def tokenize(text, usingself = false) : Array(Token)
       @state_stack = ["root"]
       tokens = [] of Token
       pos = 0
       matched = false
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
+        text += "\n"
+      end
       # Loop through the text, applying rules
       while pos < text.size
         state = states[@state_stack.last]
         Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
         state.rules.each do |rule|
           matched, new_pos, new_tokens = rule.match(text, pos, self)
           if matched
+            # Move position forward, save the tokens,
+            # tokenize from the new position
             Log.trace { "MATCHED: #{rule.xml}" }
             pos = new_pos
             tokens += new_tokens
-            break # We go back to processing with current state
+            break
           end
           Log.trace { "NOT MATCHED: #{rule.xml}" }
         end