diff --git a/scripts/token_abbrevs.py b/scripts/token_abbrevs.py
new file mode 100644
index 0000000..2b0c91b
--- /dev/null
+++ b/scripts/token_abbrevs.py
@@ -0,0 +1,24 @@
+import sys
+import string
+
+# Run it as grep token lexers/* | python scripts/token_abbrevs.py
+
+
+def abbr(line):
+    return "".join(c for c in line if c in string.ascii_uppercase).lower()
+
+abbrevs = {}
+tokens = set([])
+for line in sys.stdin:
+    if "<token" not in line:
+        continue
+    # The token type name is the first double-quoted field of the line
+    tokens.add(line.split('"')[1])
+
+for token in tokens:
+    abbrevs[token] = abbr(token)
+
+print("Abbreviations = {")
+for k, v in abbrevs.items():
+    print(f'    "{k}" => "{v}",')
+print("}")
diff --git a/src/constants.cr b/src/constants.cr
new file mode 100644
index 0000000..2479df9
--- /dev/null
+++ b/src/constants.cr
@@ -0,0 +1,91 @@
+module Tartrazine
+  Abbreviations = {
+    "Text"                     => "t",
+    "CommentSingle"            => "cs",
+    "CommentSpecial"           => "cs",
+    "NameVariable"             => "nv",
+    "Keyword"                  => "k",
+    "NameFunction"             => "nf",
+    "Punctuation"              => "p",
+    "Operator"                 => "o",
+    "LiteralNumberInteger"     => "lni",
+    "NameBuiltin"              => "nb",
+    "Name"                     => "n",
+    "OperatorWord"             => "ow",
+    "LiteralStringSingle"      => "lss",
+    "Literal"                  => "l",
+    "NameClass"                => "nc",
+    "CommentMultiline"         => "cm",
+    "LiteralStringRegex"       => "lsr",
+    "KeywordDeclaration"       => "kd",
+    "KeywordConstant"          => "kc",
+    "NameOther"                => "no",
+    "LiteralNumberFloat"       => "lnf",
+    "LiteralNumberHex"         => "lnh",
+    "LiteralStringDouble"      => "lsd",
+    "KeywordType"              => "kt",
+    "NameNamespace"            => "nn",
+    "NameAttribute"            => "na",
+    "KeywordReserved"          => "kr",
+    "CommentPreproc"           => "cp",
+    "KeywordNamespace"         => "kn",
+    "NameConstant"             => "nc",
+    "NameLabel"                => "nl",
+    "LiteralString"            => "ls",
+    "LiteralStringChar"        => "lsc",
+    "TextWhitespace"           => "tw",
+    "LiteralStringEscape"      => "lse",
+    "LiteralNumber"            => "ln",
+    "Other"                    => "o",
+    "LiteralStringBoolean"     => "lsb",
+    "NameProperty"             => "np",
+    "Comment"                  => "c",
+    "NameTag"                  => "nt",
+    "LiteralStringOther"       => "lso",
+    "NameVariableGlobal"       => "nvg",
+    "NameBuiltinPseudo"        => "nbp",
+    "LiteralNumberBin"         => "lnb",
+    "KeywordPseudo"            => "kp",
+    "CommentPreprocFile"       => "cpf",
+    "LiteralStringAffix"       => "lsa",
+    "LiteralStringDelimiter"   => "lsd",
+    "LiteralNumberOct"         => "lno",
+    "Error"                    => "e",
+    "Generic"                  => "g",
+    "LiteralNumberIntegerLong" => "lnil",
+    "NameDecorator"            => "nd",
+    "LiteralStringInterpol"    => "lsi",
+    "LiteralStringBacktick"    => "lsb",
+    "GenericPrompt"            => "gp",
+    "GenericOutput"            => "go",
+    "LiteralStringName"        => "lsn",
+    "LiteralStringHeredoc"     => "lsh",
+    "LiteralStringSymbol"      => "lss",
+    "NameVariableInstance"     => "nvi",
+    "LiteralOther"             => "lo",
+    "NameVariableClass"        => "nvc",
+    "NameOperator"             => "no",
+    "None"                     => "n",
+    "LiteralStringDoc"         => "lsd",
+    "NameException"            => "ne",
+    "GenericSubheading"        => "gs",
+    "GenericStrong"            => "gs",
+    "GenericDeleted"           => "gd",
+    "GenericInserted"          => "gi",
+    "GenericHeading"           => "gh",
+    "NameEntity"               => "ne",
+    "NamePseudo"               => "np",
+    "CommentHashbang"          => "ch",
+    "TextPunctuation"          => "tp",
+    "NameVariableAnonymous"    => "nva",
+    "NameVariableMagic"        => "nvm",
+    "NameFunctionMagic"        => "nfm",
+    "GenericEmph"              => "ge",
+    "GenericUnderline"         => "gu",
+    "LiteralStringAtom"        => "lsa",
+    "LiteralDate"              => "ld",
+    "GenericError"             => "ge",
+    "TextSymbol"               => "ts",
+    "NameKeyword"              => "nk",
+  }
+end
diff --git a/src/tartrazine.cr b/src/tartrazine.cr
index aa3bc87..8630a94 100644
--- a/src/tartrazine.cr
+++ b/src/tartrazine.cr
@@ -54,25 +54,34 @@ module Tartrazine
 
     property state_stack = ["root"]
 
-    # Turn the text into a list of tokens.
+    # Turn the text into a list of tokens. The `usingself` parameter
+    # is true when the lexer is being used to tokenize a string
+    # from a larger text that is already being tokenized.
+    # So, when it's true, we don't modify the text.
     def tokenize(text, usingself = false) : Array(Token)
       @state_stack = ["root"]
       tokens = [] of Token
       pos = 0
       matched = false
+
+      # Respect the `ensure_nl` config option
       if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
         text += "\n"
       end
+
+      # Loop through the text, applying rules
      while pos < text.size
         state = states[@state_stack.last]
         Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
         state.rules.each do |rule|
           matched, new_pos, new_tokens = rule.match(text, pos, self)
           if matched
+            # Move position forward, save the tokens,
+            # tokenize from the new position
             Log.trace { "MATCHED: #{rule.xml}" }
             pos = new_pos
             tokens += new_tokens
-            break # We go back to processing with current state
+            break
           end
           Log.trace { "NOT MATCHED: #{rule.xml}" }
         end
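
Note on the generated table (illustrative, not part of the patch): abbr() keeps only the ASCII capital letters of a token type name and lowercases them, so "LiteralNumberInteger" becomes "lni". The scheme is not collision-free; in the table above, "CommentSingle" and "CommentSpecial" both map to "cs", and "Operator" and "Other" both map to "o". A minimal Crystal sketch of the same rule, handy for spot-checking entries in Abbreviations (the abbr helper below is hypothetical, mirroring the Python script, not part of the project):

    # Hypothetical Crystal port of abbr() from scripts/token_abbrevs.py:
    # keep the capital letters of the token type name, lowercased.
    def abbr(token : String) : String
      token.chars.select(&.ascii_uppercase?).join.downcase
    end

    abbr("LiteralNumberInteger") # => "lni"
    abbr("NameFunction")         # => "nf"

With src/constants.cr loaded, the same lookups go through the generated hash, e.g. Tartrazine::Abbreviations["NameFunction"] returns "nf".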