Mirror of https://github.com/ralsina/tartrazine.git (synced 2024-09-19 23:11:22 +00:00)
Added constants for token abbrev

parent 82db232511
commit a9ff9bc8ac
scripts/token_abbrevs.py (new file, 24 lines)
@@ -0,0 +1,24 @@
import sys
import string

# Run it as: grep token lexers/* | python scripts/token_abbrevs.py


def abbr(line):
    return "".join(c for c in line if c in string.ascii_uppercase).lower()

abbrevs = {}
tokens = set([])
for line in sys.stdin:
    if "<token" not in line:
        continue
    line = line.strip()
    line = line.split('<token ', 1)[-1]
    line = line.split('"')[1]
    abbrevs[line] = abbr(line)
    tokens.add(line)

print("Abbreviations: {")
for k, v in abbrevs.items():
    print(f'    "{k}" => "{v}",')
print("}")
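The script reads the lexer XML definitions on stdin, keeps only lines containing "<token", extracts the first quoted attribute value (the token type), and abbreviates it by keeping its uppercase letters and lowercasing them. Piping in lines such as <token type="NameFunction"/> and <token type="Keyword"/> should print roughly the following, in Crystal hash-literal syntax ready to paste into src/constants.cr (the exact indentation of the output is an assumption, since whitespace was collapsed in this view):

Abbreviations: {
    "NameFunction" => "nf",
    "Keyword" => "k",
}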
src/constants.cr (new file, 91 lines)
@@ -0,0 +1,91 @@
module Tartrazine
  Abbreviations = {
    "Text" => "t",
    "CommentSingle" => "cs",
    "CommentSpecial" => "cs",
    "NameVariable" => "nv",
    "Keyword" => "k",
    "NameFunction" => "nf",
    "Punctuation" => "p",
    "Operator" => "o",
    "LiteralNumberInteger" => "lni",
    "NameBuiltin" => "nb",
    "Name" => "n",
    "OperatorWord" => "ow",
    "LiteralStringSingle" => "lss",
    "Literal" => "l",
    "NameClass" => "nc",
    "CommentMultiline" => "cm",
    "LiteralStringRegex" => "lsr",
    "KeywordDeclaration" => "kd",
    "KeywordConstant" => "kc",
    "NameOther" => "no",
    "LiteralNumberFloat" => "lnf",
    "LiteralNumberHex" => "lnh",
    "LiteralStringDouble" => "lsd",
    "KeywordType" => "kt",
    "NameNamespace" => "nn",
    "NameAttribute" => "na",
    "KeywordReserved" => "kr",
    "CommentPreproc" => "cp",
    "KeywordNamespace" => "kn",
    "NameConstant" => "nc",
    "NameLabel" => "nl",
    "LiteralString" => "ls",
    "LiteralStringChar" => "lsc",
    "TextWhitespace" => "tw",
    "LiteralStringEscape" => "lse",
    "LiteralNumber" => "ln",
    "Other" => "o",
    "LiteralStringBoolean" => "lsb",
    "NameProperty" => "np",
    "Comment" => "c",
    "NameTag" => "nt",
    "LiteralStringOther" => "lso",
    "NameVariableGlobal" => "nvg",
    "NameBuiltinPseudo" => "nbp",
    "LiteralNumberBin" => "lnb",
    "KeywordPseudo" => "kp",
    "CommentPreprocFile" => "cpf",
    "LiteralStringAffix" => "lsa",
    "LiteralStringDelimiter" => "lsd",
    "LiteralNumberOct" => "lno",
    "Error" => "e",
    "Generic" => "g",
    "LiteralNumberIntegerLong" => "lnil",
    "NameDecorator" => "nd",
    "LiteralStringInterpol" => "lsi",
    "LiteralStringBacktick" => "lsb",
    "GenericPrompt" => "gp",
    "GenericOutput" => "go",
    "LiteralStringName" => "lsn",
    "LiteralStringHeredoc" => "lsh",
    "LiteralStringSymbol" => "lss",
    "NameVariableInstance" => "nvi",
    "LiteralOther" => "lo",
    "NameVariableClass" => "nvc",
    "NameOperator" => "no",
    "None" => "n",
    "LiteralStringDoc" => "lsd",
    "NameException" => "ne",
    "GenericSubheading" => "gs",
    "GenericStrong" => "gs",
    "GenericDeleted" => "gd",
    "GenericInserted" => "gi",
    "GenericHeading" => "gh",
    "NameEntity" => "ne",
    "NamePseudo" => "np",
    "CommentHashbang" => "ch",
    "TextPunctuation" => "tp",
    "NameVariableAnonymous" => "nva",
    "NameVariableMagic" => "nvm",
    "NameFunctionMagic" => "nfm",
    "GenericEmph" => "ge",
    "GenericUnderline" => "gu",
    "LiteralStringAtom" => "lsa",
    "LiteralDate" => "ld",
    "GenericError" => "ge",
    "TextSymbol" => "ts",
    "NameKeyword" => "nk",
  }
end
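Nothing in this commit reads the map yet. Below is a minimal lookup sketch, assuming the abbreviations are meant as short class-style names for token types in rendered output (note also that a few distinct types share an abbreviation, e.g. "CommentSingle" and "CommentSpecial" both map to "cs"):

# Hypothetical lookup, not part of this commit: fall back to the
# full token type name when no abbreviation is defined for it.
short = Tartrazine::Abbreviations.fetch("NameFunction", "NameFunction")
puts short # => "nf"

Using Hash#fetch with a default keeps unknown token types from raising at lookup time.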
(existing file, path not shown)
@@ -54,25 +54,34 @@ module Tartrazine
 
     property state_stack = ["root"]
 
-    # Turn the text into a list of tokens.
+    # Turn the text into a list of tokens. The `usingself` parameter
+    # is true when the lexer is being used to tokenize a string
+    # from a larger text that is already being tokenized.
+    # So, when it's true, we don't modify the text.
     def tokenize(text, usingself = false) : Array(Token)
       @state_stack = ["root"]
       tokens = [] of Token
       pos = 0
       matched = false
+
+      # Respect the `ensure_nl` config option
       if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
         text += "\n"
       end
+
+      # Loop through the text, applying rules
       while pos < text.size
         state = states[@state_stack.last]
         Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
         state.rules.each do |rule|
           matched, new_pos, new_tokens = rule.match(text, pos, self)
           if matched
+            # Move position forward, save the tokens,
+            # tokenize from the new position
             Log.trace { "MATCHED: #{rule.xml}" }
             pos = new_pos
             tokens += new_tokens
-            break # We go back to processing with current state
+            break
           end
           Log.trace { "NOT MATCHED: #{rule.xml}" }
         end
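The new comments describe two behaviours of tokenize: with the ensure_nl config option enabled, a trailing newline is appended before scanning, and usingself suppresses that when the lexer is re-tokenizing a fragment of text it is already processing. A minimal call-site sketch follows, assuming a lexer instance is obtained elsewhere (the helper used to build it is hypothetical and not part of this diff):

# Hypothetical: how a lexer instance is built is not shown in this commit.
lexer = Tartrazine.lexer("crystal")

# Normal entry point; ensure_nl may append "\n" before the rule loop runs.
tokens = lexer.tokenize("puts 1")

# Re-entrant call from within another pass: usingself = true leaves the
# fragment untouched (no newline appended).
inner = lexer.tokenize("x", usingself: true)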