From a9ff9bc8ac187857c7526c6c6ca5bfbcb6555622 Mon Sep 17 00:00:00 2001
From: Roberto Alsina <roberto.alsina@gmail.com>
Date: Tue, 6 Aug 2024 20:13:23 -0300
Subject: [PATCH] Added constants for token abbreviations

---
 scripts/token_abbrevs.py | 24 +++++++++++
 src/constants.cr         | 91 ++++++++++++++++++++++++++++++++++++++++
 src/tartrazine.cr        | 13 +++++-
 3 files changed, 126 insertions(+), 2 deletions(-)
 create mode 100644 scripts/token_abbrevs.py
 create mode 100644 src/constants.cr
diff --git a/scripts/token_abbrevs.py b/scripts/token_abbrevs.py
new file mode 100644
index 0000000..2b0c91b
--- /dev/null
+++ b/scripts/token_abbrevs.py
@@ -0,0 +1,24 @@
+import sys
+import string
+
+# Run it as: grep token lexers/* | python scripts/token_abbrevs.py
+# Prints a Crystal hash literal mapping each token name to its abbreviation.
+
+
+def abbr(line):
+    """Return the ASCII uppercase letters of `line`, joined and lowercased."""
+    return "".join(c for c in line if c in string.ascii_uppercase).lower()
+
+
+abbrevs = {}
+for line in sys.stdin:
+    if "<token" not in line:
+        continue
+    # Keep the text between the first pair of double quotes after `<token `.
+    name = line.strip().split("<token ", 1)[-1].split('"')[1]
+    abbrevs[name] = abbr(name)
+
+print("Abbreviations = {")  # `=` so the output is a valid Crystal constant
+for k, v in abbrevs.items():
+    print(f'    "{k}" => "{v}",')
+print("}")
diff --git a/src/constants.cr b/src/constants.cr
new file mode 100644
index 0000000..2479df9
--- /dev/null
+++ b/src/constants.cr
@@ -0,0 +1,91 @@
+module Tartrazine
+  Abbreviations = { # token name => its uppercase letters, lowercased; values are NOT unique
+    "Text"                     => "t",
+    "CommentSingle"            => "cs",
+    "CommentSpecial"           => "cs", # same abbreviation as CommentSingle
+    "NameVariable"             => "nv",
+    "Keyword"                  => "k",
+    "NameFunction"             => "nf",
+    "Punctuation"              => "p",
+    "Operator"                 => "o",
+    "LiteralNumberInteger"     => "lni",
+    "NameBuiltin"              => "nb",
+    "Name"                     => "n",
+    "OperatorWord"             => "ow",
+    "LiteralStringSingle"      => "lss",
+    "Literal"                  => "l",
+    "NameClass"                => "nc",
+    "CommentMultiline"         => "cm",
+    "LiteralStringRegex"       => "lsr",
+    "KeywordDeclaration"       => "kd",
+    "KeywordConstant"          => "kc",
+    "NameOther"                => "no",
+    "LiteralNumberFloat"       => "lnf",
+    "LiteralNumberHex"         => "lnh",
+    "LiteralStringDouble"      => "lsd",
+    "KeywordType"              => "kt",
+    "NameNamespace"            => "nn",
+    "NameAttribute"            => "na",
+    "KeywordReserved"          => "kr",
+    "CommentPreproc"           => "cp",
+    "KeywordNamespace"         => "kn",
+    "NameConstant"             => "nc", # same abbreviation as NameClass
+    "NameLabel"                => "nl",
+    "LiteralString"            => "ls",
+    "LiteralStringChar"        => "lsc",
+    "TextWhitespace"           => "tw",
+    "LiteralStringEscape"      => "lse",
+    "LiteralNumber"            => "ln",
+    "Other"                    => "o", # same abbreviation as Operator
+    "LiteralStringBoolean"     => "lsb",
+    "NameProperty"             => "np",
+    "Comment"                  => "c",
+    "NameTag"                  => "nt",
+    "LiteralStringOther"       => "lso",
+    "NameVariableGlobal"       => "nvg",
+    "NameBuiltinPseudo"        => "nbp",
+    "LiteralNumberBin"         => "lnb",
+    "KeywordPseudo"            => "kp",
+    "CommentPreprocFile"       => "cpf",
+    "LiteralStringAffix"       => "lsa",
+    "LiteralStringDelimiter"   => "lsd", # same abbreviation as LiteralStringDouble
+    "LiteralNumberOct"         => "lno",
+    "Error"                    => "e",
+    "Generic"                  => "g",
+    "LiteralNumberIntegerLong" => "lnil",
+    "NameDecorator"            => "nd",
+    "LiteralStringInterpol"    => "lsi",
+    "LiteralStringBacktick"    => "lsb", # same abbreviation as LiteralStringBoolean
+    "GenericPrompt"            => "gp",
+    "GenericOutput"            => "go",
+    "LiteralStringName"        => "lsn",
+    "LiteralStringHeredoc"     => "lsh",
+    "LiteralStringSymbol"      => "lss", # same abbreviation as LiteralStringSingle
+    "NameVariableInstance"     => "nvi",
+    "LiteralOther"             => "lo",
+    "NameVariableClass"        => "nvc",
+    "NameOperator"             => "no", # same abbreviation as NameOther
+    "None"                     => "n", # same abbreviation as Name
+    "LiteralStringDoc"         => "lsd", # same abbreviation as LiteralStringDouble
+    "NameException"            => "ne",
+    "GenericSubheading"        => "gs",
+    "GenericStrong"            => "gs", # same abbreviation as GenericSubheading
+    "GenericDeleted"           => "gd",
+    "GenericInserted"          => "gi",
+    "GenericHeading"           => "gh",
+    "NameEntity"               => "ne", # same abbreviation as NameException
+    "NamePseudo"               => "np", # same abbreviation as NameProperty
+    "CommentHashbang"          => "ch",
+    "TextPunctuation"          => "tp",
+    "NameVariableAnonymous"    => "nva",
+    "NameVariableMagic"        => "nvm",
+    "NameFunctionMagic"        => "nfm",
+    "GenericEmph"              => "ge",
+    "GenericUnderline"         => "gu",
+    "LiteralStringAtom"        => "lsa", # same abbreviation as LiteralStringAffix
+    "LiteralDate"              => "ld",
+    "GenericError"             => "ge", # same abbreviation as GenericEmph
+    "TextSymbol"               => "ts",
+    "NameKeyword"              => "nk",
+  }
+end
diff --git a/src/tartrazine.cr b/src/tartrazine.cr
index aa3bc87..8630a94 100644
--- a/src/tartrazine.cr
+++ b/src/tartrazine.cr
@@ -54,25 +54,34 @@ module Tartrazine
 
     property state_stack = ["root"]
 
-    # Turn the text into a list of tokens.
+    # Turn the text into a list of tokens. The `usingself` parameter
+    # is true when the lexer is being used to tokenize a string
+    # from a larger text that is already being tokenized.
+    # So, when it's true, we don't modify the text.
     def tokenize(text, usingself = false) : Array(Token)
       @state_stack = ["root"]
       tokens = [] of Token
       pos = 0
       matched = false
+
+      # Respect the `ensure_nl` config option
       if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
         text += "\n"
       end
+
+      # Loop through the text, applying rules
       while pos < text.size
         state = states[@state_stack.last]
         Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
         state.rules.each do |rule|
           matched, new_pos, new_tokens = rule.match(text, pos, self)
           if matched
+            # Move position forward, save the tokens,
+            # tokenize from the new position
             Log.trace { "MATCHED: #{rule.xml}" }
             pos = new_pos
             tokens += new_tokens
-            break # We go back to processing with current state
+            break
           end
           Log.trace { "NOT MATCHED: #{rule.xml}" }
         end