Refactor things into separate files for easier reading

2025-08-01 21:39:50 +00:00 · 2024-08-09 11:31:18 -03:00
parent a0ff4e0118
commit 47237eecc3
6 changed files with 277 additions and 270 deletions
--- a/src/formatter.cr
+++ b/src/formatter.cr
@@ -19,93 +19,4 @@ module Tartrazine
      raise Exception.new("Not implemented")
    end
  end
-
-  class Ansi < Formatter
-    def format(text : String, lexer : Lexer, theme : Theme) : String
-      output = String.build do |outp|
-        lexer.tokenize(text).each do |token|
-          outp << self.colorize(token[:value], token[:type], theme)
-        end
-      end
-      output
-    end
-
-    def colorize(text : String, token : String, theme : Theme) : String
-      style = theme.styles.fetch(token, nil)
-      return text if style.nil?
-      if theme.styles.has_key?(token)
-        s = theme.styles[token]
-      else
-        # Themes don't contain information for each specific
-        # token type. However, they may contain information
-        # for a parent style. Worst case, we go to the root
-        # (Background) style.
-        s = theme.styles[theme.style_parents(token).reverse.find { |parent|
-          theme.styles.has_key?(parent)
-        }]
-      end
-      colorized = text.colorize
-      s.color.try { |c| colorized = colorized.fore(c.colorize) }
-      # Intentionally not setting background color
-      colorized.mode(:bold) if s.bold
-      colorized.mode(:italic) if s.italic
-      colorized.mode(:underline) if s.underline
-      colorized.to_s
-    end
-  end
-
-  class Html < Formatter
-    def format(text : String, lexer : Lexer, theme : Theme) : String
-      output = String.build do |outp|
-        outp << "<html><head><style>"
-        outp << get_style_defs(theme)
-        outp << "</style></head><body>"
-        outp << "<pre class=\"#{get_css_class("Background", theme)}\"><code class=\"#{get_css_class("Background", theme)}\">"
-        lexer.tokenize(text).each do |token|
-          fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
-          outp << fragment
-        end
-        outp << "</code></pre></body></html>"
-      end
-      output
-    end
-
-    # ameba:disable Metrics/CyclomaticComplexity
-    def get_style_defs(theme : Theme) : String
-      output = String.build do |outp|
-        theme.styles.each do |token, style|
-          outp << ".#{get_css_class(token, theme)} {"
-          # These are set or nil
-          outp << "color: #{style.color.try &.hex};" if style.color
-          outp << "background-color: #{style.background.try &.hex};" if style.background
-          outp << "border: 1px solid #{style.border.try &.hex};" if style.border
-
-          # These are true/false/nil
-          outp << "border: none;" if style.border == false
-          outp << "font-weight: bold;" if style.bold
-          outp << "font-weight: 400;" if style.bold == false
-          outp << "font-style: italic;" if style.italic
-          outp << "font-style: normal;" if style.italic == false
-          outp << "text-decoration: underline;" if style.underline
-          outp << "text-decoration: none;" if style.underline == false
-
-          outp << "}"
-        end
-      end
-      output
-    end
-
-    # Given a token type, return the CSS class to use.
-    def get_css_class(token, theme)
-      return Abbreviations[token] if theme.styles.has_key?(token)
-
-      # Themes don't contain information for each specific
-      # token type. However, they may contain information
-      # for a parent style. Worst case, we go to the root
-      # (Background) style.
-      Abbreviations[theme.style_parents(token).reverse.find { |parent|
-        theme.styles.has_key?(parent)
-      }]
-    end
-  end
 end
--- a/src/formatters/ansi.cr
+++ b/src/formatters/ansi.cr
@@ -0,0 +1,37 @@
+require "../formatter"
+
+module Tartrazine
+  # Formatter that colorizes the tokens of a text for terminal output.
+  class Ansi < Formatter
+    # Tokenizes `text` with `lexer` and returns the concatenation of every
+    # token value, colorized according to `theme`.
+    def format(text : String, lexer : Lexer, theme : Theme) : String
+      String.build do |outp|
+        lexer.tokenize(text).each do |token|
+          outp << self.colorize(token[:value], token[:type], theme)
+        end
+      end
+    end
+
+    # Returns `text` colorized with the style `theme` assigns to `token`, or
+    # unchanged when neither `token` nor any of its parents has a style.
+    def colorize(text : String, token : String, theme : Theme) : String
+      # Themes don't contain information for each specific
+      # token type. However, they may contain information
+      # for a parent style. Worst case, we go to the root
+      # (Background) style.
+      styled = token if theme.styles.has_key?(token)
+      styled ||= theme.style_parents(token).reverse.find { |parent| theme.styles.has_key?(parent) }
+      return text if styled.nil?
+      s = theme.styles[styled]
+
+      colorized = text.colorize
+      s.color.try { |c| colorized = colorized.fore(c.colorize) }
+      # Intentionally not setting background color
+      colorized.mode(:bold) if s.bold
+      colorized.mode(:italic) if s.italic
+      colorized.mode(:underline) if s.underline
+      colorized.to_s
+    end
+  end
+end
--- a/src/formatters/html.cr
+++ b/src/formatters/html.cr
@@ -0,0 +1,59 @@
+require "html"
+require "../formatter"
+module Tartrazine
+    # Formatter that renders the tokens as an HTML page styled by the theme.
+    class Html < Formatter
+      # Returns an HTML document with the theme's CSS and one `<span>` per token.
+      def format(text : String, lexer : Lexer, theme : Theme) : String
+        background = get_css_class("Background", theme)
+        String.build do |outp|
+          outp << "<html><head><style>"
+          outp << get_style_defs(theme)
+          outp << "</style></head><body>"
+          outp << "<pre class=\"#{background}\"><code class=\"#{background}\">"
+          lexer.tokenize(text).each do |token|
+            # Token values are raw text: escape them so `<` or `&` can't become markup
+            outp << "<span class=\"#{get_css_class(token[:type], theme)}\">#{HTML.escape(token[:value])}</span>"
+          end
+          outp << "</code></pre></body></html>"
+        end
+      end
+
+      # Returns the CSS rules for `theme`, one per style it defines.
+      # ameba:disable Metrics/CyclomaticComplexity
+      def get_style_defs(theme : Theme) : String
+        String.build do |outp|
+          theme.styles.each do |token, style|
+            outp << ".#{get_css_class(token, theme)} {"
+            # These are set or nil
+            outp << "color: #{style.color.try &.hex};" if style.color
+            outp << "background-color: #{style.background.try &.hex};" if style.background
+            outp << "border: 1px solid #{style.border.try &.hex};" if style.border
+
+            # These are true/false/nil
+            outp << "border: none;" if style.border == false
+            outp << "font-weight: bold;" if style.bold
+            outp << "font-weight: 400;" if style.bold == false
+            outp << "font-style: italic;" if style.italic
+            outp << "font-style: normal;" if style.italic == false
+            outp << "text-decoration: underline;" if style.underline
+            outp << "text-decoration: none;" if style.underline == false
+
+            outp << "}"
+          end
+        end
+      end
+
+      # Given a token type, return the CSS class to use.
+      def get_css_class(token, theme)
+        return Abbreviations[token] if theme.styles.has_key?(token)
+
+        # Themes don't contain information for each specific
+        # token type. However, they may contain information
+        # for a parent style. Worst case, we go to the root
+        # (Background) style.
+        Abbreviations[theme.style_parents(token).reverse.find { |parent| theme.styles.has_key?(parent) }]
+      end
+    end
+    
+end
--- a/src/lexer.cr
+++ b/src/lexer.cr
@@ -0,0 +1,180 @@
+module Tartrazine
+  # The "../lexers" folder, baked with BakedFileSystem; read by Tartrazine.lexer.
+  class LexerFiles
+    extend BakedFileSystem
+    bake_folder "../lexers", __DIR__
+  end
+
+  # This implements a lexer for Pygments RegexLexers as expressed
+  # in Chroma's XML serialization.
+  #
+  # For explanations on what actions and states do
+  # the Pygments documentation is a good place to start.
+  # https://pygments.org/docs/lexerdevelopment/
+  class Lexer
+    property config = {
+      name:             "",
+      aliases:          [] of String,
+      filenames:        [] of String,
+      mime_types:       [] of String,
+      priority:         0.0,
+      case_insensitive: false,
+      dot_all:          false,
+      not_multiline:    false,
+      ensure_nl:        false,
+    }
+    property xml : String = ""
+
+    property states = {} of String => State
+
+    property state_stack = ["root"]
+
+    # Turn the text into a list of tokens, collapsed with `collapse_tokens`.
+    # The `usingself` parameter is true when the lexer is being used to
+    # tokenize a string from a larger text that is already being tokenized.
+    # So, when it's true, we don't modify the text.
+    def tokenize(text, usingself = false) : Array(Token)
+      @state_stack = ["root"]
+      tokens = [] of Token
+      pos = 0
+
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
+        text += "\n"
+      end
+
+      # Loop through the text, applying rules
+      while pos < text.size
+        state = states[@state_stack.last]
+        # Reset on every pass so a state without rules can't reuse a stale match
+        matched = false
+        # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
+        state.rules.each do |rule|
+          matched, new_pos, new_tokens = rule.match(text, pos, self)
+          if matched
+            # Move position forward and keep the tokens the rule produced
+            # Log.trace { "MATCHED: #{rule.xml}" }
+            pos = new_pos
+            tokens.concat(new_tokens)
+            break
+          end
+          # Log.trace { "NOT MATCHED: #{rule.xml}" }
+        end
+        # If no rule matches, emit an error token
+        unless matched
+          # Log.trace { "Error at #{pos}" }
+          tokens << {type: "Error", value: "#{text[pos]}"}
+          pos += 1
+        end
+      end
+      Lexer.collapse_tokens(tokens)
+    end
+
+    # Collapse consecutive tokens of the same type for easier comparison
+    # and smaller output. Tokens with an empty value are dropped.
+    def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
+      result = [] of Tartrazine::Token
+      tokens.each do |token|
+        next if token[:value] == ""
+        last = result.last?
+        if last && last[:type] == token[:type]
+          # Same type as the previous token: merge both into one
+          result[-1] = {type: last[:type], value: last[:value] + token[:value]}
+        else
+          result << token
+        end
+      end
+      result
+    end
+
+    # Builds a lexer from `xml`, a definition in Chroma's XML serialization.
+    # The `config` element, when present, fills `config`; every `state`
+    # under `rules` becomes a `State` holding one rule per `rule` child.
+    # Rules with a `pattern` attribute get the multiline/dotall/ignorecase
+    # options from the config. Raises if two states have the same name.
+    # ameba:disable Metrics/CyclomaticComplexity
+    def self.from_xml(xml : String) : Lexer
+      l = Lexer.new
+      l.xml = xml
+      lexer = XML.parse(xml).first_element_child
+      if lexer
+        config = lexer.children.find { |node|
+          node.name == "config"
+        }
+        if config
+          l.config = {
+            name:             xml_to_s(config, name) || "",
+            aliases:          xml_to_a(config, _alias) || [] of String,
+            filenames:        xml_to_a(config, filename) || [] of String,
+            mime_types:       xml_to_a(config, mime_type) || [] of String,
+            priority:         xml_to_f(config, priority) || 0.0,
+            not_multiline:    xml_to_s(config, not_multiline) == "true",
+            dot_all:          xml_to_s(config, dot_all) == "true",
+            case_insensitive: xml_to_s(config, case_insensitive) == "true",
+            ensure_nl:        xml_to_s(config, ensure_nl) == "true",
+          }
+        end
+
+        rules = lexer.children.find { |node|
+          node.name == "rules"
+        }
+        if rules
+          # Rules contains states 🤷
+          rules.children.select { |node|
+            node.name == "state"
+          }.each do |state_node|
+            state = State.new
+            state.name = state_node["name"]
+            if l.states.has_key?(state.name)
+              raise Exception.new("Duplicate state: #{state.name}")
+            else
+              l.states[state.name] = state
+            end
+            # And states contain rules 🤷
+            state_node.children.select { |node|
+              node.name == "rule"
+            }.each do |rule_node|
+              case rule_node["pattern"]?
+              when nil
+                if rule_node.first_element_child.try &.name == "include"
+                  rule = IncludeStateRule.new(rule_node)
+                else
+                  rule = UnconditionalRule.new(rule_node)
+                end
+              else
+                rule = Rule.new(rule_node,
+                  multiline: !l.config[:not_multiline],
+                  dotall: l.config[:dot_all],
+                  ignorecase: l.config[:case_insensitive])
+              end
+              state.rules << rule
+            end
+          end
+        end
+      end
+      l
+    end
+  end
+
+  # A Lexer state. A state has a name and a list of rules. The state
+  # machine keeps a stack of state names to decide which rules to apply.
+  class State
+    property name : String = ""
+    property rules = [] of Rule
+
+    # Returns a new, randomly named state with this state's rules then `other`'s.
+    def +(other : State)
+      State.new.tap do |combined|
+        combined.name = Random.base58(8)
+        combined.rules = rules + other.rules
+      end
+    end
+  end
+
+  # A token, the output of the tokenizer: a type name and the text it covers
+  alias Token = NamedTuple(type: String, value: String)
+
+  def self.lexer(name : String) : Lexer # built from LexerFiles' "/<name>.xml"
+    Lexer.from_xml(LexerFiles.get("/#{name}.xml").gets_to_end)
+  end
+end
--- a/src/rules.cr
+++ b/src/rules.cr
@@ -3,7 +3,7 @@ require "./constants"
 require "./formatter"
 require "./rules"
 require "./styles"
-require "./tartrazine"
+require "./lexer"

 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
--- a/src/tartrazine.cr
+++ b/src/tartrazine.cr
@@ -15,186 +15,6 @@ module Tartrazine
  VERSION = "0.1.1"

  Log = ::Log.for("tartrazine")
-
-  # This implements a lexer for Pygments RegexLexers as expressed
-  # in Chroma's XML serialization.
-  #
-  # For explanations on what actions and states do
-  # the Pygments documentation is a good place to start.
-  # https://pygments.org/docs/lexerdevelopment/
-
-  # A Lexer state. A state has a name and a list of rules.
-  # The state machine has a state stack containing references
-  # to states to decide which rules to apply.
-  class State
-    property name : String = ""
-    property rules = [] of Rule
-
-    def +(other : State)
-      new_state = State.new
-      new_state.name = Random.base58(8)
-      new_state.rules = rules + other.rules
-      new_state
-    end
-  end
-
-  class LexerFiles
-    extend BakedFileSystem
-
-    bake_folder "../lexers", __DIR__
-  end
-
-  # A token, the output of the tokenizer
-  alias Token = NamedTuple(type: String, value: String)
-
-  class Lexer
-    property config = {
-      name:             "",
-      aliases:          [] of String,
-      filenames:        [] of String,
-      mime_types:       [] of String,
-      priority:         0.0,
-      case_insensitive: false,
-      dot_all:          false,
-      not_multiline:    false,
-      ensure_nl:        false,
-    }
-    property xml : String = ""
-
-    property states = {} of String => State
-
-    property state_stack = ["root"]
-
-    # Turn the text into a list of tokens. The `usingself` parameter
-    # is true when the lexer is being used to tokenize a string
-    # from a larger text that is already being tokenized.
-    # So, when it's true, we don't modify the text.
-    def tokenize(text, usingself = false) : Array(Token)
-      @state_stack = ["root"]
-      tokens = [] of Token
-      pos = 0
-      matched = false
-
-      # Respect the `ensure_nl` config option
-      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
-        text += "\n"
-      end
-
-      # Loop through the text, applying rules
-      while pos < text.size
-        state = states[@state_stack.last]
-        # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
-        state.rules.each do |rule|
-          matched, new_pos, new_tokens = rule.match(text, pos, self)
-          if matched
-            # Move position forward, save the tokens,
-            # tokenize from the new position
-            # Log.trace { "MATCHED: #{rule.xml}" }
-            pos = new_pos
-            tokens += new_tokens
-            break
-          end
-          # Log.trace { "NOT MATCHED: #{rule.xml}" }
-        end
-        # If no rule matches, emit an error token
-        unless matched
-          # Log.trace { "Error at #{pos}" }
-          tokens << {type: "Error", value: "#{text[pos]}"}
-          pos += 1
-        end
-      end
-      Lexer.collapse_tokens(tokens)
-    end
-
-    # Collapse consecutive tokens of the same type for easier comparison
-    # and smaller output
-    def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
-      result = [] of Tartrazine::Token
-      tokens = tokens.reject { |token| token[:value] == "" }
-      tokens.each do |token|
-        if result.empty?
-          result << token
-          next
-        end
-        last = result.last
-        if last[:type] == token[:type]
-          new_token = {type: last[:type], value: last[:value] + token[:value]}
-          result.pop
-          result << new_token
-        else
-          result << token
-        end
-      end
-      result
-    end
-
-    # ameba:disable Metrics/CyclomaticComplexity
-    def self.from_xml(xml : String) : Lexer
-      l = Lexer.new
-      l.xml = xml
-      lexer = XML.parse(xml).first_element_child
-      if lexer
-        config = lexer.children.find { |node|
-          node.name == "config"
-        }
-        if config
-          l.config = {
-            name:             xml_to_s(config, name) || "",
-            aliases:          xml_to_a(config, _alias) || [] of String,
-            filenames:        xml_to_a(config, filename) || [] of String,
-            mime_types:       xml_to_a(config, mime_type) || [] of String,
-            priority:         xml_to_f(config, priority) || 0.0,
-            not_multiline:    xml_to_s(config, not_multiline) == "true",
-            dot_all:          xml_to_s(config, dot_all) == "true",
-            case_insensitive: xml_to_s(config, case_insensitive) == "true",
-            ensure_nl:        xml_to_s(config, ensure_nl) == "true",
-          }
-        end
-
-        rules = lexer.children.find { |node|
-          node.name == "rules"
-        }
-        if rules
-          # Rules contains states 🤷
-          rules.children.select { |node|
-            node.name == "state"
-          }.each do |state_node|
-            state = State.new
-            state.name = state_node["name"]
-            if l.states.has_key?(state.name)
-              raise Exception.new("Duplicate state: #{state.name}")
-            else
-              l.states[state.name] = state
-            end
-            # And states contain rules 🤷
-            state_node.children.select { |node|
-              node.name == "rule"
-            }.each do |rule_node|
-              case rule_node["pattern"]?
-              when nil
-                if rule_node.first_element_child.try &.name == "include"
-                  rule = IncludeStateRule.new(rule_node)
-                else
-                  rule = UnconditionalRule.new(rule_node)
-                end
-              else
-                rule = Rule.new(rule_node,
-                  multiline: !l.config[:not_multiline],
-                  dotall: l.config[:dot_all],
-                  ignorecase: l.config[:case_insensitive])
-              end
-              state.rules << rule
-            end
-          end
-        end
-      end
-      l
-    end
-  end
-
-  def self.lexer(name : String) : Lexer
-    Lexer.from_xml(LexerFiles.get("/#{name}.xml").gets_to_end)
-  end
 end

 # Convenience macros to parse XML