Refactor things into separate files for easier reading

2025-08-01 21:39:50 +00:00 · 2024-08-09 11:31:18 -03:00
parent a0ff4e0118
commit 47237eecc3
6 changed files with 277 additions and 270 deletions
--- a/src/formatter.cr
+++ b/src/formatter.cr
@@ -19,93 +19,4 @@ module Tartrazine
      raise Exception.new("Not implemented")
    end
  end
  # Formatter that renders tokens as terminal text, applying the
  # foreground color and bold/italic/underline modes of a theme.
  class Ansi < Formatter
    # Tokenizes `text` with `lexer` and returns the concatenation of
    # every token value after it has gone through `colorize`.
    def format(text : String, lexer : Lexer, theme : Theme) : String
      String.build do |outp|
        lexer.tokenize(text).each do |token|
          outp << colorize(token[:value], token[:type], theme)
        end
      end
    end

    # Returns `text` decorated with the style the theme assigns to the
    # token type `token`.
    #
    # Themes don't contain information for each specific token type.
    # However, they may contain information for a parent style, so when
    # `token` has no style of its own the first styled entry of the
    # reversed `style_parents` list is used instead. When neither the
    # token nor any of its parents is styled, `text` is returned as is.
    def colorize(text : String, token : String, theme : Theme) : String
      style_key = if theme.styles.has_key?(token)
                    token
                  else
                    # NOTE(review): presumably `style_parents` is ordered
                    # root-first, so reversing tries the closest parent
                    # first and ends at the root (Background) — confirm.
                    theme.style_parents(token).reverse.find { |parent|
                      theme.styles.has_key?(parent)
                    }
                  end
      return text if style_key.nil?
      s = theme.styles[style_key]

      colorized = text.colorize
      s.color.try { |c| colorized = colorized.fore(c.colorize) }
      # Intentionally not setting background color
      colorized.mode(:bold) if s.bold
      colorized.mode(:italic) if s.italic
      colorized.mode(:underline) if s.underline
      colorized.to_s
    end
  end
  # Formatter that renders tokens as a complete HTML document: the
  # theme's CSS goes into a <style> element and every token becomes a
  # <span> carrying the CSS class of its token type.
  class Html < Formatter
    # Tokenizes `text` with `lexer` and returns the HTML document.
    # Token text is HTML-escaped so that characters such as `<` or `&`
    # in the highlighted source cannot alter the generated markup.
    def format(text : String, lexer : Lexer, theme : Theme) : String
      String.build do |outp|
        outp << "<html><head><style>"
        outp << get_style_defs(theme)
        outp << "</style></head><body>"
        background = get_css_class("Background", theme)
        outp << "<pre class=\"#{background}\"><code class=\"#{background}\">"
        lexer.tokenize(text).each do |token|
          css_class = get_css_class(token[:type], theme)
          outp << "<span class=\"#{css_class}\">#{HTML.escape(token[:value])}</span>"
        end
        outp << "</code></pre></body></html>"
      end
    end

    # Returns one CSS rule per style defined in the theme, all
    # concatenated into a single string.
    # ameba:disable Metrics/CyclomaticComplexity
    def get_style_defs(theme : Theme) : String
      String.build do |outp|
        theme.styles.each do |token, style|
          outp << ".#{get_css_class(token, theme)} {"
          # These are set or nil
          outp << "color: #{style.color.try &.hex};" if style.color
          outp << "background-color: #{style.background.try &.hex};" if style.background
          outp << "border: 1px solid #{style.border.try &.hex};" if style.border
          # These are true/false/nil
          # NOTE(review): `border` is rendered through `.hex` above, so
          # comparing it with `false` may never be true — confirm the
          # type of `Style#border`.
          outp << "border: none;" if style.border == false
          outp << "font-weight: bold;" if style.bold
          outp << "font-weight: 400;" if style.bold == false
          outp << "font-style: italic;" if style.italic
          outp << "font-style: normal;" if style.italic == false
          outp << "text-decoration: underline;" if style.underline
          outp << "text-decoration: none;" if style.underline == false
          outp << "}"
        end
      end
    end

    # Given a token type, return the CSS class to use.
    #
    # Themes don't contain information for each specific token type.
    # However, they may contain information for a parent style. Worst
    # case, we go to the root (Background) style.
    def get_css_class(token, theme)
      return Abbreviations[token] if theme.styles.has_key?(token)
      nearest = theme.style_parents(token).reverse.find { |parent|
        theme.styles.has_key?(parent)
      }
      Abbreviations[nearest]
    end
  end
 end
--- a/src/formatters/ansi.cr
+++ b/src/formatters/ansi.cr
@@ -0,0 +1,37 @@
 require "../formatter"
 module Tartrazine
  # Formatter that renders tokens as terminal text, applying the
  # foreground color and bold/italic/underline modes of a theme.
  class Ansi < Formatter
    # Tokenizes `text` with `lexer` and returns the concatenation of
    # every token value after it has gone through `colorize`.
    def format(text : String, lexer : Lexer, theme : Theme) : String
      String.build do |outp|
        lexer.tokenize(text).each do |token|
          outp << colorize(token[:value], token[:type], theme)
        end
      end
    end

    # Returns `text` decorated with the style the theme assigns to the
    # token type `token`.
    #
    # Themes don't contain information for each specific token type.
    # However, they may contain information for a parent style, so when
    # `token` has no style of its own the first styled entry of the
    # reversed `style_parents` list is used instead. When neither the
    # token nor any of its parents is styled, `text` is returned as is.
    def colorize(text : String, token : String, theme : Theme) : String
      style_key = if theme.styles.has_key?(token)
                    token
                  else
                    # NOTE(review): presumably `style_parents` is ordered
                    # root-first, so reversing tries the closest parent
                    # first and ends at the root (Background) — confirm.
                    theme.style_parents(token).reverse.find { |parent|
                      theme.styles.has_key?(parent)
                    }
                  end
      return text if style_key.nil?
      s = theme.styles[style_key]

      colorized = text.colorize
      s.color.try { |c| colorized = colorized.fore(c.colorize) }
      # Intentionally not setting background color
      colorized.mode(:bold) if s.bold
      colorized.mode(:italic) if s.italic
      colorized.mode(:underline) if s.underline
      colorized.to_s
    end
  end
 end
--- a/src/formatters/html.cr
+++ b/src/formatters/html.cr
@@ -0,0 +1,59 @@
 require "html"
 require "../formatter"
 module Tartrazine
    # Formatter that renders tokens as a complete HTML document: the
    # theme's CSS goes into a <style> element and every token becomes a
    # <span> carrying the CSS class of its token type.
    class Html < Formatter
      # Tokenizes `text` with `lexer` and returns the HTML document.
      # Token text is HTML-escaped so that characters such as `<` or `&`
      # in the highlighted source cannot alter the generated markup.
      def format(text : String, lexer : Lexer, theme : Theme) : String
        String.build do |outp|
          outp << "<html><head><style>"
          outp << get_style_defs(theme)
          outp << "</style></head><body>"
          background = get_css_class("Background", theme)
          outp << "<pre class=\"#{background}\"><code class=\"#{background}\">"
          lexer.tokenize(text).each do |token|
            css_class = get_css_class(token[:type], theme)
            outp << "<span class=\"#{css_class}\">#{HTML.escape(token[:value])}</span>"
          end
          outp << "</code></pre></body></html>"
        end
      end

      # Returns one CSS rule per style defined in the theme, all
      # concatenated into a single string.
      # ameba:disable Metrics/CyclomaticComplexity
      def get_style_defs(theme : Theme) : String
        String.build do |outp|
          theme.styles.each do |token, style|
            outp << ".#{get_css_class(token, theme)} {"
            # These are set or nil
            outp << "color: #{style.color.try &.hex};" if style.color
            outp << "background-color: #{style.background.try &.hex};" if style.background
            outp << "border: 1px solid #{style.border.try &.hex};" if style.border
            # These are true/false/nil
            # NOTE(review): `border` is rendered through `.hex` above, so
            # comparing it with `false` may never be true — confirm the
            # type of `Style#border`.
            outp << "border: none;" if style.border == false
            outp << "font-weight: bold;" if style.bold
            outp << "font-weight: 400;" if style.bold == false
            outp << "font-style: italic;" if style.italic
            outp << "font-style: normal;" if style.italic == false
            outp << "text-decoration: underline;" if style.underline
            outp << "text-decoration: none;" if style.underline == false
            outp << "}"
          end
        end
      end

      # Given a token type, return the CSS class to use.
      #
      # Themes don't contain information for each specific token type.
      # However, they may contain information for a parent style. Worst
      # case, we go to the root (Background) style.
      def get_css_class(token, theme)
        return Abbreviations[token] if theme.styles.has_key?(token)
        nearest = theme.style_parents(token).reverse.find { |parent|
          theme.styles.has_key?(parent)
        }
        Abbreviations[nearest]
      end
    end
 end
--- a/src/lexer.cr
+++ b/src/lexer.cr
@@ -0,0 +1,180 @@
 module Tartrazine
  # Holder for the lexer definition files. It extends BakedFileSystem
  # and bakes the "../lexers" folder, resolved against this file's
  # directory (`__DIR__`); `Tartrazine.lexer` reads "/<name>.xml"
  # entries from it.
  class LexerFiles
    extend BakedFileSystem
    bake_folder "../lexers", __DIR__
  end
  # This implements a lexer for Pygments RegexLexers as expressed
  # in Chroma's XML serialization.
  #
  # For explanations on what actions and states do
  # the Pygments documentation is a good place to start.
  # https://pygments.org/docs/lexerdevelopment/
  class Lexer
    # Lexer metadata and regex options, filled in by `from_xml` from
    # the <config> element.
    property config = {
      name:             "",
      aliases:          [] of String,
      filenames:        [] of String,
      mime_types:       [] of String,
      priority:         0.0,
      case_insensitive: false,
      dot_all:          false,
      not_multiline:    false,
      ensure_nl:        false,
    }

    # The XML text this lexer was built from.
    property xml : String = ""

    # All known states, indexed by name.
    property states = {} of String => State

    # Names of the active states; the last entry selects the rules
    # that `tokenize` applies.
    property state_stack = ["root"]

    # Turn the text into a list of tokens. The `usingself` parameter
    # is true when the lexer is being used to tokenize a string
    # from a larger text that is already being tokenized.
    # So, when it's true, we don't modify the text.
    #
    # The state stack is reset to `["root"]` on every call. Text that
    # no rule of the current state matches is emitted one character at
    # a time as "Error" tokens. The result is passed through
    # `Lexer.collapse_tokens`.
    def tokenize(text, usingself = false) : Array(Token)
      @state_stack = ["root"]
      tokens = [] of Token
      pos = 0
      # Respect the `ensure_nl` config option
      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
        text += "\n"
      end
      # Loop through the text, applying rules
      while pos < text.size
        state = states[@state_stack.last]
        # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
        # Start every pass unmatched: a state without rules would
        # otherwise inherit the previous pass's result, emit nothing
        # and never move `pos` forward.
        matched = false
        state.rules.each do |rule|
          matched, new_pos, new_tokens = rule.match(text, pos, self)
          if matched
            # Move position forward, save the tokens,
            # tokenize from the new position
            # Log.trace { "MATCHED: #{rule.xml}" }
            pos = new_pos
            # Append in place instead of building a new array per match
            tokens.concat(new_tokens)
            break
          end
          # Log.trace { "NOT MATCHED: #{rule.xml}" }
        end
        # If no rule matches, emit an error token
        unless matched
          # Log.trace { "Error at #{pos}" }
          tokens << {type: "Error", value: "#{text[pos]}"}
          pos += 1
        end
      end
      Lexer.collapse_tokens(tokens)
    end

    # Collapse consecutive tokens of the same type for easier comparison
    # and smaller output. Tokens with an empty value are dropped before
    # merging. Returns a new array; `tokens` is not modified.
    def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
      result = [] of Tartrazine::Token
      tokens.each do |token|
        next if token[:value] == ""
        last = result.last?
        if last && last[:type] == token[:type]
          # Same type as the previous token: merge the two values
          result[-1] = {type: last[:type], value: last[:value] + token[:value]}
        else
          result << token
        end
      end
      result
    end

    # Builds a lexer from its XML definition.
    #
    # The <config> child of the root element, when present, populates
    # `config` through the `xml_to_*` helpers (defined outside this
    # class). Every <state> found under <rules> becomes a `State` whose
    # rules are created from its <rule> children.
    #
    # Raises an `Exception` when two states share the same name.
    # ameba:disable Metrics/CyclomaticComplexity
    def self.from_xml(xml : String) : Lexer
      l = Lexer.new
      l.xml = xml
      lexer = XML.parse(xml).first_element_child
      if lexer
        config = lexer.children.find { |node|
          node.name == "config"
        }
        if config
          l.config = {
            name:             xml_to_s(config, name) || "",
            aliases:          xml_to_a(config, _alias) || [] of String,
            filenames:        xml_to_a(config, filename) || [] of String,
            mime_types:       xml_to_a(config, mime_type) || [] of String,
            priority:         xml_to_f(config, priority) || 0.0,
            not_multiline:    xml_to_s(config, not_multiline) == "true",
            dot_all:          xml_to_s(config, dot_all) == "true",
            case_insensitive: xml_to_s(config, case_insensitive) == "true",
            ensure_nl:        xml_to_s(config, ensure_nl) == "true",
          }
        end

        rules = lexer.children.find { |node|
          node.name == "rules"
        }
        if rules
          # Rules contains states 🤷
          rules.children.select { |node|
            node.name == "state"
          }.each do |state_node|
            state = State.new
            state.name = state_node["name"]
            if l.states.has_key?(state.name)
              raise Exception.new("Duplicate state: #{state.name}")
            else
              l.states[state.name] = state
            end
            # And states contain rules 🤷
            state_node.children.select { |node|
              node.name == "rule"
            }.each do |rule_node|
              # A rule without a `pattern` attribute is either an
              # include of another state or an unconditional rule;
              # anything else is a regex rule built with the flags
              # taken from the lexer config.
              case rule_node["pattern"]?
              when nil
                if rule_node.first_element_child.try &.name == "include"
                  rule = IncludeStateRule.new(rule_node)
                else
                  rule = UnconditionalRule.new(rule_node)
                end
              else
                rule = Rule.new(rule_node,
                  multiline: !l.config[:not_multiline],
                  dotall: l.config[:dot_all],
                  ignorecase: l.config[:case_insensitive])
              end
              state.rules << rule
            end
          end
        end
      end
      l
    end
  end
  # One state of the lexer's state machine: a name plus the list of
  # rules to try while that state is on top of the lexer's state
  # stack.
  class State
    property name : String = ""
    property rules = [] of Rule

    # Returns a fresh state holding this state's rules followed by
    # `other`'s rules, named with a random 8-character base58 string.
    def +(other : State)
      State.new.tap do |combined|
        combined.name = Random.base58(8)
        combined.rules = rules + other.rules
      end
    end
  end
  # A token, the output of the tokenizer: a named tuple with the
  # token type name (`type`) and the token's text (`value`).
  alias Token = NamedTuple(type: String, value: String)
  # Builds the lexer called `name` from the XML definition stored in
  # `LexerFiles` under "/<name>.xml".
  def self.lexer(name : String) : Lexer
    definition = LexerFiles.get("/#{name}.xml")
    Lexer.from_xml(definition.gets_to_end)
  end
 end
--- a/src/rules.cr
+++ b/src/rules.cr
@@ -3,7 +3,7 @@ require "./constants"
 require "./formatter"
 require "./rules"
 require "./styles"
-require "./tartrazine"
+require "./lexer"
 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
--- a/src/tartrazine.cr
+++ b/src/tartrazine.cr
@@ -15,186 +15,6 @@ module Tartrazine
  VERSION = "0.1.1"
  Log = ::Log.for("tartrazine")
  # This implements a lexer for Pygments RegexLexers as expressed
  # in Chroma's XML serialization.
  #
  # For explanations on what actions and states do
  # the Pygments documentation is a good place to start.
  # https://pygments.org/docs/lexerdevelopment/
  # One state of the lexer's state machine: a name plus the list of
  # rules to try while that state is on top of the lexer's state
  # stack.
  class State
    property name : String = ""
    property rules = [] of Rule

    # Returns a fresh state holding this state's rules followed by
    # `other`'s rules, named with a random 8-character base58 string.
    def +(other : State)
      State.new.tap do |combined|
        combined.name = Random.base58(8)
        combined.rules = rules + other.rules
      end
    end
  end
  # Holder for the lexer definition files. It extends BakedFileSystem
  # and bakes the "../lexers" folder, resolved against this file's
  # directory (`__DIR__`); `Tartrazine.lexer` reads "/<name>.xml"
  # entries from it.
  class LexerFiles
    extend BakedFileSystem
    bake_folder "../lexers", __DIR__
  end
  # A token, the output of the tokenizer: a named tuple with the
  # token type name (`type`) and the token's text (`value`).
  alias Token = NamedTuple(type: String, value: String)
  # A regex-driven lexer: it walks the input applying the rules of the
  # state on top of its state stack and produces a list of tokens.
  # Instances are built from an XML definition by `from_xml`.
  class Lexer
    # Lexer metadata and regex options, filled in by `from_xml` from
    # the <config> element.
    property config = {
      name:             "",
      aliases:          [] of String,
      filenames:        [] of String,
      mime_types:       [] of String,
      priority:         0.0,
      case_insensitive: false,
      dot_all:          false,
      not_multiline:    false,
      ensure_nl:        false,
    }

    # The XML text this lexer was built from.
    property xml : String = ""

    # All known states, indexed by name.
    property states = {} of String => State

    # Names of the active states; the last entry selects the rules
    # that `tokenize` applies.
    property state_stack = ["root"]

    # Turn the text into a list of tokens. The `usingself` parameter
    # is true when the lexer is being used to tokenize a string
    # from a larger text that is already being tokenized.
    # So, when it's true, we don't modify the text.
    #
    # The state stack is reset to `["root"]` on every call. Text that
    # no rule of the current state matches is emitted one character at
    # a time as "Error" tokens. The result is passed through
    # `Lexer.collapse_tokens`.
    def tokenize(text, usingself = false) : Array(Token)
      @state_stack = ["root"]
      tokens = [] of Token
      pos = 0
      # Respect the `ensure_nl` config option
      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
        text += "\n"
      end
      # Loop through the text, applying rules
      while pos < text.size
        state = states[@state_stack.last]
        # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
        # Start every pass unmatched: a state without rules would
        # otherwise inherit the previous pass's result, emit nothing
        # and never move `pos` forward.
        matched = false
        state.rules.each do |rule|
          matched, new_pos, new_tokens = rule.match(text, pos, self)
          if matched
            # Move position forward, save the tokens,
            # tokenize from the new position
            # Log.trace { "MATCHED: #{rule.xml}" }
            pos = new_pos
            # Append in place instead of building a new array per match
            tokens.concat(new_tokens)
            break
          end
          # Log.trace { "NOT MATCHED: #{rule.xml}" }
        end
        # If no rule matches, emit an error token
        unless matched
          # Log.trace { "Error at #{pos}" }
          tokens << {type: "Error", value: "#{text[pos]}"}
          pos += 1
        end
      end
      Lexer.collapse_tokens(tokens)
    end

    # Collapse consecutive tokens of the same type for easier comparison
    # and smaller output. Tokens with an empty value are dropped before
    # merging. Returns a new array; `tokens` is not modified.
    def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
      result = [] of Tartrazine::Token
      tokens.each do |token|
        next if token[:value] == ""
        last = result.last?
        if last && last[:type] == token[:type]
          # Same type as the previous token: merge the two values
          result[-1] = {type: last[:type], value: last[:value] + token[:value]}
        else
          result << token
        end
      end
      result
    end

    # Builds a lexer from its XML definition.
    #
    # The <config> child of the root element, when present, populates
    # `config` through the `xml_to_*` helpers (defined outside this
    # class). Every <state> found under <rules> becomes a `State` whose
    # rules are created from its <rule> children.
    #
    # Raises an `Exception` when two states share the same name.
    # ameba:disable Metrics/CyclomaticComplexity
    def self.from_xml(xml : String) : Lexer
      l = Lexer.new
      l.xml = xml
      lexer = XML.parse(xml).first_element_child
      if lexer
        config = lexer.children.find { |node|
          node.name == "config"
        }
        if config
          l.config = {
            name:             xml_to_s(config, name) || "",
            aliases:          xml_to_a(config, _alias) || [] of String,
            filenames:        xml_to_a(config, filename) || [] of String,
            mime_types:       xml_to_a(config, mime_type) || [] of String,
            priority:         xml_to_f(config, priority) || 0.0,
            not_multiline:    xml_to_s(config, not_multiline) == "true",
            dot_all:          xml_to_s(config, dot_all) == "true",
            case_insensitive: xml_to_s(config, case_insensitive) == "true",
            ensure_nl:        xml_to_s(config, ensure_nl) == "true",
          }
        end

        rules = lexer.children.find { |node|
          node.name == "rules"
        }
        if rules
          # Rules contains states 🤷
          rules.children.select { |node|
            node.name == "state"
          }.each do |state_node|
            state = State.new
            state.name = state_node["name"]
            if l.states.has_key?(state.name)
              raise Exception.new("Duplicate state: #{state.name}")
            else
              l.states[state.name] = state
            end
            # And states contain rules 🤷
            state_node.children.select { |node|
              node.name == "rule"
            }.each do |rule_node|
              # A rule without a `pattern` attribute is either an
              # include of another state or an unconditional rule;
              # anything else is a regex rule built with the flags
              # taken from the lexer config.
              case rule_node["pattern"]?
              when nil
                if rule_node.first_element_child.try &.name == "include"
                  rule = IncludeStateRule.new(rule_node)
                else
                  rule = UnconditionalRule.new(rule_node)
                end
              else
                rule = Rule.new(rule_node,
                  multiline: !l.config[:not_multiline],
                  dotall: l.config[:dot_all],
                  ignorecase: l.config[:case_insensitive])
              end
              state.rules << rule
            end
          end
        end
      end
      l
    end
  end
  # Builds the lexer called `name` from the XML definition stored in
  # `LexerFiles` under "/<name>.xml".
  def self.lexer(name : String) : Lexer
    definition = LexerFiles.get("/#{name}.xml")
    Lexer.from_xml(definition.gets_to_end)
  end
 end
 # Convenience macros to parse XML