diff --git a/src/formatter.cr b/src/formatter.cr
index a64863d..418c4c9 100644
--- a/src/formatter.cr
+++ b/src/formatter.cr
@@ -19,93 +19,4 @@ module Tartrazine
       raise Exception.new("Not implemented")
     end
   end
-
-  class Ansi < Formatter
-    def format(text : String, lexer : Lexer, theme : Theme) : String
-      output = String.build do |outp|
-        lexer.tokenize(text).each do |token|
-          outp << self.colorize(token[:value], token[:type], theme)
-        end
-      end
-      output
-    end
-
-    def colorize(text : String, token : String, theme : Theme) : String
-      style = theme.styles.fetch(token, nil)
-      return text if style.nil?
-      if theme.styles.has_key?(token)
-        s = theme.styles[token]
-      else
-        # Themes don't contain information for each specific
-        # token type. However, they may contain information
-        # for a parent style. Worst case, we go to the root
-        # (Background) style.
-        s = theme.styles[theme.style_parents(token).reverse.find { |parent|
-          theme.styles.has_key?(parent)
-        }]
-      end
-      colorized = text.colorize
-      s.color.try { |c| colorized = colorized.fore(c.colorize) }
-      # Intentionally not setting background color
-      colorized.mode(:bold) if s.bold
-      colorized.mode(:italic) if s.italic
-      colorized.mode(:underline) if s.underline
-      colorized.to_s
-    end
-  end
-
-  class Html < Formatter
-    def format(text : String, lexer : Lexer, theme : Theme) : String
-      output = String.build do |outp|
-        outp << "<html><body>"
-        outp << "<pre><code>"
-        lexer.tokenize(text).each do |token|
-          fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
-          outp << fragment
-        end
-        outp << "</code></pre></body></html>"
-      end
-      output
-    end
-
-    # ameba:disable Metrics/CyclomaticComplexity
-    def get_style_defs(theme : Theme) : String
-      output = String.build do |outp|
-        theme.styles.each do |token, style|
-          outp << ".#{get_css_class(token, theme)} {"
-          # These are set or nil
-          outp << "color: #{style.color.try &.hex};" if style.color
-          outp << "background-color: #{style.background.try &.hex};" if style.background
-          outp << "border: 1px solid #{style.border.try &.hex};" if style.border
-
-          # These are true/false/nil
-          outp << "border: none;" if style.border == false
-          outp << "font-weight: bold;" if style.bold
-          outp << "font-weight: 400;" if style.bold == false
-          outp << "font-style: italic;" if style.italic
-          outp << "font-style: normal;" if style.italic == false
-          outp << "text-decoration: underline;" if style.underline
-          outp << "text-decoration: none;" if style.underline == false
-
-          outp << "}"
-        end
-      end
-      output
-    end
-
-    # Given a token type, return the CSS class to use.
-    def get_css_class(token, theme)
-      return Abbreviations[token] if theme.styles.has_key?(token)
-
-      # Themes don't contain information for each specific
-      # token type. However, they may contain information
-      # for a parent style. Worst case, we go to the root
-      # (Background) style.
-      Abbreviations[theme.style_parents(token).reverse.find { |parent|
-        theme.styles.has_key?(parent)
-      }]
-    end
-  end
 end
diff --git a/src/formatters/ansi.cr b/src/formatters/ansi.cr
new file mode 100644
index 0000000..ba8b740
--- /dev/null
+++ b/src/formatters/ansi.cr
@@ -0,0 +1,37 @@
+require "../formatter"
+
+module Tartrazine
+  class Ansi < Formatter
+    def format(text : String, lexer : Lexer, theme : Theme) : String
+      output = String.build do |outp|
+        lexer.tokenize(text).each do |token|
+          outp << self.colorize(token[:value], token[:type], theme)
+        end
+      end
+      output
+    end
+
+    def colorize(text : String, token : String, theme : Theme) : String
+      style = theme.styles.fetch(token, nil)
+      return text if style.nil?
+      if theme.styles.has_key?(token)
+        s = theme.styles[token]
+      else
+        # Themes don't contain information for each specific
+        # token type. However, they may contain information
+        # for a parent style. Worst case, we go to the root
+        # (Background) style.
+        s = theme.styles[theme.style_parents(token).reverse.find { |parent|
+          theme.styles.has_key?(parent)
+        }]
+      end
+      colorized = text.colorize
+      s.color.try { |c| colorized = colorized.fore(c.colorize) }
+      # Intentionally not setting background color
+      colorized.mode(:bold) if s.bold
+      colorized.mode(:italic) if s.italic
+      colorized.mode(:underline) if s.underline
+      colorized.to_s
+    end
+  end
+end
diff --git a/src/formatters/html.cr b/src/formatters/html.cr
new file mode 100644
index 0000000..6110f36
--- /dev/null
+++ b/src/formatters/html.cr
@@ -0,0 +1,59 @@
+require "../formatter"
+
+module Tartrazine
+  class Html < Formatter
+    def format(text : String, lexer : Lexer, theme : Theme) : String
+      output = String.build do |outp|
+        outp << "<html><body>"
+        outp << "<pre><code>"
+        lexer.tokenize(text).each do |token|
+          fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
+          outp << fragment
+        end
+        outp << "</code></pre></body></html>"
+      end
+      output
+    end
+
+    # ameba:disable Metrics/CyclomaticComplexity
+    def get_style_defs(theme : Theme) : String
+      output = String.build do |outp|
+        theme.styles.each do |token, style|
+          outp << ".#{get_css_class(token, theme)} {"
+          # These are set or nil
+          outp << "color: #{style.color.try &.hex};" if style.color
+          outp << "background-color: #{style.background.try &.hex};" if style.background
+          outp << "border: 1px solid #{style.border.try &.hex};" if style.border
+
+          # These are true/false/nil
+          outp << "border: none;" if style.border == false
+          outp << "font-weight: bold;" if style.bold
+          outp << "font-weight: 400;" if style.bold == false
+          outp << "font-style: italic;" if style.italic
+          outp << "font-style: normal;" if style.italic == false
+          outp << "text-decoration: underline;" if style.underline
+          outp << "text-decoration: none;" if style.underline == false
+
+          outp << "}"
+        end
+      end
+      output
+    end
+
+    # Given a token type, return the CSS class to use.
+    def get_css_class(token, theme)
+      return Abbreviations[token] if theme.styles.has_key?(token)
+
+      # Themes don't contain information for each specific
+      # token type. However, they may contain information
+      # for a parent style. Worst case, we go to the root
+      # (Background) style.
+      Abbreviations[theme.style_parents(token).reverse.find { |parent|
+        theme.styles.has_key?(parent)
+      }]
+    end
+  end
+
+end
\ No newline at end of file
diff --git a/src/lexer.cr b/src/lexer.cr
new file mode 100644
index 0000000..c18ab31
--- /dev/null
+++ b/src/lexer.cr
@@ -0,0 +1,180 @@
+module Tartrazine
+  class LexerFiles
+    extend BakedFileSystem
+
+    bake_folder "../lexers", __DIR__
+  end
+
+  # This implements a lexer for Pygments RegexLexers as expressed
+  # in Chroma's XML serialization.
+  #
+  # For explanations on what actions and states do
+  # the Pygments documentation is a good place to start.
+  # https://pygments.org/docs/lexerdevelopment/
+  class Lexer
+    property config = {
+      name: "",
+      aliases: [] of String,
+      filenames: [] of String,
+      mime_types: [] of String,
+      priority: 0.0,
+      case_insensitive: false,
+      dot_all: false,
+      not_multiline: false,
+      ensure_nl: false,
+    }
+    property xml : String = ""
+
+    property states = {} of String => State
+
+    property state_stack = ["root"]
+
+    # Turn the text into a list of tokens. The `usingself` parameter
+    # is true when the lexer is being used to tokenize a string
+    # from a larger text that is already being tokenized.
+    # So, when it's true, we don't modify the text.
+    def tokenize(text, usingself = false) : Array(Token)
+      @state_stack = ["root"]
+      tokens = [] of Token
+      pos = 0
+      matched = false
+
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
+        text += "\n"
+      end
+
+      # Loop through the text, applying rules
+      while pos < text.size
+        state = states[@state_stack.last]
+        # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
+        state.rules.each do |rule|
+          matched, new_pos, new_tokens = rule.match(text, pos, self)
+          if matched
+            # Move position forward, save the tokens,
+            # tokenize from the new position
+            # Log.trace { "MATCHED: #{rule.xml}" }
+            pos = new_pos
+            tokens += new_tokens
+            break
+          end
+          # Log.trace { "NOT MATCHED: #{rule.xml}" }
+        end
+        # If no rule matches, emit an error token
+        unless matched
+          # Log.trace { "Error at #{pos}" }
+          tokens << {type: "Error", value: "#{text[pos]}"}
+          pos += 1
+        end
+      end
+      Lexer.collapse_tokens(tokens)
+    end
+
+    # Collapse consecutive tokens of the same type for easier comparison
+    # and smaller output
+    def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
+      result = [] of Tartrazine::Token
+      tokens = tokens.reject { |token| token[:value] == "" }
+      tokens.each do |token|
+        if result.empty?
+          result << token
+          next
+        end
+        last = result.last
+        if last[:type] == token[:type]
+          new_token = {type: last[:type], value: last[:value] + token[:value]}
+          result.pop
+          result << new_token
+        else
+          result << token
+        end
+      end
+      result
+    end
+
+    # ameba:disable Metrics/CyclomaticComplexity
+    def self.from_xml(xml : String) : Lexer
+      l = Lexer.new
+      l.xml = xml
+      lexer = XML.parse(xml).first_element_child
+      if lexer
+        config = lexer.children.find { |node|
+          node.name == "config"
+        }
+        if config
+          l.config = {
+            name: xml_to_s(config, name) || "",
+            aliases: xml_to_a(config, _alias) || [] of String,
+            filenames: xml_to_a(config, filename) || [] of String,
+            mime_types: xml_to_a(config, mime_type) || [] of String,
+            priority: xml_to_f(config, priority) || 0.0,
+            not_multiline: xml_to_s(config, not_multiline) == "true",
+            dot_all: xml_to_s(config, dot_all) == "true",
+            case_insensitive: xml_to_s(config, case_insensitive) == "true",
+            ensure_nl: xml_to_s(config, ensure_nl) == "true",
+          }
+        end
+
+        rules = lexer.children.find { |node|
+          node.name == "rules"
+        }
+        if rules
+          # Rules contains states 🤷
+          rules.children.select { |node|
+            node.name == "state"
+          }.each do |state_node|
+            state = State.new
+            state.name = state_node["name"]
+            if l.states.has_key?(state.name)
+              raise Exception.new("Duplicate state: #{state.name}")
+            else
+              l.states[state.name] = state
+            end
+            # And states contain rules 🤷
+            state_node.children.select { |node|
+              node.name == "rule"
+            }.each do |rule_node|
+              case rule_node["pattern"]?
+              when nil
+                if rule_node.first_element_child.try &.name == "include"
+                  rule = IncludeStateRule.new(rule_node)
+                else
+                  rule = UnconditionalRule.new(rule_node)
+                end
+              else
+                rule = Rule.new(rule_node,
+                  multiline: !l.config[:not_multiline],
+                  dotall: l.config[:dot_all],
+                  ignorecase: l.config[:case_insensitive])
+              end
+              state.rules << rule
+            end
+          end
+        end
+      end
+      l
+    end
+  end
+
+  # A Lexer state. A state has a name and a list of rules.
+  # The state machine has a state stack containing references
+  # to states to decide which rules to apply.
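+  #
+  # The `+` operator below merges two states: the result carries the
+  # concatenated rules of both under a fresh random name. A hypothetical
+  # example, assuming a lexer with "root" and "string" states:
+  #
+  #     combined = lexer.states["root"] + lexer.states["string"]
+  #     combined.rules == lexer.states["root"].rules + lexer.states["string"].rules # => true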
+  class State
+    property name : String = ""
+    property rules = [] of Rule
+
+    def +(other : State)
+      new_state = State.new
+      new_state.name = Random.base58(8)
+      new_state.rules = rules + other.rules
+      new_state
+    end
+  end
+
+  # A token, the output of the tokenizer
+  alias Token = NamedTuple(type: String, value: String)
+
+  def self.lexer(name : String) : Lexer
+    Lexer.from_xml(LexerFiles.get("/#{name}.xml").gets_to_end)
+  end
+end
diff --git a/src/rules.cr b/src/rules.cr
index 7761e4a..d84e85b 100644
--- a/src/rules.cr
+++ b/src/rules.cr
@@ -3,7 +3,7 @@ require "./constants"
 require "./formatter"
 require "./rules"
 require "./styles"
-require "./tartrazine"
+require "./lexer"
 
 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
diff --git a/src/tartrazine.cr b/src/tartrazine.cr
index e6a7a84..8665f0f 100644
--- a/src/tartrazine.cr
+++ b/src/tartrazine.cr
@@ -15,186 +15,6 @@ module Tartrazine
   VERSION = "0.1.1"
 
   Log = ::Log.for("tartrazine")
-
-  # This implements a lexer for Pygments RegexLexers as expressed
-  # in Chroma's XML serialization.
-  #
-  # For explanations on what actions and states do
-  # the Pygments documentation is a good place to start.
-  # https://pygments.org/docs/lexerdevelopment/
-
-  # A Lexer state. A state has a name and a list of rules.
-  # The state machine has a state stack containing references
-  # to states to decide which rules to apply.
-  class State
-    property name : String = ""
-    property rules = [] of Rule
-
-    def +(other : State)
-      new_state = State.new
-      new_state.name = Random.base58(8)
-      new_state.rules = rules + other.rules
-      new_state
-    end
-  end
-
-  class LexerFiles
-    extend BakedFileSystem
-
-    bake_folder "../lexers", __DIR__
-  end
-
-  # A token, the output of the tokenizer
-  alias Token = NamedTuple(type: String, value: String)
-
-  class Lexer
-    property config = {
-      name: "",
-      aliases: [] of String,
-      filenames: [] of String,
-      mime_types: [] of String,
-      priority: 0.0,
-      case_insensitive: false,
-      dot_all: false,
-      not_multiline: false,
-      ensure_nl: false,
-    }
-    property xml : String = ""
-
-    property states = {} of String => State
-
-    property state_stack = ["root"]
-
-    # Turn the text into a list of tokens. The `usingself` parameter
-    # is true when the lexer is being used to tokenize a string
-    # from a larger text that is already being tokenized.
-    # So, when it's true, we don't modify the text.
-    def tokenize(text, usingself = false) : Array(Token)
-      @state_stack = ["root"]
-      tokens = [] of Token
-      pos = 0
-      matched = false
-
-      # Respect the `ensure_nl` config option
-      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
-        text += "\n"
-      end
-
-      # Loop through the text, applying rules
-      while pos < text.size
-        state = states[@state_stack.last]
-        # Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
-        state.rules.each do |rule|
-          matched, new_pos, new_tokens = rule.match(text, pos, self)
-          if matched
-            # Move position forward, save the tokens,
-            # tokenize from the new position
-            # Log.trace { "MATCHED: #{rule.xml}" }
-            pos = new_pos
-            tokens += new_tokens
-            break
-          end
-          # Log.trace { "NOT MATCHED: #{rule.xml}" }
-        end
-        # If no rule matches, emit an error token
-        unless matched
-          # Log.trace { "Error at #{pos}" }
-          tokens << {type: "Error", value: "#{text[pos]}"}
-          pos += 1
-        end
-      end
-      Lexer.collapse_tokens(tokens)
-    end
-
-    # Collapse consecutive tokens of the same type for easier comparison
-    # and smaller output
-    def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
-      result = [] of Tartrazine::Token
-      tokens = tokens.reject { |token| token[:value] == "" }
-      tokens.each do |token|
-        if result.empty?
-          result << token
-          next
-        end
-        last = result.last
-        if last[:type] == token[:type]
-          new_token = {type: last[:type], value: last[:value] + token[:value]}
-          result.pop
-          result << new_token
-        else
-          result << token
-        end
-      end
-      result
-    end
-
-    # ameba:disable Metrics/CyclomaticComplexity
-    def self.from_xml(xml : String) : Lexer
-      l = Lexer.new
-      l.xml = xml
-      lexer = XML.parse(xml).first_element_child
-      if lexer
-        config = lexer.children.find { |node|
-          node.name == "config"
-        }
-        if config
-          l.config = {
-            name: xml_to_s(config, name) || "",
-            aliases: xml_to_a(config, _alias) || [] of String,
-            filenames: xml_to_a(config, filename) || [] of String,
-            mime_types: xml_to_a(config, mime_type) || [] of String,
-            priority: xml_to_f(config, priority) || 0.0,
-            not_multiline: xml_to_s(config, not_multiline) == "true",
-            dot_all: xml_to_s(config, dot_all) == "true",
-            case_insensitive: xml_to_s(config, case_insensitive) == "true",
-            ensure_nl: xml_to_s(config, ensure_nl) == "true",
-          }
-        end
-
-        rules = lexer.children.find { |node|
-          node.name == "rules"
-        }
-        if rules
-          # Rules contains states 🤷
-          rules.children.select { |node|
-            node.name == "state"
-          }.each do |state_node|
-            state = State.new
-            state.name = state_node["name"]
-            if l.states.has_key?(state.name)
-              raise Exception.new("Duplicate state: #{state.name}")
-            else
-              l.states[state.name] = state
-            end
-            # And states contain rules 🤷
-            state_node.children.select { |node|
-              node.name == "rule"
-            }.each do |rule_node|
-              case rule_node["pattern"]?
-              when nil
-                if rule_node.first_element_child.try &.name == "include"
-                  rule = IncludeStateRule.new(rule_node)
-                else
-                  rule = UnconditionalRule.new(rule_node)
-                end
-              else
-                rule = Rule.new(rule_node,
-                  multiline: !l.config[:not_multiline],
-                  dotall: l.config[:dot_all],
-                  ignorecase: l.config[:case_insensitive])
-              end
-              state.rules << rule
-            end
-          end
-        end
-      end
-      l
-    end
-  end
-
-  def self.lexer(name : String) : Lexer
-    Lexer.from_xml(LexerFiles.get("/#{name}.xml").gets_to_end)
-  end
 end
 
 # Convenience macros to parse XML