This commit is contained in:
Roberto Alsina 2024-08-04 19:54:25 -03:00
parent ab263ac26f
commit 70cfbef572
3 changed files with 119 additions and 112 deletions

102
src/actions.cr Normal file
View File

@ -0,0 +1,102 @@
# These are Lexer actions. When a rule matches, it performs a list of
# actions. An action can emit tokens, change the lexer's state stack,
# or hand the matched text to another lexer.
module Tartrazine
  # One action of a lexer rule, built from an XML element of the lexer
  # definition.
  #
  # `type` is the element name and selects what `#emit` does. The types
  # handled are "token", "push", "pop", "bygroups", "using", "usingself"
  # and "combined". `xml` is the element itself, from which attributes
  # are read at emit time. `actions` holds the actions built from the
  # element's child elements.
  class Action
    property type : String
    property xml : XML::Node
    property actions : Array(Action) = [] of Action

    # Builds the action for *xml* and, recursively, one nested action
    # per child element.
    #
    # *xml* is restricted to `XML::Node` (not nilable) to agree with the
    # declared type of the `xml` property; it is dereferenced right away.
    def initialize(@type : String, @xml : XML::Node)
      # Some actions may have actions in them, like this:
      # <bygroups>
      #   <token type="GenericPrompt"/>
      #   <token type="Text"/>
      #   <using lexer="bash"/>
      # </bygroups>
      #
      # The token actions match with the first 2 groups in the regex,
      # the using action matches the 3rd and shunts it to another lexer.
      @xml.children.each do |node|
        # Skip text/comment nodes, only elements describe actions
        next unless node.element?
        @actions << Action.new(node.name, node)
      end
    end

    # Performs this action and returns the tokens it produces (possibly
    # none).
    #
    # *match* is the regex match of the rule that triggered the action,
    # or nil when there is none. *lexer* is the lexer whose `state_stack`
    # and `states` may be modified. *match_group* is the capture group
    # this action applies to (0 is the whole match).
    #
    # Raises `Exception` if `type` is not a known action type, or if a
    # "token"/"bygroups" action is emitted without a match.
    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
      case type
      when "token"
        # Emit the text of the selected group as a single token
        raise Exception.new "Can't have a token without a match" if match.nil?
        [Token.new(type: xml["type"], value: match[match_group])]
      when "push"
        states_to_push = xml.attributes.select { |a| a.name == "state" }.map &.content
        if states_to_push.empty?
          # Push without a state means push the current state
          states_to_push = [lexer.state_stack.last]
        end
        states_to_push.each do |state|
          if state == "#pop"
            # "#pop" as a pushed state means: pop the current state
            lexer.state_stack.pop
          else
            # Really push
            lexer.state_stack << state
          end
        end
        [] of Token
      when "pop"
        depth = xml["depth"].to_i
        # Never pop the bottom state: like Pygments' RegexLexer, a pop
        # that asks for more states than are available pops down to the
        # bottom one instead of being ignored.
        to_pop = Math.min(depth, lexer.state_stack.size - 1)
        lexer.state_stack.pop(to_pop) if to_pop > 0
        [] of Token
      when "bygroups"
        # FIXME: handle
        # <bygroups>
        #   <token type="Punctuation"/>
        #   None
        #   <token type="LiteralStringRegex"/>
        #
        # where that None means skipping a group
        #
        raise Exception.new "Can't have a token without a match" if match.nil?
        # The i-th nested action is applied to capture group i + 1
        result = [] of Token
        @actions.each_with_index do |e, i|
          # Groups that did not participate in the match emit nothing
          next if match[i + 1]?.nil?
          result.concat(e.emit(match, lexer, i + 1))
        end
        result
      when "using"
        # Shunt the matched text to another lexer entirely, looked up
        # by its lowercased name in LEXERS
        return [] of Token if match.nil?
        lexer_name = xml["lexer"].downcase
        LEXERS[lexer_name].tokenize(match[match_group], usingself: true)
      when "usingself"
        # Shunt the matched text to a fresh copy of this lexer, built
        # from the same XML definition
        return [] of Token if match.nil?
        new_lexer = Lexer.from_xml(lexer.xml)
        new_lexer.tokenize(match[match_group], usingself: true)
      when "combined"
        # Combine the named states into one anonymous state, register
        # it in the lexer and enter it
        states = xml.attributes.select { |a| a.name == "state" }.map &.content
        new_state = states.map { |name| lexer.states[name] }.reduce { |s1, s2| s1 + s2 }
        lexer.states[new_state.name] = new_state
        lexer.state_stack << new_state.name
        [] of Token
      else
        raise Exception.new("Unknown action type: #{type}: #{xml}")
      end
    end
  end
end

View File

@ -1,3 +1,5 @@
require "./actions"
# These are lexer rules. They match with the text being parsed # These are lexer rules. They match with the text being parsed
# and perform actions, either emitting tokens or changing the # and perform actions, either emitting tokens or changing the
# state of the lexer. # state of the lexer.
@ -5,7 +7,7 @@ module Tartrazine
# This rule matches via a regex pattern # This rule matches via a regex pattern
class Rule class Rule
property pattern : Regex = Regex.new "" property pattern : Regex = Regex.new ""
property emitters : Array(Emitter) = [] of Emitter property actions : Array(Action) = [] of Action
property xml : String = "foo" property xml : String = "foo"
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
@ -16,9 +18,9 @@ module Tartrazine
# pp! match, pattern.inspect, text, pos # pp! match, pattern.inspect, text, pos
return false, pos, [] of Token if match.nil? || match.end == 0 return false, pos, [] of Token if match.nil? || match.end == 0
# Emit the tokens # Emit the tokens
emitters.each do |emitter| actions.each do |action|
# Emit the token # Emit the token
tokens += emitter.emit(match, lexer) tokens += action.emit(match, lexer)
end end
# p! xml, match.end, tokens # p! xml, match.end, tokens
return true, match.end, tokens return true, match.end, tokens
@ -27,13 +29,13 @@ module Tartrazine
def initialize(node : XML::Node, flags) def initialize(node : XML::Node, flags)
@xml = node.to_s @xml = node.to_s
@pattern = Regex.new(node["pattern"], flags) @pattern = Regex.new(node["pattern"], flags)
add_emitters(node) add_actions(node)
end end
def add_emitters(node : XML::Node) def add_actions(node : XML::Node)
node.children.each do |node| node.children.each do |node|
next unless node.element? next unless node.element?
@emitters << Emitter.new(node.name, node) @actions << Action.new(node.name, node)
end end
end end
end end
@ -57,24 +59,23 @@ module Tartrazine
@xml = node.to_s @xml = node.to_s
include_node = node.children.find { |n| n.name == "include" } include_node = node.children.find { |n| n.name == "include" }
@state = include_node["state"] if include_node @state = include_node["state"] if include_node
add_emitters(node) add_actions(node)
end end
end end
# This rule always matches, unconditionally # This rule always matches, unconditionally
class Always < Rule class UnconditionalRule < Rule
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
tokens = [] of Token tokens = [] of Token
emitters.each do |emitter| actions.each do |action|
tokens += emitter.emit(nil, lexer) tokens += action.emit(nil, lexer)
end end
return true, pos, tokens return true, pos, tokens
end end
def initialize(node : XML::Node) def initialize(node : XML::Node)
@xml = node.to_s @xml = node.to_s
add_emitters(node) add_actions(node)
end end
end end
end end

View File

@ -2,13 +2,14 @@ require "base58"
require "json" require "json"
require "xml" require "xml"
require "./rules" require "./rules"
require "./actions"
module Tartrazine module Tartrazine
VERSION = "0.1.0" VERSION = "0.1.0"
# This implements a lexer for Pygments RegexLexers as expressed # This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization. # in Chroma's XML serialization.
# #
# For explanations on what emitters, transformers, etc do # For explanations on what actions, transformers, etc do
# the Pygments documentation is a good place to start. # the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/ # https://pygments.org/docs/lexerdevelopment/
class State class State
@ -24,103 +25,6 @@ module Tartrazine
end end
class Emitter
property type : String
property xml : XML::Node
property emitters : Array(Emitter) = [] of Emitter
def initialize(@type : String, @xml : XML::Node?)
# Some emitters may have emitters in them, like this:
# <bygroups>
# <token type="GenericPrompt"/>
# <token type="Text"/>
# <using lexer="bash"/>
# </bygroups>
#
# The token emitters match with the first 2 groups in the regex
# the using emitter matches the 3rd and shunts it to another lexer
@xml.children.each do |node|
next unless node.element?
@emitters << Emitter.new(node.name, node)
end
end
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
case type
when "token"
raise Exception.new "Can't have a token without a match" if match.nil?
[Token.new(type: xml["type"], value: match[match_group])]
when "push"
states_to_push = xml.attributes.select { |a| a.name == "state" }.map &.content
if states_to_push.empty?
# Push without a state means push the current state
states_to_push = [lexer.state_stack.last]
end
states_to_push.each do |state|
if state == "#pop"
# Pop the state
# puts "Popping state"
lexer.state_stack.pop
else
# Really push
lexer.state_stack << state
# puts "Pushed #{lexer.state_stack}"
end
end
[] of Token
when "pop"
depth = xml["depth"].to_i
# puts "Popping #{depth} states"
if lexer.state_stack.size <= depth
# puts "Can't pop #{depth} states, only have #{lexer.state_stack.size}"
else
lexer.state_stack.pop(depth)
end
[] of Token
when "bygroups"
# FIXME: handle
# ><bygroups>
# <token type="Punctuation"/>
# None
# <token type="LiteralStringRegex"/>
#
# where that None means skipping a group
#
raise Exception.new "Can't have a token without a match" if match.nil?
# Each group matches an emitter
result = [] of Token
@emitters.each_with_index do |e, i|
next if match[i + 1]?.nil?
result += e.emit(match, lexer, i + 1)
end
result
when "using"
# Shunt to another lexer entirely
return [] of Token if match.nil?
lexer_name = xml["lexer"].downcase
# pp! "to tokenize:", match[match_group]
LEXERS[lexer_name].tokenize(match[match_group], usingself: true)
when "usingself"
# Shunt to another copy of this lexer
return [] of Token if match.nil?
new_lexer = Lexer.from_xml(lexer.xml)
# pp! "to tokenize:", match[match_group]
new_lexer.tokenize(match[match_group], usingself: true)
when "combined"
# Combine two states into one anonymous state
states = xml.attributes.select { |a| a.name == "state" }.map &.content
new_state = states.map { |name| lexer.states[name] }.reduce { |s1, s2| s1 + s2 }
lexer.states[new_state.name] = new_state
lexer.state_stack << new_state.name
[] of Token
else
raise Exception.new("Unknown emitter type: #{type}: #{xml}")
end
end
end
alias Token = NamedTuple(type: String, value: String) alias Token = NamedTuple(type: String, value: String)
@ -215,7 +119,7 @@ module Tartrazine
if rule_node.first_element_child.try &.name == "include" if rule_node.first_element_child.try &.name == "include"
rule = IncludeStateRule.new(rule_node) rule = IncludeStateRule.new(rule_node)
else else
rule = Always.new(rule_node) rule = UnconditionalRule.new(rule_node)
end end
else else
flags = Regex::Options::ANCHORED flags = Regex::Options::ANCHORED