mirror of https://github.com/ralsina/tartrazine.git
synced 2024-11-10 05:22:23 +00:00

refactor

This commit is contained in:
parent ab263ac26f
commit 70cfbef572

102 src/actions.cr (new file)
@@ -0,0 +1,102 @@
+# These are Lexer actions. When a rule matches, it will
+# perform a list of actions. These actions can emit tokens
+# or change the state machine.
+module Tartrazine
+  class Action
+    property type : String
+    property xml : XML::Node
+    property actions : Array(Action) = [] of Action
+
+    def initialize(@type : String, @xml : XML::Node?)
+      # Some actions may have actions in them, like this:
+      # <bygroups>
+      # <token type="GenericPrompt"/>
+      # <token type="Text"/>
+      # <using lexer="bash"/>
+      # </bygroups>
+      #
+      # The token actions match the first two groups in the regex;
+      # the using action matches the third and shunts it to another lexer.
+      @xml.children.each do |node|
+        next unless node.element?
+        @actions << Action.new(node.name, node)
+      end
+    end
+
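
A minimal standalone sketch of the constructor's XML walk above (assuming only Crystal's stdlib XML module; the snippet and its output are illustrative, not part of the commit):

require "xml"

doc = XML.parse(<<-XML)
  <bygroups>
    <token type="GenericPrompt"/>
    <token type="Text"/>
    <using lexer="bash"/>
  </bygroups>
  XML

# Element children become one nested Action each; text nodes are skipped.
root = doc.first_element_child.not_nil!
pp root.children.select(&.element?).map(&.name) # => ["token", "token", "using"]
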
+    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
+      case type
+      when "token"
+        raise Exception.new "Can't have a token without a match" if match.nil?
+        [Token.new(type: xml["type"], value: match[match_group])]
+      when "push"
+        states_to_push = xml.attributes.select { |a| a.name == "state" }.map &.content
+        if states_to_push.empty?
+          # Push without a state means push the current state
+          states_to_push = [lexer.state_stack.last]
+        end
+        states_to_push.each do |state|
+          if state == "#pop"
+            # Pop the state
+            # puts "Popping state"
+            lexer.state_stack.pop
+          else
+            # Really push
+            lexer.state_stack << state
+            # puts "Pushed #{lexer.state_stack}"
+          end
+        end
+        [] of Token
+      when "pop"
+        depth = xml["depth"].to_i
+        # puts "Popping #{depth} states"
+        if lexer.state_stack.size <= depth
+          # puts "Can't pop #{depth} states, only have #{lexer.state_stack.size}"
+        else
+          lexer.state_stack.pop(depth)
+        end
+        [] of Token
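
The push/pop bookkeeping above, sketched standalone with a plain array standing in for lexer.state_stack (values are illustrative):

state_stack = ["root"]
state_stack << "string"          # <push state="string"/>
state_stack << state_stack.last  # <push/> with no state duplicates the top
state_stack.pop                  # a "#pop" state inside a push
state_stack.pop(1)               # <pop depth="1"/>
pp state_stack # => ["root"]
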
when "bygroups"
|
||||||
|
# FIXME: handle
|
||||||
|
# ><bygroups>
|
||||||
|
# <token type="Punctuation"/>
|
||||||
|
# None
|
||||||
|
# <token type="LiteralStringRegex"/>
|
||||||
|
#
|
||||||
|
# where that None means skipping a group
|
||||||
|
#
|
||||||
|
raise Exception.new "Can't have a token without a match" if match.nil?
|
||||||
|
|
||||||
|
# Each group matches an action
|
||||||
|
|
||||||
|
result = [] of Token
|
||||||
|
@actions.each_with_index do |e, i|
|
||||||
|
next if match[i + 1]?.nil?
|
||||||
|
result += e.emit(match, lexer, i + 1)
|
||||||
|
end
|
||||||
|
result
|
||||||
|
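
A standalone sketch of the bygroups dispatch above: regex group i + 1 feeds action i, and nil groups are skipped (the pattern and token types here are made up for illustration):

match = "user@host$ ls".match(/(\S+\$)( )(.*)/).not_nil!
types = ["GenericPrompt", "Text", "Other"]
tokens = [] of NamedTuple(type: String, value: String)
types.each_with_index do |t, i|
  next if match[i + 1]?.nil?
  tokens << {type: t, value: match[i + 1]}
end
pp tokens # first token: {type: "GenericPrompt", value: "user@host$"}
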
when "using"
|
||||||
|
# Shunt to another lexer entirely
|
||||||
|
return [] of Token if match.nil?
|
||||||
|
lexer_name = xml["lexer"].downcase
|
||||||
|
# pp! "to tokenize:", match[match_group]
|
||||||
|
LEXERS[lexer_name].tokenize(match[match_group], usingself: true)
|
||||||
|
when "usingself"
|
||||||
|
# Shunt to another copy of this lexer
|
||||||
|
return [] of Token if match.nil?
|
||||||
|
|
||||||
|
new_lexer = Lexer.from_xml(lexer.xml)
|
||||||
|
# pp! "to tokenize:", match[match_group]
|
||||||
|
new_lexer.tokenize(match[match_group], usingself: true)
|
||||||
|
when "combined"
|
||||||
|
# Combine two states into one anonymous state
|
||||||
|
states = xml.attributes.select { |a| a.name == "state" }.map &.content
|
||||||
|
new_state = states.map { |name| lexer.states[name] }.reduce { |s1, s2| s1 + s2 }
|
||||||
|
lexer.states[new_state.name] = new_state
|
||||||
|
lexer.state_stack << new_state.name
|
||||||
|
[] of Token
|
||||||
|
else
|
||||||
|
raise Exception.new("Unknown action type: #{type}: #{xml}")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
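
The "combined" branch above concatenates the rules of several named states under a new anonymous state; the same merge, sketched with arrays of strings standing in for State objects:

states = {"root" => ["rule1", "rule2"], "string" => ["rule3"]}
combined = ["root", "string"].map { |name| states[name] }.reduce { |s1, s2| s1 + s2 }
states["root+string"] = combined
pp states["root+string"] # => ["rule1", "rule2", "rule3"]
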
25 src/rules.cr
@@ -1,3 +1,5 @@
+require "./actions"
+
 # These are lexer rules. They match with the text being parsed
 # and perform actions, either emitting tokens or changing the
 # state of the lexer.
@@ -5,7 +7,7 @@ module Tartrazine
   # This rule matches via a regex pattern
   class Rule
     property pattern : Regex = Regex.new ""
-    property emitters : Array(Emitter) = [] of Emitter
+    property actions : Array(Action) = [] of Action
     property xml : String = "foo"

     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
@@ -16,9 +18,9 @@ module Tartrazine
       # pp! match, pattern.inspect, text, pos
       return false, pos, [] of Token if match.nil? || match.end == 0
       # Emit the tokens
-      emitters.each do |emitter|
+      actions.each do |action|
         # Emit the token
-        tokens += emitter.emit(match, lexer)
+        tokens += action.emit(match, lexer)
       end
       # p! xml, match.end, tokens
       return true, match.end, tokens
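
A standalone sketch of the match-at-position check in Rule#match: ANCHORED makes the regex match only at the given offset, so nil (or a zero-length match) means the rule does not apply there:

pattern = Regex.new("[a-z]+", Regex::Options::ANCHORED)
pp pattern.match("abc 123", 0).try &.end # => 3 ("abc" matched at position 0)
pp pattern.match("abc 123", 3)           # => nil (no match starting at position 3)
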
@@ -27,13 +29,13 @@ module Tartrazine
     def initialize(node : XML::Node, flags)
       @xml = node.to_s
       @pattern = Regex.new(node["pattern"], flags)
-      add_emitters(node)
+      add_actions(node)
     end

-    def add_emitters(node : XML::Node)
+    def add_actions(node : XML::Node)
       node.children.each do |node|
         next unless node.element?
-        @emitters << Emitter.new(node.name, node)
+        @actions << Action.new(node.name, node)
       end
     end
   end
@@ -57,24 +59,23 @@ module Tartrazine
       @xml = node.to_s
       include_node = node.children.find { |n| n.name == "include" }
       @state = include_node["state"] if include_node
-      add_emitters(node)
+      add_actions(node)
     end
   end

   # This rule always matches, unconditionally
-  class Always < Rule
+  class UnconditionalRule < Rule
     def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
       tokens = [] of Token
-      emitters.each do |emitter|
-        tokens += emitter.emit(nil, lexer)
+      actions.each do |action|
+        tokens += action.emit(nil, lexer)
       end
       return true, pos, tokens
     end

     def initialize(node : XML::Node)
       @xml = node.to_s
-      add_emitters(node)
+      add_actions(node)
     end
   end

 end
@@ -2,13 +2,14 @@ require "base58"
 require "json"
 require "xml"
 require "./rules"
+require "./actions"
 module Tartrazine
   VERSION = "0.1.0"

   # This implements a lexer for Pygments RegexLexers as expressed
   # in Chroma's XML serialization.
   #
-  # For explanations on what emitters, transformers, etc do
+  # For explanations on what actions, transformers, etc do
   # the Pygments documentation is a good place to start.
   # https://pygments.org/docs/lexerdevelopment/
   class State
@@ -24,103 +25,6 @@ module Tartrazine
     end
-
-
-  class Emitter
-    property type : String
-    property xml : XML::Node
-    property emitters : Array(Emitter) = [] of Emitter
-
-    def initialize(@type : String, @xml : XML::Node?)
-      # Some emitters may have emitters in them, like this:
-      # <bygroups>
-      # <token type="GenericPrompt"/>
-      # <token type="Text"/>
-      # <using lexer="bash"/>
-      # </bygroups>
-      #
-      # The token emitters match with the first 2 groups in the regex
-      # the using emitter matches the 3rd and shunts it to another lexer
-      @xml.children.each do |node|
-        next unless node.element?
-        @emitters << Emitter.new(node.name, node)
-      end
-    end
-
-    def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
-      case type
-      when "token"
-        raise Exception.new "Can't have a token without a match" if match.nil?
-        [Token.new(type: xml["type"], value: match[match_group])]
-      when "push"
-        states_to_push = xml.attributes.select { |a| a.name == "state" }.map &.content
-        if states_to_push.empty?
-          # Push without a state means push the current state
-          states_to_push = [lexer.state_stack.last]
-        end
-        states_to_push.each do |state|
-          if state == "#pop"
-            # Pop the state
-            # puts "Popping state"
-            lexer.state_stack.pop
-          else
-            # Really push
-            lexer.state_stack << state
-            # puts "Pushed #{lexer.state_stack}"
-          end
-        end
-        [] of Token
-      when "pop"
-        depth = xml["depth"].to_i
-        # puts "Popping #{depth} states"
-        if lexer.state_stack.size <= depth
-          # puts "Can't pop #{depth} states, only have #{lexer.state_stack.size}"
-        else
-          lexer.state_stack.pop(depth)
-        end
-        [] of Token
-      when "bygroups"
-        # FIXME: handle
-        # ><bygroups>
-        # <token type="Punctuation"/>
-        # None
-        # <token type="LiteralStringRegex"/>
-        #
-        # where that None means skipping a group
-        #
-        raise Exception.new "Can't have a token without a match" if match.nil?
-
-        # Each group matches an emitter
-
-        result = [] of Token
-        @emitters.each_with_index do |e, i|
-          next if match[i + 1]?.nil?
-          result += e.emit(match, lexer, i + 1)
-        end
-        result
-      when "using"
-        # Shunt to another lexer entirely
-        return [] of Token if match.nil?
-        lexer_name = xml["lexer"].downcase
-        # pp! "to tokenize:", match[match_group]
-        LEXERS[lexer_name].tokenize(match[match_group], usingself: true)
-      when "usingself"
-        # Shunt to another copy of this lexer
-        return [] of Token if match.nil?
-
-        new_lexer = Lexer.from_xml(lexer.xml)
-        # pp! "to tokenize:", match[match_group]
-        new_lexer.tokenize(match[match_group], usingself: true)
-      when "combined"
-        # Combine two states into one anonymous state
-        states = xml.attributes.select { |a| a.name == "state" }.map &.content
-        new_state = states.map { |name| lexer.states[name] }.reduce { |s1, s2| s1 + s2 }
-        lexer.states[new_state.name] = new_state
-        lexer.state_stack << new_state.name
-        [] of Token
-      else
-        raise Exception.new("Unknown emitter type: #{type}: #{xml}")
-      end
-    end
-  end
-
   alias Token = NamedTuple(type: String, value: String)

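
Token, kept as context in this hunk and used throughout, is a plain NamedTuple alias; a two-line illustration:

alias Token = NamedTuple(type: String, value: String)
t = Token.new(type: "Text", value: "hello")
puts t[:type] # => Text
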
@@ -215,7 +119,7 @@ module Tartrazine
         if rule_node.first_element_child.try &.name == "include"
           rule = IncludeStateRule.new(rule_node)
         else
-          rule = Always.new(rule_node)
+          rule = UnconditionalRule.new(rule_node)
         end
       else
         flags = Regex::Options::ANCHORED