Compare commits

..

No commits in common. "788577b226bd707d240dd07bb2122cabca0e2f9e" and "cb09dff9f16261661e23d79639ec1ad2e140e854" have entirely different histories.

4 changed files with 58 additions and 59 deletions

View File

@ -8,29 +8,19 @@ require "./tartrazine"
# perform a list of actions. These actions can emit tokens # perform a list of actions. These actions can emit tokens
# or change the state machine. # or change the state machine.
module Tartrazine module Tartrazine
enum ActionType
Bygroups
Combined
Include
Pop
Push
Token
Using
Usingself
end
struct Action struct Action
property actions : Array(Action) = [] of Action property actions : Array(Action) = [] of Action
property type : String
@depth : Int32 = 0 @depth : Int32 = 0
@lexer_name : String = "" @lexer_name : String = ""
@states : Array(String) = [] of String @states : Array(String) = [] of String
@states_to_push : Array(String) = [] of String @states_to_push : Array(String) = [] of String
@token_type : String = "" @token_type : String = ""
@type : ActionType = ActionType::Token
def initialize(t : String, xml : XML::Node?) def initialize(@type : String, xml : XML::Node?)
@type = ActionType.parse(t.capitalize) known_types = %w(token push pop combined bygroups include using usingself)
raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
# Some actions may have actions in them, like this: # Some actions may have actions in them, like this:
# <bygroups> # <bygroups>
@ -47,18 +37,18 @@ module Tartrazine
end end
# Prefetch the attributes we need from the XML and keep them # Prefetch the attributes we need from the XML and keep them
case @type case type
when ActionType::Token when "token"
@token_type = xml["type"] @token_type = xml["type"]
when ActionType::Push when "push"
@states_to_push = xml.attributes.select { |attrib| @states_to_push = xml.attributes.select { |attrib|
attrib.name == "state" attrib.name == "state"
}.map &.content }.map &.content
when ActionType::Pop when "pop"
@depth = xml["depth"].to_i @depth = xml["depth"].to_i
when ActionType::Using when "using"
@lexer_name = xml["lexer"].downcase @lexer_name = xml["lexer"].downcase
when ActionType::Combined when "combined"
@states = xml.attributes.select { |attrib| @states = xml.attributes.select { |attrib|
attrib.name == "state" attrib.name == "state"
}.map &.content }.map &.content
@ -67,11 +57,11 @@ module Tartrazine
# ameba:disable Metrics/CyclomaticComplexity # ameba:disable Metrics/CyclomaticComplexity
def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token) def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
case @type case type
when ActionType::Token when "token"
raise Exception.new "Can't have a token without a match" if match.empty? raise Exception.new "Can't have a token without a match" if match.empty?
[Token.new(type: @token_type, value: String.new(match[match_group].value))] [Token.new(type: @token_type, value: String.new(match[match_group].value))]
when ActionType::Push when "push"
to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
to_push.each do |state| to_push.each do |state|
if state == "#pop" && lexer.state_stack.size > 1 if state == "#pop" && lexer.state_stack.size > 1
@ -83,11 +73,11 @@ module Tartrazine
end end
end end
[] of Token [] of Token
when ActionType::Pop when "pop"
to_pop = [@depth, lexer.state_stack.size - 1].min to_pop = [@depth, lexer.state_stack.size - 1].min
lexer.state_stack.pop(to_pop) lexer.state_stack.pop(to_pop)
[] of Token [] of Token
when ActionType::Bygroups when "bygroups"
# FIXME: handle # FIXME: handle
# ><bygroups> # ><bygroups>
# <token type="Punctuation"/> # <token type="Punctuation"/>
@ -112,17 +102,17 @@ module Tartrazine
result += e.emit(match, lexer, i + 1) result += e.emit(match, lexer, i + 1)
end end
result result
when ActionType::Using when "using"
# Shunt to another lexer entirely # Shunt to another lexer entirely
return [] of Token if match.empty? return [] of Token if match.empty?
Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), secondary: true) Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
when ActionType::Usingself when "usingself"
# Shunt to another copy of this lexer # Shunt to another copy of this lexer
return [] of Token if match.empty? return [] of Token if match.empty?
new_lexer = lexer.copy new_lexer = lexer.copy
new_lexer.tokenize(String.new(match[match_group].value), secondary: true) new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
when ActionType::Combined when "combined"
# Combine two or more states into one anonymous state # Combine two states into one anonymous state
new_state = @states.map { |name| new_state = @states.map { |name|
lexer.states[name] lexer.states[name]
}.reduce { |state1, state2| }.reduce { |state1, state2|
@ -132,7 +122,7 @@ module Tartrazine
lexer.state_stack << new_state.name lexer.state_stack << new_state.name
[] of Token [] of Token
else else
raise Exception.new("Unknown action type: #{@type}") raise Exception.new("Unknown action type: #{type}")
end end
end end
end end

View File

@ -31,6 +31,7 @@ module BytesRegex
end end
def match(str : Bytes, pos = 0) : Array(Match) def match(str : Bytes, pos = 0) : Array(Match)
match = [] of Match
rc = LibPCRE2.match( rc = LibPCRE2.match(
@re, @re,
str, str,
@ -41,23 +42,22 @@ module BytesRegex
nil) nil)
if rc > 0 if rc > 0
ovector = LibPCRE2.get_ovector_pointer(@match_data) ovector = LibPCRE2.get_ovector_pointer(@match_data)
(0...rc).map do |i| (0...rc).each do |i|
m_start = ovector[2 * i] m_start = ovector[2 * i]
m_end = ovector[2 * i + 1] m_size = ovector[2 * i + 1] - m_start
if m_start == m_end if m_size == 0
m_value = Bytes.new(0) m_value = Bytes.new(0)
else else
m_value = str[m_start...m_end] m_value = str[m_start...m_start + m_size]
end end
Match.new(m_value, m_start, m_end - m_start) match << Match.new(m_value, m_start, m_size)
end end
else
[] of Match
end end
match
end end
end end
struct Match class Match
property value : Bytes property value : Bytes
property start : UInt64 property start : UInt64
property size : UInt64 property size : UInt64

View File

@ -4,6 +4,7 @@ require "./constants/lexers"
module Tartrazine module Tartrazine
class LexerFiles class LexerFiles
extend BakedFileSystem extend BakedFileSystem
bake_folder "../lexers", __DIR__ bake_folder "../lexers", __DIR__
end end
@ -43,9 +44,12 @@ module Tartrazine
# For explanations on what actions and states do # For explanations on what actions and states do
# the Pygments documentation is a good place to start. # the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/ # https://pygments.org/docs/lexerdevelopment/
struct Lexer class Lexer
property config = { property config = {
name: "", name: "",
aliases: [] of String,
filenames: [] of String,
mime_types: [] of String,
priority: 0.0, priority: 0.0,
case_insensitive: false, case_insensitive: false,
dot_all: false, dot_all: false,
@ -60,49 +64,49 @@ module Tartrazine
new_lexer = Lexer.new new_lexer = Lexer.new
new_lexer.config = config new_lexer.config = config
new_lexer.states = states new_lexer.states = states
new_lexer.state_stack = ["root"] new_lexer.state_stack = state_stack[0..-1]
new_lexer new_lexer
end end
# Turn the text into a list of tokens. The `secondary` parameter # Turn the text into a list of tokens. The `usingself` parameter
# is true when the lexer is being used to tokenize a string # is true when the lexer is being used to tokenize a string
# from a larger text that is already being tokenized. # from a larger text that is already being tokenized.
# So, when it's true, we don't modify the text. # So, when it's true, we don't modify the text.
def tokenize(text : String, secondary = false) : Array(Token) def tokenize(text : String, usingself = false) : Array(Token)
@state_stack = ["root"] @state_stack = ["root"]
tokens = [] of Token tokens = [] of Token
pos = 0 pos = 0
matched = false matched = false
# Respect the `ensure_nl` config option # Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !secondary if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
text += "\n" text += "\n"
end end
# We operate in bytes from now on
text_bytes = text.to_slice text_bytes = text.to_slice
# Loop through the text, matching rules # Loop through the text, applying rules
while pos < text_bytes.size while pos < text_bytes.size
states[@state_stack.last].rules.each do |rule| state = states[@state_stack.last]
# Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
state.rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text_bytes, pos, self) matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
if matched if matched
# Move position forward, save the tokens # Move position forward, save the tokens,
# tokenize from the new position
pos = new_pos pos = new_pos
tokens += new_tokens tokens += new_tokens
# Start matching rules at new position
break break
end end
end end
if !matched # If no rule matches, emit an error token
# at EOL, emit the newline, reset state to "root" unless matched
if text_bytes[pos] == 10u8 if text_bytes[pos] == 10u8
# at EOL, reset state to "root"
tokens << {type: "Text", value: "\n"} tokens << {type: "Text", value: "\n"}
@state_stack = ["root"] @state_stack = ["root"]
else else
# Emit an error token
tokens << {type: "Error", value: String.new(text_bytes[pos..pos])} tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
end end
# Move forward 1
pos += 1 pos += 1
end end
end end
@ -166,6 +170,9 @@ module Tartrazine
if config if config
l.config = { l.config = {
name: xml_to_s(config, name) || "", name: xml_to_s(config, name) || "",
aliases: xml_to_a(config, _alias) || [] of String,
filenames: xml_to_a(config, filename) || [] of String,
mime_types: xml_to_a(config, mime_type) || [] of String,
priority: xml_to_f(config, priority) || 0.0, priority: xml_to_f(config, priority) || 0.0,
not_multiline: xml_to_s(config, not_multiline) == "true", not_multiline: xml_to_s(config, not_multiline) == "true",
dot_all: xml_to_s(config, dot_all) == "true", dot_all: xml_to_s(config, dot_all) == "true",

View File

@ -19,7 +19,7 @@ module Tartrazine
abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token)) abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
abstract def initialize(node : XML::Node) abstract def initialize(node : XML::Node)
@actions : Array(Action) = [] of Action property actions : Array(Action) = [] of Action
def add_actions(node : XML::Node) def add_actions(node : XML::Node)
node.children.each do |child| node.children.each do |child|
@ -31,13 +31,14 @@ module Tartrazine
struct Rule < BaseRule struct Rule < BaseRule
property pattern : Regex = Regex.new "" property pattern : Regex = Regex.new ""
property actions : Array(Action) = [] of Action
def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token)) def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos) match = pattern.match(text, pos)
# No match # No match
return false, pos, [] of Token if match.size == 0 return false, pos, [] of Token if match.size == 0
return true, pos + match[0].size, @actions.flat_map(&.emit(match, lexer)) return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
end end
def initialize(node : XML::Node) def initialize(node : XML::Node)
@ -54,10 +55,11 @@ module Tartrazine
# This rule includes another state. If any of the rules of the # This rule includes another state. If any of the rules of the
# included state matches, this rule matches. # included state matches, this rule matches.
struct IncludeStateRule < BaseRule struct IncludeStateRule < BaseRule
@state : String = "" property state : String = ""
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
lexer.states[@state].rules.each do |rule| Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
lexer.states[state].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, lexer) matched, new_pos, new_tokens = rule.match(text, pos, lexer)
return true, new_pos, new_tokens if matched return true, new_pos, new_tokens if matched
end end
@ -78,7 +80,7 @@ module Tartrazine
NO_MATCH = [] of Match NO_MATCH = [] of Match
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token)) def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
return true, pos, @actions.flat_map(&.emit(NO_MATCH, lexer)) return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
end end
def initialize(node : XML::Node) def initialize(node : XML::Node)