mirror of
https://github.com/ralsina/tartrazine.git
synced 2025-06-08 12:40:25 -03:00
Compare commits
No commits in common. "788577b226bd707d240dd07bb2122cabca0e2f9e" and "cb09dff9f16261661e23d79639ec1ad2e140e854" have entirely different histories.
788577b226
...
cb09dff9f1
@ -8,29 +8,19 @@ require "./tartrazine"
|
|||||||
# perform a list of actions. These actions can emit tokens
|
# perform a list of actions. These actions can emit tokens
|
||||||
# or change the state machine.
|
# or change the state machine.
|
||||||
module Tartrazine
|
module Tartrazine
|
||||||
enum ActionType
|
|
||||||
Bygroups
|
|
||||||
Combined
|
|
||||||
Include
|
|
||||||
Pop
|
|
||||||
Push
|
|
||||||
Token
|
|
||||||
Using
|
|
||||||
Usingself
|
|
||||||
end
|
|
||||||
|
|
||||||
struct Action
|
struct Action
|
||||||
property actions : Array(Action) = [] of Action
|
property actions : Array(Action) = [] of Action
|
||||||
|
property type : String
|
||||||
|
|
||||||
@depth : Int32 = 0
|
@depth : Int32 = 0
|
||||||
@lexer_name : String = ""
|
@lexer_name : String = ""
|
||||||
@states : Array(String) = [] of String
|
@states : Array(String) = [] of String
|
||||||
@states_to_push : Array(String) = [] of String
|
@states_to_push : Array(String) = [] of String
|
||||||
@token_type : String = ""
|
@token_type : String = ""
|
||||||
@type : ActionType = ActionType::Token
|
|
||||||
|
|
||||||
def initialize(t : String, xml : XML::Node?)
|
def initialize(@type : String, xml : XML::Node?)
|
||||||
@type = ActionType.parse(t.capitalize)
|
known_types = %w(token push pop combined bygroups include using usingself)
|
||||||
|
raise Exception.new("Unknown action type: #{type}") unless known_types.includes? type
|
||||||
|
|
||||||
# Some actions may have actions in them, like this:
|
# Some actions may have actions in them, like this:
|
||||||
# <bygroups>
|
# <bygroups>
|
||||||
@ -47,18 +37,18 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
|
|
||||||
# Prefetch the attributes we need from the XML and keep them
|
# Prefetch the attributes we need from the XML and keep them
|
||||||
case @type
|
case type
|
||||||
when ActionType::Token
|
when "token"
|
||||||
@token_type = xml["type"]
|
@token_type = xml["type"]
|
||||||
when ActionType::Push
|
when "push"
|
||||||
@states_to_push = xml.attributes.select { |attrib|
|
@states_to_push = xml.attributes.select { |attrib|
|
||||||
attrib.name == "state"
|
attrib.name == "state"
|
||||||
}.map &.content
|
}.map &.content
|
||||||
when ActionType::Pop
|
when "pop"
|
||||||
@depth = xml["depth"].to_i
|
@depth = xml["depth"].to_i
|
||||||
when ActionType::Using
|
when "using"
|
||||||
@lexer_name = xml["lexer"].downcase
|
@lexer_name = xml["lexer"].downcase
|
||||||
when ActionType::Combined
|
when "combined"
|
||||||
@states = xml.attributes.select { |attrib|
|
@states = xml.attributes.select { |attrib|
|
||||||
attrib.name == "state"
|
attrib.name == "state"
|
||||||
}.map &.content
|
}.map &.content
|
||||||
@ -67,11 +57,11 @@ module Tartrazine
|
|||||||
|
|
||||||
# ameba:disable Metrics/CyclomaticComplexity
|
# ameba:disable Metrics/CyclomaticComplexity
|
||||||
def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
|
def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
|
||||||
case @type
|
case type
|
||||||
when ActionType::Token
|
when "token"
|
||||||
raise Exception.new "Can't have a token without a match" if match.empty?
|
raise Exception.new "Can't have a token without a match" if match.empty?
|
||||||
[Token.new(type: @token_type, value: String.new(match[match_group].value))]
|
[Token.new(type: @token_type, value: String.new(match[match_group].value))]
|
||||||
when ActionType::Push
|
when "push"
|
||||||
to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
|
to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
|
||||||
to_push.each do |state|
|
to_push.each do |state|
|
||||||
if state == "#pop" && lexer.state_stack.size > 1
|
if state == "#pop" && lexer.state_stack.size > 1
|
||||||
@ -83,11 +73,11 @@ module Tartrazine
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
[] of Token
|
[] of Token
|
||||||
when ActionType::Pop
|
when "pop"
|
||||||
to_pop = [@depth, lexer.state_stack.size - 1].min
|
to_pop = [@depth, lexer.state_stack.size - 1].min
|
||||||
lexer.state_stack.pop(to_pop)
|
lexer.state_stack.pop(to_pop)
|
||||||
[] of Token
|
[] of Token
|
||||||
when ActionType::Bygroups
|
when "bygroups"
|
||||||
# FIXME: handle
|
# FIXME: handle
|
||||||
# ><bygroups>
|
# ><bygroups>
|
||||||
# <token type="Punctuation"/>
|
# <token type="Punctuation"/>
|
||||||
@ -112,17 +102,17 @@ module Tartrazine
|
|||||||
result += e.emit(match, lexer, i + 1)
|
result += e.emit(match, lexer, i + 1)
|
||||||
end
|
end
|
||||||
result
|
result
|
||||||
when ActionType::Using
|
when "using"
|
||||||
# Shunt to another lexer entirely
|
# Shunt to another lexer entirely
|
||||||
return [] of Token if match.empty?
|
return [] of Token if match.empty?
|
||||||
Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), secondary: true)
|
Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
|
||||||
when ActionType::Usingself
|
when "usingself"
|
||||||
# Shunt to another copy of this lexer
|
# Shunt to another copy of this lexer
|
||||||
return [] of Token if match.empty?
|
return [] of Token if match.empty?
|
||||||
new_lexer = lexer.copy
|
new_lexer = lexer.copy
|
||||||
new_lexer.tokenize(String.new(match[match_group].value), secondary: true)
|
new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
|
||||||
when ActionType::Combined
|
when "combined"
|
||||||
# Combine two or more states into one anonymous state
|
# Combine two states into one anonymous state
|
||||||
new_state = @states.map { |name|
|
new_state = @states.map { |name|
|
||||||
lexer.states[name]
|
lexer.states[name]
|
||||||
}.reduce { |state1, state2|
|
}.reduce { |state1, state2|
|
||||||
@ -132,7 +122,7 @@ module Tartrazine
|
|||||||
lexer.state_stack << new_state.name
|
lexer.state_stack << new_state.name
|
||||||
[] of Token
|
[] of Token
|
||||||
else
|
else
|
||||||
raise Exception.new("Unknown action type: #{@type}")
|
raise Exception.new("Unknown action type: #{type}")
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
@ -31,6 +31,7 @@ module BytesRegex
|
|||||||
end
|
end
|
||||||
|
|
||||||
def match(str : Bytes, pos = 0) : Array(Match)
|
def match(str : Bytes, pos = 0) : Array(Match)
|
||||||
|
match = [] of Match
|
||||||
rc = LibPCRE2.match(
|
rc = LibPCRE2.match(
|
||||||
@re,
|
@re,
|
||||||
str,
|
str,
|
||||||
@ -41,23 +42,22 @@ module BytesRegex
|
|||||||
nil)
|
nil)
|
||||||
if rc > 0
|
if rc > 0
|
||||||
ovector = LibPCRE2.get_ovector_pointer(@match_data)
|
ovector = LibPCRE2.get_ovector_pointer(@match_data)
|
||||||
(0...rc).map do |i|
|
(0...rc).each do |i|
|
||||||
m_start = ovector[2 * i]
|
m_start = ovector[2 * i]
|
||||||
m_end = ovector[2 * i + 1]
|
m_size = ovector[2 * i + 1] - m_start
|
||||||
if m_start == m_end
|
if m_size == 0
|
||||||
m_value = Bytes.new(0)
|
m_value = Bytes.new(0)
|
||||||
else
|
else
|
||||||
m_value = str[m_start...m_end]
|
m_value = str[m_start...m_start + m_size]
|
||||||
end
|
end
|
||||||
Match.new(m_value, m_start, m_end - m_start)
|
match << Match.new(m_value, m_start, m_size)
|
||||||
end
|
end
|
||||||
else
|
|
||||||
[] of Match
|
|
||||||
end
|
end
|
||||||
|
match
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
struct Match
|
class Match
|
||||||
property value : Bytes
|
property value : Bytes
|
||||||
property start : UInt64
|
property start : UInt64
|
||||||
property size : UInt64
|
property size : UInt64
|
||||||
|
35
src/lexer.cr
35
src/lexer.cr
@ -4,6 +4,7 @@ require "./constants/lexers"
|
|||||||
module Tartrazine
|
module Tartrazine
|
||||||
class LexerFiles
|
class LexerFiles
|
||||||
extend BakedFileSystem
|
extend BakedFileSystem
|
||||||
|
|
||||||
bake_folder "../lexers", __DIR__
|
bake_folder "../lexers", __DIR__
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -43,9 +44,12 @@ module Tartrazine
|
|||||||
# For explanations on what actions and states do
|
# For explanations on what actions and states do
|
||||||
# the Pygments documentation is a good place to start.
|
# the Pygments documentation is a good place to start.
|
||||||
# https://pygments.org/docs/lexerdevelopment/
|
# https://pygments.org/docs/lexerdevelopment/
|
||||||
struct Lexer
|
class Lexer
|
||||||
property config = {
|
property config = {
|
||||||
name: "",
|
name: "",
|
||||||
|
aliases: [] of String,
|
||||||
|
filenames: [] of String,
|
||||||
|
mime_types: [] of String,
|
||||||
priority: 0.0,
|
priority: 0.0,
|
||||||
case_insensitive: false,
|
case_insensitive: false,
|
||||||
dot_all: false,
|
dot_all: false,
|
||||||
@ -60,49 +64,49 @@ module Tartrazine
|
|||||||
new_lexer = Lexer.new
|
new_lexer = Lexer.new
|
||||||
new_lexer.config = config
|
new_lexer.config = config
|
||||||
new_lexer.states = states
|
new_lexer.states = states
|
||||||
new_lexer.state_stack = ["root"]
|
new_lexer.state_stack = state_stack[0..-1]
|
||||||
new_lexer
|
new_lexer
|
||||||
end
|
end
|
||||||
|
|
||||||
# Turn the text into a list of tokens. The `secondary` parameter
|
# Turn the text into a list of tokens. The `usingself` parameter
|
||||||
# is true when the lexer is being used to tokenize a string
|
# is true when the lexer is being used to tokenize a string
|
||||||
# from a larger text that is already being tokenized.
|
# from a larger text that is already being tokenized.
|
||||||
# So, when it's true, we don't modify the text.
|
# So, when it's true, we don't modify the text.
|
||||||
def tokenize(text : String, secondary = false) : Array(Token)
|
def tokenize(text : String, usingself = false) : Array(Token)
|
||||||
@state_stack = ["root"]
|
@state_stack = ["root"]
|
||||||
tokens = [] of Token
|
tokens = [] of Token
|
||||||
pos = 0
|
pos = 0
|
||||||
matched = false
|
matched = false
|
||||||
|
|
||||||
# Respect the `ensure_nl` config option
|
# Respect the `ensure_nl` config option
|
||||||
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !secondary
|
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
|
||||||
text += "\n"
|
text += "\n"
|
||||||
end
|
end
|
||||||
|
|
||||||
# We operate in bytes from now on
|
|
||||||
text_bytes = text.to_slice
|
text_bytes = text.to_slice
|
||||||
# Loop through the text, matching rules
|
# Loop through the text, applying rules
|
||||||
while pos < text_bytes.size
|
while pos < text_bytes.size
|
||||||
states[@state_stack.last].rules.each do |rule|
|
state = states[@state_stack.last]
|
||||||
|
# Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
|
||||||
|
state.rules.each do |rule|
|
||||||
matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
|
matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
|
||||||
if matched
|
if matched
|
||||||
# Move position forward, save the tokens
|
# Move position forward, save the tokens,
|
||||||
|
# tokenize from the new position
|
||||||
pos = new_pos
|
pos = new_pos
|
||||||
tokens += new_tokens
|
tokens += new_tokens
|
||||||
# Start matching rules at new position
|
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
if !matched
|
# If no rule matches, emit an error token
|
||||||
# at EOL, emit the newline, reset state to "root"
|
unless matched
|
||||||
if text_bytes[pos] == 10u8
|
if text_bytes[pos] == 10u8
|
||||||
|
# at EOL, reset state to "root"
|
||||||
tokens << {type: "Text", value: "\n"}
|
tokens << {type: "Text", value: "\n"}
|
||||||
@state_stack = ["root"]
|
@state_stack = ["root"]
|
||||||
else
|
else
|
||||||
# Emit an error token
|
|
||||||
tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
|
tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
|
||||||
end
|
end
|
||||||
# Move forward 1
|
|
||||||
pos += 1
|
pos += 1
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@ -166,6 +170,9 @@ module Tartrazine
|
|||||||
if config
|
if config
|
||||||
l.config = {
|
l.config = {
|
||||||
name: xml_to_s(config, name) || "",
|
name: xml_to_s(config, name) || "",
|
||||||
|
aliases: xml_to_a(config, _alias) || [] of String,
|
||||||
|
filenames: xml_to_a(config, filename) || [] of String,
|
||||||
|
mime_types: xml_to_a(config, mime_type) || [] of String,
|
||||||
priority: xml_to_f(config, priority) || 0.0,
|
priority: xml_to_f(config, priority) || 0.0,
|
||||||
not_multiline: xml_to_s(config, not_multiline) == "true",
|
not_multiline: xml_to_s(config, not_multiline) == "true",
|
||||||
dot_all: xml_to_s(config, dot_all) == "true",
|
dot_all: xml_to_s(config, dot_all) == "true",
|
||||||
|
12
src/rules.cr
12
src/rules.cr
@ -19,7 +19,7 @@ module Tartrazine
|
|||||||
abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||||
abstract def initialize(node : XML::Node)
|
abstract def initialize(node : XML::Node)
|
||||||
|
|
||||||
@actions : Array(Action) = [] of Action
|
property actions : Array(Action) = [] of Action
|
||||||
|
|
||||||
def add_actions(node : XML::Node)
|
def add_actions(node : XML::Node)
|
||||||
node.children.each do |child|
|
node.children.each do |child|
|
||||||
@ -31,13 +31,14 @@ module Tartrazine
|
|||||||
|
|
||||||
struct Rule < BaseRule
|
struct Rule < BaseRule
|
||||||
property pattern : Regex = Regex.new ""
|
property pattern : Regex = Regex.new ""
|
||||||
|
property actions : Array(Action) = [] of Action
|
||||||
|
|
||||||
def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||||
match = pattern.match(text, pos)
|
match = pattern.match(text, pos)
|
||||||
|
|
||||||
# No match
|
# No match
|
||||||
return false, pos, [] of Token if match.size == 0
|
return false, pos, [] of Token if match.size == 0
|
||||||
return true, pos + match[0].size, @actions.flat_map(&.emit(match, lexer))
|
return true, pos + match[0].size, actions.flat_map { |action| action.emit(match, lexer) }
|
||||||
end
|
end
|
||||||
|
|
||||||
def initialize(node : XML::Node)
|
def initialize(node : XML::Node)
|
||||||
@ -54,10 +55,11 @@ module Tartrazine
|
|||||||
# This rule includes another state. If any of the rules of the
|
# This rule includes another state. If any of the rules of the
|
||||||
# included state matches, this rule matches.
|
# included state matches, this rule matches.
|
||||||
struct IncludeStateRule < BaseRule
|
struct IncludeStateRule < BaseRule
|
||||||
@state : String = ""
|
property state : String = ""
|
||||||
|
|
||||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||||
lexer.states[@state].rules.each do |rule|
|
Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
|
||||||
|
lexer.states[state].rules.each do |rule|
|
||||||
matched, new_pos, new_tokens = rule.match(text, pos, lexer)
|
matched, new_pos, new_tokens = rule.match(text, pos, lexer)
|
||||||
return true, new_pos, new_tokens if matched
|
return true, new_pos, new_tokens if matched
|
||||||
end
|
end
|
||||||
@ -78,7 +80,7 @@ module Tartrazine
|
|||||||
NO_MATCH = [] of Match
|
NO_MATCH = [] of Match
|
||||||
|
|
||||||
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
|
||||||
return true, pos, @actions.flat_map(&.emit(NO_MATCH, lexer))
|
return true, pos, actions.flat_map { |action| action.emit(NO_MATCH, lexer) }
|
||||||
end
|
end
|
||||||
|
|
||||||
def initialize(node : XML::Node)
|
def initialize(node : XML::Node)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user