Rebase to main

Roberto Alsina 2024-08-24 19:59:05 -03:00
parent 5a88a51f3e
commit 58fd42d936
5 changed files with 63 additions and 88 deletions

View File

@@ -9,4 +9,7 @@
 * ✅ Implement lexer loader by file extension
 * ✅ Add --line-numbers to terminal formatter
 * Implement lexer loader by mime type
-* Implement Pygment's "DelegateLexer"
+* ✅ Implement Delegating lexers
+* ✅ Add RstLexer
+* Add Mako template lexer
+* Implement heuristic lexer detection

View File

@@ -23,14 +23,17 @@ module Tartrazine
   struct Action
     property actions : Array(Action) = [] of Action
-    property token_type : String = ""
-    property states_to_push : Array(String) = [] of String
-    property depth = 0
-    property lexer_name : String = ""
-    property states_to_combine : Array(String) = [] of String
+    @content_index : Array(Int32) = [] of Int32
+    @depth : Int32 = 0
+    @lexer_index : Int32 = 0
+    @lexer_name : String = ""
+    @states : Array(String) = [] of String
+    @states_to_push : Array(String) = [] of String
+    @token_type : String = ""
+    @type : ActionType = ActionType::Token

-    def initialize(@type : String, @xml : XML::Node?)
-      # Extract information from the XML node we will use later
+    def initialize(t : String, xml : XML::Node?)
+      @type = ActionType.parse(t.capitalize)
       # Some actions may have actions in them, like this:
       # <bygroups>
@@ -41,31 +44,30 @@ module Tartrazine
       #
       # The token actions match with the first 2 groups in the regex
       # the using action matches the 3rd and shunts it to another lexer
-      known_types = %w(token push pop bygroups using usingself include combined)
-      raise Exception.new(
-        "Unknown action type: #{@type}") unless known_types.includes? @type
-      @xml.children.each do |node|
+      xml.children.each do |node|
         next unless node.element?
         @actions << Action.new(node.name, node)
       end

+      # Prefetch the attributes we need from the XML and keep them
       case @type
-      when "token"
-        @token_type = xml["type"]? || ""
-      when "push"
+      when ActionType::Token
+        @token_type = xml["type"]
+      when ActionType::Push
         @states_to_push = xml.attributes.select { |attrib|
           attrib.name == "state"
-        }.map &.content || [] of String
-      when "pop"
-        @depth = xml["depth"]?.try &.to_i || 0
-      when "using"
-        @lexer_name = xml["lexer"]?.try &.downcase || ""
-      when "combined"
-        @states_to_combine = xml.attributes.select { |attrib|
+        }.map &.content
+      when ActionType::Pop
+        @depth = xml["depth"].to_i
+      when ActionType::Using
+        @lexer_name = xml["lexer"].downcase
+      when ActionType::Combined
+        @states = xml.attributes.select { |attrib|
           attrib.name == "state"
         }.map &.content
+      when ActionType::Usingbygroup
+        @lexer_index = xml["lexer"].to_i
+        @content_index = xml["content"].split(",").map(&.to_i)
       end
     end
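
The @type dispatch above leans on Crystal's built-in Enum.parse, which matches member names case-insensitively. A minimal sketch of the mechanism, with a hypothetical ActionType definition whose member list is only inferred from the case branches in this diff (the real enum is not shown in this commit):

    # Hypothetical enum; members inferred from the branches above.
    enum ActionType
      Bygroups
      Combined
      Include
      Pop
      Push
      Token
      Using
      Usingbygroup
      Usingself
    end

    # Enum.parse matches member names case-insensitively, so capitalizing
    # the XML action name is enough to reach the right member.
    puts ActionType.parse("token".capitalize)      # => Token
    puts ActionType.parse("usingself".capitalize)  # => Usingself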
@@ -75,25 +77,21 @@ module Tartrazine
       when ActionType::Token
         raise Exception.new "Can't have a token without a match" if match.empty?
         [Token.new(type: @token_type, value: String.new(match[match_group].value))]
-      when "push"
-        if @states_to_push.empty?
-          # Push without a state means push the current state
-          @states_to_push = [lexer.state_stack.last]
-        end
-        @states_to_push.each do |state|
-          if state == "#pop"
+      when ActionType::Push
+        to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
+        to_push.each do |state|
+          if state == "#pop" && tokenizer.state_stack.size > 1
             # Pop the state
-            lexer.state_stack.pop
+            tokenizer.state_stack.pop
           else
             # Really push
-            lexer.state_stack << state
+            tokenizer.state_stack << state
           end
         end
         [] of Token
-      when "pop"
-        if lexer.state_stack.size > @depth
-          lexer.state_stack.pop(@depth)
-        end
+      when ActionType::Pop
+        to_pop = [@depth, tokenizer.state_stack.size - 1].min
+        tokenizer.state_stack.pop(to_pop)
         [] of Token
       when ActionType::Bygroups
         # FIXME: handle
@@ -104,7 +102,7 @@ module Tartrazine
         #
         # where that None means skipping a group
         #
-        raise Exception.new "Can't have a token without a match" if match.empty?
+        raise Exception.new "Can't have a token without a match" if match.nil?
         # Each group matches an action. If the group match is empty,
         # the action is skipped.
@@ -113,7 +111,8 @@ module Tartrazine
           begin
             next if match[i + 1].size == 0
           rescue IndexError
-            # No match for the last group
+            # FIXME: This should not actually happen
+            # No match for this group
             next
           end
           result += e.emit(match, tokenizer, i + 1)
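
For a plain-Crystal illustration of the bygroups convention this loop implements, here is a sketch with invented group values and token types: action i consumes capture group i + 1, and a group that did not participate in the match is skipped rather than emitting an empty token.

    groups = ["def", nil, "foo"] # group 2 did not participate in the match
    types = ["keyword", "none", "name.function"]
    tokens = [] of {String, String}
    types.each_with_index do |type, i|
      value = groups[i]?
      next unless value        # group absent: skip the action
      next if value.empty?     # group matched empty: also skip
      tokens << {type, value}
    end
    puts tokens # => [{"keyword", "def"}, {"name.function", "foo"}]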
@@ -122,16 +121,19 @@ module Tartrazine
       when ActionType::Using
         # Shunt to another lexer entirely
         return [] of Token if match.empty?
-        Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
-      when "usingself"
+        Tartrazine.lexer(@lexer_name).tokenizer(
+          String.new(match[match_group].value),
+          secondary: true).to_a
+      when ActionType::Usingself
         # Shunt to another copy of this lexer
         return [] of Token if match.empty?
-        new_lexer = Lexer.from_xml(lexer.xml)
-        new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
-      when "combined"
-        # Combine two states into one anonymous state
-        new_state = @states_to_combine.map { |name|
-          lexer.states[name]
+        tokenizer.lexer.tokenizer(
+          String.new(match[match_group].value),
+          secondary: true).to_a
+      when ActionType::Combined
+        # Combine two or more states into one anonymous state
+        new_state = @states.map { |name|
+          tokenizer.lexer.states[name]
         }.reduce { |state1, state2|
           state1 + state2
         }
@@ -149,7 +151,7 @@ module Tartrazine
           content,
           secondary: true).to_a
       else
-        raise Exception.new("Unhandled action type: #{type}")
+        raise Exception.new("Unknown action type: #{@type}")
       end
     end
   end
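
One behavioral detail worth noting in the Push and Pop branches above: the state stack is now guarded so the root state can never be popped off. A standalone sketch of the clamping arithmetic, with invented stack contents:

    # Why the pop depth is clamped: popping past the root state would leave
    # an empty stack and break all subsequent rule matching.
    state_stack = ["root", "string", "interpolation"]
    depth = 5 # a lexer definition may request more pops than the stack can give
    to_pop = [depth, state_stack.size - 1].min
    state_stack.pop(to_pop) # Array#pop(n) drops up to n elements from the end
    puts state_stack # => ["root"]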

View File

@@ -31,7 +31,6 @@ module BytesRegex
     end

     def match(str : Bytes, pos = 0) : Array(Match)
-      match = [] of Match
       rc = LibPCRE2.match(
         @re,
         str,
@@ -40,9 +39,9 @@ module BytesRegex
         LibPCRE2::NO_UTF_CHECK,
         @match_data,
         nil)
-      if rc >= 0
+      if rc > 0
         ovector = LibPCRE2.get_ovector_pointer(@match_data)
-        (0...rc).each do |i|
+        (0...rc).map do |i|
           m_start = ovector[2 * i]
           m_end = ovector[2 * i + 1]
           if m_start == m_end
@@ -55,7 +54,6 @@ module BytesRegex
       else
         [] of Match
       end
-      match
     end
   end
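
Context for the rc > 0 tightening above: with PCRE2, a positive return from the match call is (roughly) the count of usable start/end offset pairs in the ovector, zero means the ovector was too small, and negative values signal no-match or an error, so only the positive case has pairs worth reading. An illustrative sketch of the pair layout, with invented offsets:

    ovector = [0_u64, 5_u64, 0_u64, 2_u64] # pair 0 = whole match, pair 1 = group 1
    rc = 2                                 # as if the match call reported two pairs
    matches = (0...rc).map do |i|
      {start: ovector[2 * i], size: ovector[2 * i + 1] - ovector[2 * i]}
    end
    puts matches # => [{start: 0, size: 5}, {start: 0, size: 2}]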

View File

@@ -1,18 +1,6 @@
 require "docopt"
 require "./**"

-# Performance data (in milliseconds):
-#
-# Docopt parsing: 0.5
-# Instantiating a theme: 0.1
-# Instantiating a formatter: 1.0
-# Instantiating a lexer: 2.0
-# Tokenizing crycco.cr: 16.0
-# Formatting: 0.5
-# I/O: 1.5
-# ---------------------------------
-# Total: 21.6

 HELP = <<-HELP
 tartrazine: a syntax highlighting tool
@@ -96,6 +84,7 @@ if options["-f"]
 end
 lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String))
+input = File.open(options["FILE"].as(String)).gets_to_end
 if options["-o"].nil?
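
For readers unfamiliar with the option handling, this file follows the usual docopt.cr pattern: the HELP heredoc is the grammar, and Docopt.docopt returns a Hash of parsed values, which is why the .as(String) casts appear above. A stripped-down sketch with an invented usage string (not the tool's real one):

    require "docopt"

    HELP = <<-HELP
    tartrazine: a syntax highlighting tool

    Usage:
      tartrazine FILE -l LEXER [-o OUTPUT]
    HELP

    options = Docopt.docopt(HELP, ARGV)
    input = File.open(options["FILE"].as(String)).gets_to_end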

View File

@@ -15,28 +15,11 @@ module Tartrazine
   alias Match = BytesRegex::Match
   alias MatchData = Array(Match)

-  class Rule
-    property pattern : Regex = Regex.new ""
-    property actions : Array(Action) = [] of Action
-
-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      match = pattern.match(text, pos)
-      # We don't match if the match doesn't move the cursor
-      # because that causes infinite loops
-      return false, pos, [] of Token if match.empty? || match[0].size == 0
-      tokens = [] of Token
-      actions.each do |action|
-        tokens += action.emit(match, lexer)
-      end
-      return true, pos + match[0].size, tokens
-    end
-
-    def initialize(node : XML::Node, multiline, dotall, ignorecase)
-      pattern = node["pattern"]
-      pattern = "(?m)" + pattern if multiline
-      @pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
-      add_actions(node)
-    end
+  abstract struct BaseRule
+    abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
+    abstract def initialize(node : XML::Node)
+
+    @actions : Array(Action) = [] of Action

     def add_actions(node : XML::Node)
       node.children.each do |child|
@@ -73,9 +56,9 @@ module Tartrazine
   struct IncludeStateRule < BaseRule
     @state : String = ""

-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      lexer.states[state].rules.each do |rule|
-        matched, new_pos, new_tokens = rule.match(text, pos, lexer)
+    def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
+      tokenizer.@lexer.states[@state].rules.each do |rule|
+        matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
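
With the old concrete Rule gone, every rule type now implements the abstract pair declared on BaseRule. A hypothetical minimal subclass, shown only to illustrate the contract; the invariant the removed code commented on still applies: a rule must not report a match unless it advances pos, or tokenizing would loop forever.

    # A do-nothing rule: never matches, never advances pos.
    struct NeverMatchRule < BaseRule
      def initialize(node : XML::Node)
        add_actions(node)
      end

      def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
        return false, pos, [] of Token
      end
    end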