Mirror of https://github.com/ralsina/tartrazine.git
(synced 2024-11-10 05:22:23 +00:00)

Tokenize via an iterator, makes everything much faster

Commit 7538fc76aa (parent: 788577b226)
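
In short: the eager `Lexer#tokenize`, which built the full token array in one call, is replaced by a `Tokenizer` struct that implements `Iterator(Token)` and yields tokens on demand. A minimal before/after sketch of the call sites, based on the hunks below (the "crystal" lexer name and the sample source are assumptions for illustration):

    require "tartrazine"

    lexer  = Tartrazine.lexer("crystal") # lexer name is illustrative
    source = %(puts "hello"\n)

    # Before this commit: one eager call that returned Array(Token)
    # tokens = lexer.tokenize(source)

    # After this commit: an iterator that produces tokens lazily...
    tokenizer = Tartrazine::Tokenizer.new(lexer, source)

    # ...which can still be materialized and collapsed into the old shape
    tokens = Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)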
@@ -73,7 +73,8 @@ end
 # Helper that creates lexer and tokenizes
 def tokenize(lexer_name, text)
   lexer = Tartrazine.lexer(lexer_name)
-  lexer.tokenize(text)
+  tokenizer = Tartrazine::Tokenizer.new(lexer, text)
+  Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
 end

 # Helper that tokenizes using chroma to validate the lexer
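
The helper keeps its signature, so existing specs run unchanged. A hedged sketch of how a spec might exercise it (the lexer name and the assertion are illustrative, not from this commit):

    it "tokenizes via the iterator-backed helper" do
      tokens = tokenize("crystal", "puts 1\n")
      tokens.should_not be_empty
      # collapse_tokens merges runs of same-typed tokens, so neighbors differ
      tokens.each_cons_pair do |a, b|
        a[:type].should_not eq(b[:type])
      end
    end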
@@ -66,26 +66,26 @@ module Tartrazine
   end

   # ameba:disable Metrics/CyclomaticComplexity
-  def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
+  def emit(match : MatchData, tokenizer : Tokenizer, match_group = 0) : Array(Token)
     case @type
     when ActionType::Token
       raise Exception.new "Can't have a token without a match" if match.empty?
       [Token.new(type: @token_type, value: String.new(match[match_group].value))]
     when ActionType::Push
-      to_push = @states_to_push.empty? ? [lexer.state_stack.last] : @states_to_push
+      to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
       to_push.each do |state|
-        if state == "#pop" && lexer.state_stack.size > 1
+        if state == "#pop" && tokenizer.state_stack.size > 1
           # Pop the state
-          lexer.state_stack.pop
+          tokenizer.state_stack.pop
         else
           # Really push
-          lexer.state_stack << state
+          tokenizer.state_stack << state
         end
       end
       [] of Token
     when ActionType::Pop
-      to_pop = [@depth, lexer.state_stack.size - 1].min
-      lexer.state_stack.pop(to_pop)
+      to_pop = [@depth, tokenizer.state_stack.size - 1].min
+      tokenizer.state_stack.pop(to_pop)
       [] of Token
     when ActionType::Bygroups
       # FIXME: handle
@@ -109,27 +109,32 @@ module Tartrazine
           # No match for this group
           next
         end
-        result += e.emit(match, lexer, i + 1)
+        result += e.emit(match, tokenizer, i + 1)
       end
       result
     when ActionType::Using
       # Shunt to another lexer entirely
       return [] of Token if match.empty?
-      Tartrazine.lexer(@lexer_name).tokenize(String.new(match[match_group].value), secondary: true)
+      Tokenizer.new(
+        Tartrazine.lexer(@lexer_name),
+        String.new(match[match_group].value),
+        secondary: true).to_a
     when ActionType::Usingself
       # Shunt to another copy of this lexer
       return [] of Token if match.empty?
-      new_lexer = lexer.copy
-      new_lexer.tokenize(String.new(match[match_group].value), secondary: true)
+      Tokenizer.new(
+        tokenizer.lexer,
+        String.new(match[match_group].value),
+        secondary: true).to_a
     when ActionType::Combined
       # Combine two or more states into one anonymous state
       new_state = @states.map { |name|
-        lexer.states[name]
+        tokenizer.lexer.states[name]
       }.reduce { |state1, state2|
         state1 + state2
       }
-      lexer.states[new_state.name] = new_state
-      lexer.state_stack << new_state.name
+      tokenizer.lexer.states[new_state.name] = new_state
+      tokenizer.state_stack << new_state.name
       [] of Token
     else
       raise Exception.new("Unknown action type: #{@type}")
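
The state handling that `emit` performs is unchanged; only its home moved from `Lexer` to `Tokenizer`. The push/pop discipline, modeled in isolation (plain Crystal, no Tartrazine types; state names are made up):

    state_stack = ["root"]

    ["string", "interp", "#pop", "#pop"].each do |state|
      if state == "#pop" && state_stack.size > 1
        state_stack.pop # "#pop" drops the top state, but never "root"
      else
        state_stack << state # anything else is pushed
      end
      puts "#{state.ljust(7)} -> #{state_stack}"
    end
    # string  -> ["root", "string"]
    # interp  -> ["root", "string", "interp"]
    # #pop    -> ["root", "string"]
    # #pop    -> ["root"]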
@@ -7,19 +7,39 @@ module Tartrazine
   def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
   end

+  private def line_label(i : Int32) : String
+    "#{i + 1}".rjust(4).ljust(5)
+  end
+
   def format(text : String, lexer : Lexer) : String
+    tokenizer = Tokenizer.new(lexer, text)
+    i = 0
     output = String.build do |outp|
-      lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
-        label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
-        outp << label
-        line.each do |token|
+      outp << line_label(i) if line_numbers?
+      tokenizer.each do |token|
         outp << colorize(token[:value], token[:type])
-        end
+        if token[:value].includes?("\n")
+          i += 1
+          outp << line_label(i) if line_numbers?
+        end
       end
     end
     output
   end

+  # def format(text : String, lexer : Lexer) : String
+  #   output = String.build do |outp|
+  #     lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
+  #       label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
+  #       outp << label
+  #       line.each do |token|
+  #         outp << colorize(token[:value], token[:type])
+  #       end
+  #     end
+  #   end
+  #   output
+  # end
+
   def colorize(text : String, token : String) : String
     style = theme.styles.fetch(token, nil)
     return text if style.nil?
@@ -54,21 +54,29 @@ module Tartrazine
     output
   end

+  private def line_label(i : Int32) : String
+    line_label = "#{i + 1}".rjust(4).ljust(5)
+    line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
+    line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
+    "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
+  end
+
   def format_text(text : String, lexer : Lexer) : String
-    lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
+    # lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
+    tokenizer = Tokenizer.new(lexer, text)
+    i = 0
     output = String.build do |outp|
       if surrounding_pre?
         pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
         outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
       end
       outp << "<code class=\"#{get_css_class("Background")}\">"
-      lines.each_with_index(offset: line_number_start - 1) do |line, i|
-        line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
-        line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
-        line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
-        outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
-        line.each do |token|
+      outp << line_label(i) if line_numbers?
+      tokenizer.each do |token|
         outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
-        end
+        if token[:value].ends_with? "\n"
+          i += 1
+          outp << line_label(i) if line_numbers?
+        end
       end
       outp << "</code></pre>"
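
Both formatters now share the same streaming pattern: instead of pre-grouping tokens into lines with `group_tokens_in_lines`, they emit a line label whenever a token's value carries a newline (which `Tokenizer#split_tokens`, shown below in src/lexer.cr, guarantees sits at the end of a value). The pattern in isolation, with made-up tokens:

    tokens = [
      {type: "Keyword", value: "puts"},
      {type: "Text", value: " "},
      {type: "LiteralString", value: %("hi"\n)},
      {type: "Name", value: "exit"},
    ]

    i = 0
    label = ->(n : Int32) { "#{n + 1}".rjust(4).ljust(5) }

    output = String.build do |outp|
      outp << label.call(i)
      tokens.each do |token|
        outp << token[:value] # a real formatter would colorize/escape here
        if token[:value].includes?("\n")
          i += 1
          outp << label.call(i)
        end
      end
    end
    print output
    #    1 puts "hi"
    #    2 exit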
src/lexer.cr (152 lines changed)
@@ -37,6 +37,75 @@ module Tartrazine
     LEXERS_BY_NAME.keys.sort!
   end

+  # A token, the output of the tokenizer
+  alias Token = NamedTuple(type: String, value: String)
+
+  struct Tokenizer
+    include Iterator(Token)
+    property lexer : Lexer
+    property text : Bytes
+    property pos : Int32 = 0
+    @dq = Deque(Token).new
+    property state_stack = ["root"]
+
+    def initialize(@lexer : Lexer, text : String, secondary = false)
+      # Respect the `ensure_nl` config option
+      if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
+        text += "\n"
+      end
+      @text = text.to_slice
+    end
+
+    def next : Iterator::Stop | Token
+      if @dq.size > 0
+        return @dq.shift
+      end
+      if pos == @text.size
+        return stop
+      end
+
+      matched = false
+      while @pos < @text.size
+        @lexer.states[@state_stack.last].rules.each do |rule|
+          matched, new_pos, new_tokens = rule.match(@text, @pos, self)
+          if matched
+            @pos = new_pos
+            split_tokens(new_tokens).each { |token| @dq << token }
+            break
+          end
+        end
+        if !matched
+          if @text[@pos] == 10u8
+            @dq << {type: "Text", value: "\n"}
+            @state_stack = ["root"]
+          else
+            @dq << {type: "Error", value: String.new(@text[@pos..@pos])}
+          end
+          @pos += 1
+          break
+        end
+      end
+      self.next
+    end
+
+    # If a token contains a newline, split it into two tokens
+    def split_tokens(tokens : Array(Token)) : Array(Token)
+      split_tokens = [] of Token
+      tokens.each do |token|
+        if token[:value].includes?("\n")
+          values = token[:value].split("\n")
+          values.each_with_index do |value, index|
+            value += "\n" if index < values.size - 1
+            split_tokens << {type: token[:type], value: value}
+          end
+        else
+          split_tokens << token
+        end
+      end
+      split_tokens
+    end
+  end
+
   # This implements a lexer for Pygments RegexLexers as expressed
   # in Chroma's XML serialization.
   #
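
Because `Tokenizer` includes `Iterator(Token)`, consumers get Crystal's lazy iterator toolkit for free, which is where the speedup comes from: nothing builds the whole token array unless a caller asks for it. A usage sketch (the file path and lexer name are assumptions):

    lexer  = Tartrazine.lexer("crystal")
    source = File.read("src/lexer.cr")

    # Only the first ten tokens are lexed; the rest of the file is untouched.
    first_ten = Tartrazine::Tokenizer.new(lexer, source).first(10).to_a

    # Iterator adapters compose lazily, so this never builds the full array.
    keywords = Tartrazine::Tokenizer.new(lexer, source)
      .select { |t| t[:type].starts_with?("Keyword") }
      .first(5)
      .to_a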
@@ -52,62 +121,7 @@ module Tartrazine
       not_multiline: false,
       ensure_nl: false,
     }
     # property xml : String = ""
     property states = {} of String => State
-    property state_stack = ["root"]
-
-    def copy : Lexer
-      new_lexer = Lexer.new
-      new_lexer.config = config
-      new_lexer.states = states
-      new_lexer.state_stack = ["root"]
-      new_lexer
-    end
-
-    # Turn the text into a list of tokens. The `secondary` parameter
-    # is true when the lexer is being used to tokenize a string
-    # from a larger text that is already being tokenized.
-    # So, when it's true, we don't modify the text.
-    def tokenize(text : String, secondary = false) : Array(Token)
-      @state_stack = ["root"]
-      tokens = [] of Token
-      pos = 0
-      matched = false
-
-      # Respect the `ensure_nl` config option
-      if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !secondary
-        text += "\n"
-      end
-
-      # We operate in bytes from now on
-      text_bytes = text.to_slice
-      # Loop through the text, matching rules
-      while pos < text_bytes.size
-        states[@state_stack.last].rules.each do |rule|
-          matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
-          if matched
-            # Move position forward, save the tokens
-            pos = new_pos
-            tokens += new_tokens
-            # Start matching rules at new position
-            break
-          end
-        end
-        if !matched
-          # at EOL, emit the newline, reset state to "root"
-          if text_bytes[pos] == 10u8
-            tokens << {type: "Text", value: "\n"}
-            @state_stack = ["root"]
-          else
-            # Emit an error token
-            tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
-          end
-          # Move forward 1
-          pos += 1
-        end
-      end
-      Lexer.collapse_tokens(tokens)
-    end
-
     # Collapse consecutive tokens of the same type for easier comparison
     # and smaller output
@@ -131,31 +145,6 @@ module Tartrazine
       result
     end

-    # Group tokens into lines, splitting them when a newline is found
-    def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
-      split_tokens = [] of Token
-      tokens.each do |token|
-        if token[:value].includes?("\n")
-          values = token[:value].split("\n")
-          values.each_with_index do |value, index|
-            value += "\n" if index < values.size - 1
-            split_tokens << {type: token[:type], value: value}
-          end
-        else
-          split_tokens << token
-        end
-      end
-      lines = [Array(Token).new]
-      split_tokens.each do |token|
-        lines.last << token
-        if token[:value].includes?("\n")
-          lines << Array(Token).new
-        end
-      end
-      lines
-    end
-
     # ameba:disable Metrics/CyclomaticComplexity
     def self.from_xml(xml : String) : Lexer
       l = Lexer.new
       lexer = XML.parse(xml).first_element_child
@@ -229,7 +218,4 @@ module Tartrazine
       new_state
     end
   end
-
-  # A token, the output of the tokenizer
-  alias Token = NamedTuple(type: String, value: String)
 end
src/rules.cr (16 lines changed)
@@ -16,7 +16,7 @@ module Tartrazine
   alias MatchData = Array(Match)

   abstract struct BaseRule
-    abstract def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
     abstract def initialize(node : XML::Node)

     @actions : Array(Action) = [] of Action
@@ -32,12 +32,12 @@ module Tartrazine
   struct Rule < BaseRule
     property pattern : Regex = Regex.new ""

-    def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
+    def match(text : Bytes, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
       match = pattern.match(text, pos)

       # No match
       return false, pos, [] of Token if match.size == 0
-      return true, pos + match[0].size, @actions.flat_map(&.emit(match, lexer))
+      return true, pos + match[0].size, @actions.flat_map(&.emit(match, tokenizer))
     end

     def initialize(node : XML::Node)
@@ -56,9 +56,9 @@ module Tartrazine
   struct IncludeStateRule < BaseRule
     @state : String = ""

-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      lexer.states[@state].rules.each do |rule|
-        matched, new_pos, new_tokens = rule.match(text, pos, lexer)
+    def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
+      tokenizer.@lexer.states[@state].rules.each do |rule|
+        matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
         return true, new_pos, new_tokens if matched
       end
       return false, pos, [] of Token
@@ -77,8 +77,8 @@ module Tartrazine
   struct UnconditionalRule < BaseRule
     NO_MATCH = [] of Match

-    def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
-      return true, pos, @actions.flat_map(&.emit(NO_MATCH, lexer))
+    def match(text, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
+      return true, pos, @actions.flat_map(&.emit(NO_MATCH, tokenizer))
     end

     def initialize(node : XML::Node)
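
All three rule types keep the same return protocol; only the collaborator changed from `lexer` to `tokenizer`. A standalone sketch of that `{matched, new_pos, tokens}` contract (the literal-matching rule is invented for illustration):

    alias Token = NamedTuple(type: String, value: String)

    # Invented rule: matches a literal byte sequence at `pos`. Like the real
    # rules, it reports {matched?, new position, tokens} and never raises.
    def match_literal(text : Bytes, pos : Int32, literal : String) : Tuple(Bool, Int32, Array(Token))
      bytes = literal.to_slice
      return {false, pos, [] of Token} if pos + bytes.size > text.size
      return {false, pos, [] of Token} unless text[pos, bytes.size] == bytes
      {true, pos + bytes.size, [{type: "Keyword", value: literal}]}
    end

    text = "def foo".to_slice
    matched, new_pos, tokens = match_literal(text, 0, "def")
    p({matched, new_pos, tokens}) # => {true, 3, [{type: "Keyword", value: "def"}]}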
@@ -9,7 +9,7 @@ require "xml"
 module Tartrazine
   alias Color = Sixteen::Color

-  class ThemeFiles
+  struct ThemeFiles
     extend BakedFileSystem
     bake_folder "../styles", __DIR__
   end
@@ -39,7 +39,7 @@ module Tartrazine
     themes.to_a.sort!
   end

-  class Style
+  struct Style
     # These properties are tri-state.
     # true means it's set
     # false means it's not set
@@ -79,7 +79,7 @@ module Tartrazine
     end
   end

-  class Theme
+  struct Theme
     property name : String = ""

     property styles = {} of String => Style
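
The `class` → `struct` changes turn these theme types into value types, avoiding heap allocation. The trade-off, in a generic Crystal example (not Tartrazine code): struct assignment copies, class assignment aliases.

    struct SPoint
      property x = 0
    end

    class CPoint
      property x = 0
    end

    s1 = SPoint.new
    s2 = s1   # copies the value
    s2.x = 5
    puts s1.x # => 0

    c1 = CPoint.new
    c2 = c1   # aliases the same object
    c2.x = 5
    puts c1.x # => 5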