23 Commits

Author SHA1 Message Date
27008640a6 v0.4.0 2024-08-14 13:25:39 -03:00
7db8fdc9e4 Updated README 2024-08-14 13:25:20 -03:00
ad664d9f93 Added error handling 2024-08-14 11:24:25 -03:00
0626c8619f Working bytes-regexes, faster, MORE tests pass 2024-08-14 11:06:53 -03:00
3725201f8a Merge branch 'main' of github.com:ralsina/tartrazine 2024-08-14 09:25:08 -03:00
6f64b76c44 lint 2024-08-13 22:07:23 -03:00
5218af6855 lint 2024-08-13 22:06:19 -03:00
c898f395a1 reset stack on EOL instead of error, makes no difference, but it's in pygments version 2024-08-13 22:06:07 -03:00
56e49328fb Tiny bug 2024-08-13 21:00:00 -03:00
8d7faf2098 0.3.0 2024-08-13 11:06:06 -03:00
2e87762f1b API changes to make it nicer
These are incompatible, tho.

* Theme is now a property of the formatter instead
  of passing it around
* get_style_defs is now style_defs
2024-08-13 10:57:02 -03:00
88f5674917 Tiny bug 2024-08-12 21:02:17 -03:00
ce6f3d29b5 Remove Re2 hack 2024-08-12 19:01:13 -03:00
46d6d3f467 Make how-heavy-is-bold configurable 2024-08-12 10:55:58 -03:00
78ddc69937 Merge branch 'main' of github.com:ralsina/tartrazine 2024-08-12 10:11:03 -03:00
b1ad7b64c0 oops 2024-08-12 10:10:51 -03:00
cbedf8a8db Bump to 0.2.0 2024-08-11 13:24:30 -03:00
ec8c53c823 Added --line-numbers for the terminal formatter 2024-08-11 13:21:47 -03:00
e3a1ce37b4 Support guessing lexer by filename 2024-08-11 13:04:35 -03:00
b4f38e00e1 Script to generate lexer metadata constants 2024-08-11 12:41:22 -03:00
08daabe1c3 Cleanup token abbreviation generation script 2024-08-11 12:06:02 -03:00
e8d405fc99 Implemented decent version of the CLI 2024-08-11 11:54:00 -03:00
e295256573 Implemented decent version of the CLI 2024-08-11 11:49:42 -03:00
20 changed files with 1699 additions and 157 deletions

View File

@@ -1,5 +1,5 @@
# This configuration file was generated by `ameba --gen-config`
# on 2024-08-04 23:09:09 UTC using Ameba version 1.6.1.
# on 2024-08-12 22:00:49 UTC using Ameba version 1.6.1.
# The point is for the user to remove these configuration records
# one by one as the reported problems are removed from the code base.
@@ -9,7 +9,7 @@ Documentation/DocumentationAdmonition:
Description: Reports documentation admonitions
Timezone: UTC
Excluded:
- src/tartrazine.cr
- src/lexer.cr
- src/actions.cr
Admonitions:
- TODO
@@ -17,3 +17,105 @@ Documentation/DocumentationAdmonition:
- BUG
Enabled: true
Severity: Warning
# Problems found: 22
# Run `ameba --only Lint/MissingBlockArgument` for details
Lint/MissingBlockArgument:
Description: Disallows yielding method definitions without block argument
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 1
# Run `ameba --only Lint/NotNil` for details
Lint/NotNil:
Description: Identifies usage of `not_nil!` calls
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 34
# Run `ameba --only Lint/ShadowingOuterLocalVar` for details
Lint/ShadowingOuterLocalVar:
Description: Disallows the usage of the same name as outer local variables for block
or proc arguments
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 1
# Run `ameba --only Lint/UnreachableCode` for details
Lint/UnreachableCode:
Description: Reports unreachable code
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 6
# Run `ameba --only Lint/UselessAssign` for details
Lint/UselessAssign:
Description: Disallows useless variable assignments
ExcludeTypeDeclarations: false
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 3
# Run `ameba --only Naming/BlockParameterName` for details
Naming/BlockParameterName:
Description: Disallows non-descriptive block parameter names
MinNameLength: 3
AllowNamesEndingInNumbers: true
Excluded:
- pygments/tests/examplefiles/cr/test.cr
AllowedNames:
- _
- e
- i
- j
- k
- v
- x
- y
- ex
- io
- ws
- op
- tx
- id
- ip
- k1
- k2
- v1
- v2
ForbiddenNames: []
Enabled: true
Severity: Convention
# Problems found: 1
# Run `ameba --only Naming/RescuedExceptionsVariableName` for details
Naming/RescuedExceptionsVariableName:
Description: Makes sure that rescued exceptions variables are named as expected
Excluded:
- pygments/tests/examplefiles/cr/test.cr
AllowedNames:
- e
- ex
- exception
- error
Enabled: true
Severity: Convention
# Problems found: 6
# Run `ameba --only Naming/TypeNames` for details
Naming/TypeNames:
Description: Enforces type names in camelcase manner
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Convention

View File

@@ -1,5 +1,5 @@
build: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
shards build -Dstrict_multi_assign -Dno_number_autocast
shards build -Dstrict_multi_assign -Dno_number_autocast -d --error-trace
release: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
shards build --release
static: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml

View File

@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
a port of [Pygments](https://pygments.org/) to
[Crystal](https://crystal-lang.org/). Kind of.
It's not currently usable because it's not finished, but:
* The lexers work for the implemented languages
* The provided styles work
* There is a very very simple HTML formatter
The CLI tool can be used to highlight many things in many styles.
# A port of what? Why "kind of"?
Because I did not read the Pygments code. And this is actually
based on [Chroma](https://github.com/alecthomas/chroma) ...
although I did not read that code either.
Pygments is a staple of the Python ecosystem, and it's great.
It lets you highlight code in many languages, and it has many
themes. Chroma is "Pygments for Go", it's actually a port of
Pygments to Go, and it's great too.
I wanted that in Crystal, so I started this project. But I did
not read much of the Pygments code. Or much of Chroma's.
Chroma has taken most of the Pygments lexers and turned them into
XML descriptions. What I did was take those XML files from Chroma
@@ -31,12 +31,21 @@ is a subset of Pygments'.
Currently Tartrazine supports ... 241 languages.
It has 332 themes (64 from Chroma, the rest are base16 themes via
It has 331 themes (63 from Chroma, the rest are base16 themes via
[Sixteen](https://github.com/ralsina/sixteen)).
## Installation
This has a CLI but it's not generally usable.
From prebuilt binaries:
Each release provides statically-linked binaries that should
work on any Linux. Get them from the [releases page](https://github.com/ralsina/tartrazine/releases) and put them in your PATH.
To build from source:
1. Clone this repo
2. Run `make` to build the `tartrazine` binary
3. Copy the binary somewhere in your PATH.
## Usage
@@ -60,4 +69,4 @@ puts Tartrazine::Html.new.format(File.read(ARGV[0]), lexer, theme)
## Contributors
- [Roberto Alsina](https://github.com/ralsina) - creator and maintainer
- [Roberto Alsina](https://github.com/ralsina) - creator and maintainer

10
TODO.md
View File

@@ -2,6 +2,10 @@
## TODO
* Implement styles
* Implement formatters
* Implement lexer loader that respects aliases, etc
* Implement styles
* Implement formatters
* Implement CLI
* ✅ Implement lexer loader that respects aliases
* ✅ Implement lexer loader by file extension
* ✅ Add --line-numbers to terminal formatter
* Implement lexer loader by mime type

54
scripts/lexer_metadata.py Normal file
View File

@@ -0,0 +1,54 @@
# This script parses the metadata of all the lexers and generates
# a datafile with all the information so we don't have to instantiate
# all the lexers to get the information.
#
# Input:  every lexers/*.xml file (only the part before "</config" is read).
# Output: src/constants/lexers.cr, a Crystal module with three hash
#         constants: LEXERS_BY_NAME, LEXERS_BY_MIMETYPE, LEXERS_BY_FILENAME.
import glob
from collections import defaultdict
from pathlib import Path


def tag_text(line):
    """Return the lowercased text between the first '>' and the next '<'."""
    return line.split(">")[1].split("<")[0].lower()


lexer_by_name = {}
# Maps a mime type to a single lexer name (the last lexer seen wins), so a
# plain dict is used rather than a defaultdict(set).
lexer_by_mimetype = {}
lexer_by_filename = defaultdict(set)

# Sorted so the generated file does not depend on filesystem listing order.
for fname in sorted(glob.glob("lexers/*.xml")):
    aliases = set()
    mimetypes = set()
    filenames = set()
    print(fname)
    # Strip only the final extension, so a lexer file whose name contains a
    # dot keeps its full name (split(".")[0] would truncate it).
    lexer_name = Path(fname).stem
    with open(fname, encoding="utf-8") as f:
        for line in f:
            # All the metadata lives in the <config> element; stop after it.
            if "</config" in line:
                break
            if "<filename>" in line:
                filenames.add(tag_text(line))
            if "<mime_type>" in line:
                mimetypes.add(tag_text(line))
            if "<alias>" in line:
                aliases.add(tag_text(line))
            # The lexer's own name is registered as an alias too.
            if "<name>" in line:
                aliases.add(tag_text(line))
    # Sorted so error reporting and output order are reproducible.
    for alias in sorted(aliases):
        # NOTE(review): this compares the alias with the *lexer name* already
        # registered for it, so a clash is tolerated when the alias equals
        # that lexer's file name. Kept as in the original -- confirm intent.
        if alias in lexer_by_name and alias != lexer_by_name[alias]:
            raise Exception(f"Alias {alias} already in use by {lexer_by_name[alias]}")
        lexer_by_name[alias] = lexer_name
    for mimetype in sorted(mimetypes):
        lexer_by_mimetype[mimetype] = lexer_name
    for filename in sorted(filenames):
        lexer_by_filename[filename].add(lexer_name)

with open("src/constants/lexers.cr", "w", encoding="utf-8") as f:
    f.write("module Tartrazine\n")
    f.write(" LEXERS_BY_NAME = {\n")
    for k, v in lexer_by_name.items():
        f.write(f'"{k}" => "{v}", \n')
    f.write("}\n")
    f.write(" LEXERS_BY_MIMETYPE = {\n")
    for k, v in lexer_by_mimetype.items():
        f.write(f'"{k}" => "{v}", \n')
    f.write("}\n")
    f.write(" LEXERS_BY_FILENAME = {\n")
    for k, v in lexer_by_filename.items():
        # Build the Crystal array literal outside the f-string: reusing the
        # enclosing quote or a backslash inside a replacement field is a
        # SyntaxError before Python 3.12.
        names = ", ".join(f'"{name}"' for name in sorted(v))
        f.write(f'"{k}" => [{names}], \n')
    f.write("}\n")
    f.write("end\n")

View File

@@ -1,3 +1,10 @@
# Script to generate abbreviations for tokens. Parses all lexers
# and styles files to find all token names and generate a unique
# abbreviation for each one. The abbreviations are generated by
# taking the uppercase letters of the token name and converting
# them to lowercase. If the abbreviation is not unique, the script
# will print a warning and exit.
import sys
import string
import glob
@@ -40,7 +47,9 @@ for fname in glob.glob("styles/*.xml"):
tokens.add(line)
check_abbrevs()
print("Abbreviations = {")
for k, v in abbrevs.items():
print(f' "{k}" => "{v}",')
print("}")
with open ("src/constants/token_abbrevs.cr", "w") as outf:
outf.write("module Tartrazine\n")
outf.write(" Abbreviations = {\n")
for k in sorted(abbrevs.keys()):
outf.write(f' "{k}" => "{abbrevs[k]}",\n')
outf.write(" }\nend\n")

View File

@@ -1,5 +1,5 @@
name: tartrazine
version: 0.1.1
version: 0.4.0
authors:
- Roberto Alsina <roberto.alsina@gmail.com>

View File

@@ -14,15 +14,18 @@ unicode_problems = {
"#{__DIR__}/tests/java/test_string_literals.txt",
"#{__DIR__}/tests/json/test_strings.txt",
"#{__DIR__}/tests/systemd/example1.txt",
"#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
}
# These testcases fail because of differences in the way chroma and tartrazine tokenize
# but tartrazine is correct
bad_in_chroma = {
"#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt",
"#{__DIR__}/tests/html/javascript_backtracking.txt",
"#{__DIR__}/tests/java/test_default.txt",
"#{__DIR__}/tests/java/test_multiline_string.txt",
"#{__DIR__}/tests/java/test_numeric_literals.txt",
"#{__DIR__}/tests/octave/test_multilinecomment.txt",
"#{__DIR__}/tests/php/test_string_escaping_run.txt",
"#{__DIR__}/tests/python_2/test_cls_builtin.txt",
}
@@ -30,19 +33,14 @@ bad_in_chroma = {
known_bad = {
"#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/prompt_in_output.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
"#{__DIR__}/tests/bash_session/ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_virtualenv.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
"#{__DIR__}/tests/c/test_string_resembling_decl_end.txt",
"#{__DIR__}/tests/html/css_backtracking.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
"#{__DIR__}/tests/bash_session/test_virtualenv.txt",
"#{__DIR__}/tests/mcfunction/data.txt",
"#{__DIR__}/tests/mcfunction/selectors.txt",
"#{__DIR__}/tests/php/anonymous_class.txt",
"#{__DIR__}/tests/html/javascript_unclosed.txt",
}
# Tests that fail because of a limitation in PCRE2

View File

@@ -1,5 +1,4 @@
require "./actions"
require "./constants"
require "./formatter"
require "./rules"
require "./styles"
@@ -31,11 +30,11 @@ module Tartrazine
end
# ameba:disable Metrics/CyclomaticComplexity
def emit(match : Regex::MatchData?, lexer : Lexer, match_group = 0) : Array(Token)
def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
case type
when "token"
raise Exception.new "Can't have a token without a match" if match.nil?
[Token.new(type: xml["type"], value: match[match_group])]
raise Exception.new "Can't have a token without a match" if match.empty?
[Token.new(type: xml["type"], value: String.new(match[match_group].value))]
when "push"
states_to_push = xml.attributes.select { |attrib|
attrib.name == "state"
@@ -80,23 +79,29 @@ module Tartrazine
# the action is skipped.
result = [] of Token
@actions.each_with_index do |e, i|
next if match[i + 1]?.nil?
begin
next if match[i + 1].size == 0
rescue IndexError
# FIXME: This should not actually happen
# No match for this group
next
end
result += e.emit(match, lexer, i + 1)
end
result
when "using"
# Shunt to another lexer entirely
return [] of Token if match.nil?
return [] of Token if match.empty?
lexer_name = xml["lexer"].downcase
Log.trace { "to tokenize: #{match[match_group]}" }
Tartrazine.lexer(lexer_name).tokenize(match[match_group], usingself: true)
Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
when "usingself"
# Shunt to another copy of this lexer
return [] of Token if match.nil?
return [] of Token if match.empty?
new_lexer = Lexer.from_xml(lexer.xml)
Log.trace { "to tokenize: #{match[match_group]}" }
new_lexer.tokenize(match[match_group], usingself: true)
new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
when "combined"
# Combine two states into one anonymous state
states = xml.attributes.select { |attrib|

75
src/bytes_regex.cr Normal file
View File

@@ -0,0 +1,75 @@
# Thin wrapper around LibPCRE2 that matches regexes against raw Bytes
# and reports captures as byte slices with byte offsets.
module BytesRegex
extend self
# A compiled PCRE2 pattern.
class Regex
# Compiles *pattern* with LibPCRE2.
#
# UTF, DUPNAMES, UCP and NO_UTF_CHECK are always set; MULTILINE, DOTALL,
# CASELESS and ANCHORED are added when the matching argument is true.
# Raises Exception, including the PCRE2 error message and the error
# offset, if compilation fails.
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
flags |= LibPCRE2::ANCHORED if anchored
# A falsy result from compile means failure; errorcode/erroroffset
# are filled in by the call and used in the else branch.
if @re = LibPCRE2.compile(
pattern,
pattern.bytesize,
flags,
out errorcode,
out erroroffset,
nil)
else
# Turn the numeric error code into a readable message (256-byte buffer).
msg = String.new(256) do |buffer|
bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
{bytesize, 0}
end
raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
end
end
# Frees the compiled pattern.
def finalize
LibPCRE2.code_free(@re)
end
# Runs the pattern against *str* starting at byte offset *pos*.
#
# Returns one Match per ovector pair reported by PCRE2 (indices
# 0...rc), or an empty array when LibPCRE2.match returns a negative
# code. Any negative code, not only "no match", yields an empty array.
def match(str : Bytes, pos = 0) : Array(Match)
match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
match = [] of Match
rc = LibPCRE2.match(
@re,
str,
str.size,
pos,
LibPCRE2::NO_UTF_CHECK,
match_data,
nil)
if rc < 0
# No match, do nothing
else
ovector = LibPCRE2.get_ovector_pointer(match_data)
(0...rc).each do |i|
# Each pair in the ovector is (start offset, end offset).
m_start = ovector[2 * i]
m_size = ovector[2 * i + 1] - m_start
if m_size == 0
# Zero-length pair: use an empty slice instead of indexing str.
# NOTE(review): presumably this also covers unset groups, where
# PCRE2 reports both offsets as PCRE2_UNSET -- confirm.
m_value = Bytes.new(0)
else
m_value = str[m_start...m_start + m_size]
end
match << Match.new(m_value, m_start, m_size)
end
end
# match_data is freed here; it is not freed if the code above raises.
LibPCRE2.match_data_free(match_data)
match
end
end
# One entry of a match result: the matched bytes together with the
# start offset and size read from the ovector.
class Match
property value : Bytes
property start : UInt64
property size : UInt64
def initialize(@value : Bytes, @start : UInt64, @size : UInt64)
end
end
end
# pattern = "foo"
# str = "foo bar"
# re = BytesRegex::Regex.new(pattern)
# p! String.new(re.match(str.to_slice)[0].value)

1160
src/constants/lexers.cr Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -22,6 +22,7 @@ module Tartrazine
"GenericSubheading" => "gs",
"GenericTraceback" => "gt",
"GenericUnderline" => "gu",
"Highlight" => "hl",
"Keyword" => "k",
"KeywordConstant" => "kc",
"KeywordDeclaration" => "kd",

View File

@@ -1,5 +1,4 @@
require "./actions"
require "./constants"
require "./formatter"
require "./rules"
require "./styles"
@@ -10,12 +9,15 @@ module Tartrazine
# This is the base class for all formatters.
abstract class Formatter
property name : String = ""
property theme : Theme = Tartrazine.theme("default-dark")
def format(text : String, lexer : Lexer, theme : Theme) : String
# Format the text using the given lexer.
def format(text : String, lexer : Lexer) : String
raise Exception.new("Not implemented")
end
def get_style_defs(theme : Theme) : String
# Return the styles, if the formatter supports it.
def style_defs : String
raise Exception.new("Not implemented")
end
end

View File

@@ -2,16 +2,25 @@ require "../formatter"
module Tartrazine
class Ansi < Formatter
def format(text : String, lexer : Lexer, theme : Theme) : String
property? line_numbers : Bool = false
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
end
def format(text : String, lexer : Lexer) : String
output = String.build do |outp|
lexer.tokenize(text).each do |token|
outp << self.colorize(token[:value], token[:type], theme)
lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
outp << label
line.each do |token|
outp << colorize(token[:value], token[:type])
end
end
end
output
end
def colorize(text : String, token : String, theme : Theme) : String
def colorize(text : String, token : String) : String
style = theme.styles.fetch(token, nil)
return text if style.nil?
if theme.styles.has_key?(token)

View File

@@ -1,3 +1,4 @@
require "../constants/token_abbrevs.cr"
require "../formatter"
module Tartrazine
@@ -14,20 +15,37 @@ module Tartrazine
property? standalone : Bool = false
property? surrounding_pre : Bool = true
property? wrap_long_lines : Bool = false
property weight_of_bold : Int32 = 600
def format(text : String, lexer : Lexer, theme : Theme) : String
text = format_text(text, lexer, theme)
property theme : Theme
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), *,
@highlight_lines = [] of Range(Int32, Int32),
@class_prefix : String = "",
@line_number_id_prefix = "line-",
@line_number_start = 1,
@tab_width = 8,
@line_numbers : Bool = false,
@linkable_line_numbers : Bool = true,
@standalone : Bool = false,
@surrounding_pre : Bool = true,
@wrap_long_lines : Bool = false,
@weight_of_bold : Int32 = 600)
end
def format(text : String, lexer : Lexer) : String
text = format_text(text, lexer)
if standalone?
text = wrap_standalone(text, theme)
text = wrap_standalone(text)
end
text
end
# Wrap text into a full HTML document, including the CSS for the theme
def wrap_standalone(text, theme) : String
def wrap_standalone(text) : String
output = String.build do |outp|
outp << "<!DOCTYPE html><html><head><style>"
outp << get_style_defs(theme)
outp << style_defs
outp << "</style></head><body>"
outp << text
outp << "</body></html>"
@@ -35,21 +53,21 @@ module Tartrazine
output
end
def format_text(text : String, lexer : Lexer, theme : Theme) : String
lines = group_tokens_in_lines(lexer.tokenize(text))
def format_text(text : String, lexer : Lexer) : String
lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
output = String.build do |outp|
if surrounding_pre?
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
outp << "<pre class=\"#{get_css_class("Background", theme)}\" #{pre_style}>"
outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
end
"<code class=\"#{get_css_class("Background", theme)}\">"
outp << "<code class=\"#{get_css_class("Background")}\">"
lines.each_with_index(offset: line_number_start - 1) do |line, i|
line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight", theme)}\"" : ""
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
line.each do |token|
fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
outp << fragment
end
end
@@ -59,10 +77,10 @@ module Tartrazine
end
# ameba:disable Metrics/CyclomaticComplexity
def get_style_defs(theme : Theme) : String
def style_defs : String
output = String.build do |outp|
theme.styles.each do |token, style|
outp << ".#{get_css_class(token, theme)} {"
outp << ".#{get_css_class(token)} {"
# These are set or nil
outp << "color: ##{style.color.try &.hex};" if style.color
outp << "background-color: ##{style.background.try &.hex};" if style.background
@@ -71,7 +89,7 @@ module Tartrazine
# These are true/false/nil
outp << "border: none;" if style.border == false
outp << "font-weight: bold;" if style.bold
outp << "font-weight: 400;" if style.bold == false
outp << "font-weight: #{@weight_of_bold};" if style.bold == false
outp << "font-style: italic;" if style.italic
outp << "font-style: normal;" if style.italic == false
outp << "text-decoration: underline;" if style.underline
@@ -85,7 +103,7 @@ module Tartrazine
end
# Given a token type, return the CSS class to use.
def get_css_class(token, theme)
def get_css_class(token : String) : String
return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
# Themes don't contain information for each specific
@@ -97,31 +115,9 @@ module Tartrazine
}]
end
# Is this line in the highlighted ranges?
def highlighted?(line : Int) : Bool
highlight_lines.any?(&.includes?(line))
end
def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
split_tokens = [] of Token
tokens.each do |token|
if token[:value].includes?("\n")
values = token[:value].split("\n")
values.each_with_index do |value, index|
value += "\n" if index < values.size - 1
split_tokens << {type: token[:type], value: value}
end
else
split_tokens << token
end
end
lines = [Array(Token).new]
split_tokens.each do |token|
lines.last << token
if token[:value].includes?("\n")
lines << Array(Token).new
end
end
lines
end
end
end

View File

@@ -1,3 +1,6 @@
require "baked_file_system"
require "./constants/lexers"
module Tartrazine
class LexerFiles
extend BakedFileSystem
@@ -5,6 +8,36 @@ module Tartrazine
bake_folder "../lexers", __DIR__
end
# Get the lexer object for a language name
# FIXME: support mimetypes
#
# Selection rules, in order:
# * neither *name* nor *filename* given: the "plaintext" lexer
# * *name* given and not "autodetect": looked up (downcased) in
#   LEXERS_BY_NAME
# * otherwise: guessed from *filename* using LEXERS_BY_FILENAME
#
# Raises Exception when more than one lexer matches the file name.
def self.lexer(name : String? = nil, filename : String? = nil) : Lexer
if name.nil? && filename.nil?
lexer_file_name = LEXERS_BY_NAME["plaintext"]
elsif name && name != "autodetect"
lexer_file_name = LEXERS_BY_NAME[name.downcase]
else
# Guess by filename
# Every key of LEXERS_BY_FILENAME is used as a pattern for File.match?
# against the base name; all lexers of matching patterns are collected.
candidates = Set(String).new
LEXERS_BY_FILENAME.each do |k, v|
candidates += v.to_set if File.match?(k, File.basename(filename.to_s))
end
case candidates.size
when 0
# Nothing matched: fall back to plain text.
lexer_file_name = LEXERS_BY_NAME["plaintext"]
when 1
lexer_file_name = candidates.first
else
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
end
end
# The lexer definition is read from the baked-in "/<name>.xml" file.
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
# Return a list of all lexers
def self.lexers : Array(String)
LEXERS_BY_NAME.keys.sort!
end
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
@@ -33,7 +66,7 @@ module Tartrazine
# is true when the lexer is being used to tokenize a string
# from a larger text that is already being tokenized.
# So, when it's true, we don't modify the text.
def tokenize(text, usingself = false) : Array(Token)
def tokenize(text : String, usingself = false) : Array(Token)
@state_stack = ["root"]
tokens = [] of Token
pos = 0
@@ -44,12 +77,13 @@ module Tartrazine
text += "\n"
end
text_bytes = text.to_slice
# Loop through the text, applying rules
while pos < text.size
while pos < text_bytes.size
state = states[@state_stack.last]
# Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
state.rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, self)
matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
if matched
# Move position forward, save the tokens,
# tokenize from the new position
@@ -62,8 +96,13 @@ module Tartrazine
end
# If no rule matches, emit an error token
unless matched
# Log.trace { "Error at #{pos}" }
tokens << {type: "Error", value: "#{text[pos]}"}
if text_bytes[pos] == 10u8
# at EOL, reset state to "root"
tokens << {type: "Text", value: "\n"}
@state_stack = ["root"]
else
tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
end
pos += 1
end
end
@@ -92,6 +131,30 @@ module Tartrazine
result
end
# Group tokens into lines, splitting them when a newline is found
#
# Returns an array of lines, each line being an array of tokens. The
# result always contains at least one (possibly empty) line.
def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
# Pass 1: cut every token that contains "\n" into one token per piece,
# keeping the token type and putting the "\n" back on every piece except
# the last.
# NOTE(review): a value ending in "\n" presumably produces a final piece
# with an empty value -- confirm against String#split.
split_tokens = [] of Token
tokens.each do |token|
if token[:value].includes?("\n")
values = token[:value].split("\n")
values.each_with_index do |value, index|
value += "\n" if index < values.size - 1
split_tokens << {type: token[:type], value: value}
end
else
split_tokens << token
end
end
# Pass 2: append tokens to the current line and start a new line after
# each token that contains a newline.
lines = [Array(Token).new]
split_tokens.each do |token|
lines.last << token
if token[:value].includes?("\n")
lines << Array(Token).new
end
end
lines
end
# ameba:disable Metrics/CyclomaticComplexity
def self.from_xml(xml : String) : Lexer
l = Lexer.new
@@ -173,8 +236,4 @@ module Tartrazine
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
def self.lexer(name : String) : Lexer
Lexer.from_xml(LexerFiles.get("/#{name}.xml").gets_to_end)
end
end

View File

@@ -1,35 +1,97 @@
require "docopt"
require "./**"
HELP = <<-HELP
tartrazine: a syntax highlighting tool
Usage:
tartrazine (-h, --help)
tartrazine FILE -f html [-t theme][--standalone][--line-numbers]
[-l lexer] [-o output][--css]
tartrazine FILE -f terminal [-t theme][-l lexer][-o output]
[-l lexer][-o output]
tartrazine -f html -t theme --css
tartrazine FILE -f terminal [-t theme][-l lexer][--line-numbers]
[-o output]
tartrazine FILE -f json [-o output]
tartrazine --list-themes
tartrazine --list-lexers
tartrazine --list-formatters
tartrazine --version
-f <formatter> Format to use (html, terminal, json)
-t <theme> Theme to use (see --list-themes)
-l <lexer> Lexer (language) to use (see --list-lexers)
-o <output> Output file (default: stdout)
--standalone Generate a standalone HTML file
--css Generate a CSS file for the theme
--line-numbers Include line numbers in the output
Options:
-f <formatter> Format to use (html, terminal, json)
-t <theme> Theme to use, see --list-themes [default: default-dark]
-l <lexer> Lexer (language) to use, see --list-lexers [default: autodetect]
-o <output> Output file. Default is stdout.
--standalone Generate a standalone HTML file, which includes
all style information. If not given, it will generate just
a HTML fragment ready to include in your own page.
--css Generate a CSS file for the theme called <theme>.css
--line-numbers Include line numbers in the output
-h, --help Show this screen
-v, --version Show version number
HELP
lexer = Tartrazine.lexer("crystal")
theme = Tartrazine.theme(ARGV[1])
# formatter = Tartrazine::Json.new
formatter = Tartrazine::Html.new
formatter.standalone = true
formatter.class_prefix = "hl-"
formatter.line_number_id_prefix = "ln-"
formatter.line_numbers = true
formatter.highlight_lines = [3..7, 20..30]
formatter.linkable_line_numbers = false
formatter.wrap_long_lines = false
puts formatter.format(File.read(ARGV[0]), lexer, theme)
options = Docopt.docopt(HELP, ARGV)
# Handle version manually
if options["--version"]
puts "tartrazine #{Tartrazine::VERSION}"
exit 0
end
if options["--list-themes"]
puts Tartrazine.themes.join("\n")
exit 0
end
if options["--list-lexers"]
puts Tartrazine.lexers.join("\n")
exit 0
end
if options["--list-formatters"]
puts "html\njson\nterminal"
exit 0
end
theme = Tartrazine.theme(options["-t"].as(String))
if options["-f"]
formatter = options["-f"].as(String)
case formatter
when "html"
formatter = Tartrazine::Html.new
formatter.standalone = options["--standalone"] != nil
formatter.line_numbers = options["--line-numbers"] != nil
formatter.theme = theme
when "terminal"
formatter = Tartrazine::Ansi.new
formatter.line_numbers = options["--line-numbers"] != nil
formatter.theme = theme
when "json"
formatter = Tartrazine::Json.new
else
puts "Invalid formatter: #{formatter}"
exit 1
end
if formatter.is_a?(Tartrazine::Html) && options["--css"]
File.open("#{options["-t"].as(String)}.css", "w") do |outf|
outf.puts formatter.style_defs
end
exit 0
end
lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String))
input = File.open(options["FILE"].as(String)).gets_to_end
output = formatter.format(input, lexer)
if options["-o"].nil?
puts output
else
File.open(options["-o"].as(String), "w") do |outf|
outf.puts output
end
end
end

View File

@@ -1,9 +1,9 @@
require "./actions"
require "./constants"
require "./bytes_regex"
require "./formatter"
require "./lexer"
require "./rules"
require "./styles"
require "./lexer"
# These are lexer rules. They match with the text being parsed
# and perform actions, either emitting tokens or changing the
@@ -11,16 +11,21 @@ require "./lexer"
module Tartrazine
# This rule matches via a regex pattern
alias Regex = BytesRegex::Regex
alias Match = BytesRegex::Match
alias MatchData = Array(Match)
class Rule
property pattern : Regex = Re2.new ""
property pattern : Regex = Regex.new ""
property actions : Array(Action) = [] of Action
property xml : String = "foo"
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos)
# We don't match if the match doesn't move the cursor
# because that causes infinite loops
return false, pos, [] of Token if match.nil? || match.end == 0
return false, pos, [] of Token if match.empty? || match[0].size == 0
# p! match, String.new(text[pos..pos+20])
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
tokens = [] of Token
# Emit the tokens
@@ -28,18 +33,21 @@ module Tartrazine
# Emit the token
tokens += action.emit(match, lexer)
end
Log.trace { "#{xml}, #{match.end}, #{tokens}" }
return true, match.end, tokens
Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
return true, pos + match[0].size, tokens
end
def initialize(node : XML::Node, multiline, dotall, ignorecase)
@xml = node.to_s
@pattern = Re2.new(
node["pattern"],
multiline,
dotall,
ignorecase,
anchored: true)
pattern = node["pattern"]
# flags = Regex::Options::ANCHORED
# MULTILINE implies DOTALL which we don't want, so we
# use in-pattern flag (?m) instead
# flags |= Regex::Options::MULTILINE if multiline
pattern = "(?m)" + pattern if multiline
# flags |= Regex::Options::DOTALL if dotall
# flags |= Regex::Options::IGNORE_CASE if ignorecase
@pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
add_actions(node)
end
@@ -81,7 +89,7 @@ module Tartrazine
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
tokens = [] of Token
actions.each do |action|
tokens += action.emit(nil, lexer)
tokens += action.emit([] of Match, lexer)
end
return true, pos, tokens
end
@@ -91,25 +99,4 @@ module Tartrazine
add_actions(node)
end
end
# This is a hack to workaround that Crystal seems to disallow
# having regexes multiline but not dot_all
class Re2 < Regex
@source = "fa"
@options = Regex::Options::None
@jit = true
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
LibPCRE2::UCP
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
flags |= LibPCRE2::ANCHORED if anchored
flags |= LibPCRE2::NO_UTF_CHECK
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
raise Exception.new(error_message)
end
end
end
end

View File

@@ -1,5 +1,4 @@
require "./actions"
require "./constants"
require "./formatter"
require "./rules"
require "./styles"
@@ -10,6 +9,11 @@ require "xml"
module Tartrazine
alias Color = Sixteen::Color
class ThemeFiles
extend BakedFileSystem
bake_folder "../styles", __DIR__
end
def self.theme(name : String) : Theme
begin
return Theme.from_base16(name)
@@ -23,9 +27,16 @@ module Tartrazine
end
end
class ThemeFiles
extend BakedFileSystem
bake_folder "../styles", __DIR__
# Return a list of all themes
def self.themes
themes = Set(String).new
ThemeFiles.files.each do |file|
themes << file.path.split("/").last.split(".").first
end
Sixteen::DataFiles.files.each do |file|
themes << file.path.split("/").last.split(".").first
end
themes.to_a.sort!
end
class Style

View File

@@ -1,5 +1,4 @@
require "./actions"
require "./constants"
require "./formatter"
require "./rules"
require "./styles"
@@ -12,7 +11,7 @@ require "xml"
module Tartrazine
extend self
VERSION = "0.1.1"
VERSION = {{ `shards version #{__DIR__}`.chomp.stringify }}
Log = ::Log.for("tartrazine")
end