3 Commits

SHA1 Message Date
2a19f3889f Not going to work 2024-08-07 15:03:53 -03:00
b9e51824df Exploring re2, doesn't really work 2024-08-07 14:41:02 -03:00
ff1c0012ec Exploring re2, doesn't really work 2024-08-07 13:11:19 -03:00
31 changed files with 822 additions and 2271 deletions

.ameba.yml

@@ -1,5 +1,5 @@
# This configuration file was generated by `ameba --gen-config`
# on 2024-08-12 22:00:49 UTC using Ameba version 1.6.1.
# on 2024-08-04 23:09:09 UTC using Ameba version 1.6.1.
# The point is for the user to remove these configuration records
# one by one as the reported problems are removed from the code base.
@@ -9,7 +9,7 @@ Documentation/DocumentationAdmonition:
Description: Reports documentation admonitions
Timezone: UTC
Excluded:
- src/lexer.cr
- src/tartrazine.cr
- src/actions.cr
Admonitions:
- TODO
@@ -17,105 +17,3 @@ Documentation/DocumentationAdmonition:
- BUG
Enabled: true
Severity: Warning
# Problems found: 22
# Run `ameba --only Lint/MissingBlockArgument` for details
Lint/MissingBlockArgument:
Description: Disallows yielding method definitions without block argument
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 1
# Run `ameba --only Lint/NotNil` for details
Lint/NotNil:
Description: Identifies usage of `not_nil!` calls
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 34
# Run `ameba --only Lint/ShadowingOuterLocalVar` for details
Lint/ShadowingOuterLocalVar:
Description: Disallows the usage of the same name as outer local variables for block
or proc arguments
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 1
# Run `ameba --only Lint/UnreachableCode` for details
Lint/UnreachableCode:
Description: Reports unreachable code
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 6
# Run `ameba --only Lint/UselessAssign` for details
Lint/UselessAssign:
Description: Disallows useless variable assignments
ExcludeTypeDeclarations: false
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 3
# Run `ameba --only Naming/BlockParameterName` for details
Naming/BlockParameterName:
Description: Disallows non-descriptive block parameter names
MinNameLength: 3
AllowNamesEndingInNumbers: true
Excluded:
- pygments/tests/examplefiles/cr/test.cr
AllowedNames:
- _
- e
- i
- j
- k
- v
- x
- y
- ex
- io
- ws
- op
- tx
- id
- ip
- k1
- k2
- v1
- v2
ForbiddenNames: []
Enabled: true
Severity: Convention
# Problems found: 1
# Run `ameba --only Naming/RescuedExceptionsVariableName` for details
Naming/RescuedExceptionsVariableName:
Description: Makes sure that rescued exceptions variables are named as expected
Excluded:
- pygments/tests/examplefiles/cr/test.cr
AllowedNames:
- e
- ex
- exception
- error
Enabled: true
Severity: Convention
# Problems found: 6
# Run `ameba --only Naming/TypeNames` for details
Naming/TypeNames:
Description: Enforces type names in camelcase manner
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Convention

.gitignore

@@ -6,4 +6,3 @@
chroma/
pygments/
shard.lock
.vscode/

Dockerfile.static

@@ -1,15 +0,0 @@
FROM --platform=${TARGETPLATFORM:-linux/amd64} alpine:3.20 AS build
RUN apk add --no-cache \
crystal \
shards \
yaml-dev \
yaml-static \
openssl-dev \
openssl-libs-static \
libxml2-dev \
libxml2-static \
zlib-dev \
zlib-static \
xz-dev \
xz-static \
make

Makefile

@@ -1,7 +0,0 @@
build: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
	shards build -Dstrict_multi_assign -Dno_number_autocast -d --error-trace
release: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
	shards build --release
static: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
	shards build --release --static
	strip bin/tartrazine

README.md

@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
a port of [Pygments](https://pygments.org/) to
[Crystal](https://crystal-lang.org/). Kind of.
The CLI tool can be used to highlight many things in many styles.
It's not currently usable because it's not finished, but:
* The lexers work for the implemented languages
* The provided styles work
* There is a very very simple HTML formatter
# A port of what? Why "kind of"?
Pygments is a staple of the Python ecosystem, and it's great.
It lets you highlight code in many languages, and it has many
themes. Chroma is "Pygments for Go", it's actually a port of
Pygments to Go, and it's great too.
I wanted that in Crystal, so I started this project. But I did
not read much of the Pygments code. Or much of Chroma's.
Because I did not read the Pygments code. And this is actually
based on [Chroma](https://github.com/alecthomas/chroma) ...
although I did not read that code either.
Chroma has taken most of the Pygments lexers and turned them into
XML descriptions. What I did was take those XML files from Chroma
@@ -31,21 +31,14 @@ is a subset of Pygments'.
Currently Tartrazine supports ... 241 languages.
It has 331 themes (63 from Chroma, the rest are base16 themes via
It has 332 themes (64 from Chroma, the rest are base16 themes via
[Sixteen](https://github.com/ralsina/sixteen))
## Installation
From prebuilt binaries:
This will have a CLI tool that can be installed, but it's not
there yet.
Each release provides statically-linked binaries that should
work on any Linux. Get them from the [releases page](https://github.com/ralsina/tartrazine/releases) and put them in your PATH.
To build from source:
1. Clone this repo
2. Run `make` to build the `tartrazine` binary
3. Copy the binary somewhere in your PATH.
## Usage
@@ -69,4 +62,4 @@ puts Tartrazine::Html.new.format(File.read(ARGV[0]), lexer, theme)
## Contributors
- [Roberto Alsina](https://github.com/ralsina) - creator and maintainer
- [Roberto Alsina](https://github.com/ralsina) - creator and maintainer
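
As a reference for the Usage section above, here is a minimal sketch of the API this diff shows (`Tartrazine.lexer`, `Tartrazine.theme`, and the `Html` formatter's three-argument `format`); the input file name is illustrative:

    require "tartrazine"

    lexer = Tartrazine.lexer("crystal")
    theme = Tartrazine.theme("default-dark")
    puts Tartrazine::Html.new.format(File.read("hello.cr"), lexer, theme)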

TODO.md

@@ -2,10 +2,6 @@
## TODO
* Implement styles
* Implement formatters
* Implement CLI
* ✅ Implement lexer loader that respects aliases
* ✅ Implement lexer loader by file extension
* ✅ Add --line-numbers to terminal formatter
* Implement lexer loader by mime type
* Implement styles
* Implement formatters
* Implement lexer loader that respects aliases, etc


@@ -1,16 +0,0 @@
#!/bin/bash
set -e
docker run --rm --privileged \
multiarch/qemu-user-static \
--reset -p yes
# Build for AMD64
docker build . -f Dockerfile.static -t tartrazine-builder
docker run -ti --rm -v "$PWD":/app --user="$UID" tartrazine-builder /bin/sh -c "cd /app && rm -rf lib shard.lock && make static"
mv bin/tartrazine bin/tartrazine-static-linux-amd64
# Build for ARM64
docker build . -f Dockerfile.static --platform linux/arm64 -t tartrazine-builder
docker run -ti --rm -v "$PWD":/app --platform linux/arm64 --user="$UID" tartrazine-builder /bin/sh -c "cd /app && rm -rf lib shard.lock && make static"
mv bin/tartrazine bin/tartrazine-static-linux-arm64


@@ -1,54 +0,0 @@
# This script parses the metadata of all the lexers and generates
# a datafile with all the information so we don't have to instantiate
# all the lexers to get the information.
import glob
from collections import defaultdict

lexer_by_name = {}
lexer_by_mimetype = defaultdict(set)
lexer_by_filename = defaultdict(set)

for fname in glob.glob("lexers/*.xml"):
    aliases = set([])
    mimetypes = set([])
    filenames = set([])
    print(fname)
    with open(fname) as f:
        lexer_name = fname.split("/")[-1].split(".")[0]
        for line in f:
            if "</config" in line:
                break
            if "<filename>" in line:
                filenames.add(line.split(">")[1].split("<")[0].lower())
            if "<mime_type>" in line:
                mimetypes.add(line.split(">")[1].split("<")[0].lower())
            if "<alias>" in line:
                aliases.add(line.split(">")[1].split("<")[0].lower())
            if "<name>" in line:
                aliases.add(line.split(">")[1].split("<")[0].lower())
    for alias in aliases:
        if alias in lexer_by_name and alias != lexer_by_name[alias]:
            raise Exception(f"Alias {alias} already in use by {lexer_by_name[alias]}")
        lexer_by_name[alias] = lexer_name
    for mimetype in mimetypes:
        lexer_by_mimetype[mimetype] = lexer_name
    for filename in filenames:
        lexer_by_filename[filename].add(lexer_name)

with open("src/constants/lexers.cr", "w") as f:
    f.write("module Tartrazine\n")
    f.write(" LEXERS_BY_NAME = {\n")
    for k, v in lexer_by_name.items():
        f.write(f'"{k}" => "{v}", \n')
    f.write("}\n")
    f.write(" LEXERS_BY_MIMETYPE = {\n")
    for k, v in lexer_by_mimetype.items():
        f.write(f'"{k}" => "{v}", \n')
    f.write("}\n")
    f.write(" LEXERS_BY_FILENAME = {\n")
    for k, v in lexer_by_filename.items():
        # swap single quotes for double quotes so the output is valid Crystal
        vals = str(list(v)).replace("'", '"')
        f.write(f'"{k}" => {vals}, \n')
    f.write("}\n")
    f.write("end\n")

scripts/token_abbrevs.py

@@ -1,55 +1,24 @@
# Script to generate abbreviations for tokens. Parses all lexers
# and styles files to find all token names and generate a unique
# abbreviation for each one. The abbreviations are generated by
# taking the uppercase letters of the token name and converting
# them to lowercase. If the abbreviation is not unique, the script
# will print a warning and exit.
import sys
import string
import glob

tokens = {"Highlight"}
abbrevs = {"Highlight": "hl"}

# Run it as grep token lexers/* | python scripts/token_abbrevs.py


def abbr(line):
    return "".join(c for c in line if c in string.ascii_uppercase).lower()


def check_abbrevs():
    if len(abbrevs) != len(tokens):
        print("Warning: Abbreviations are not unique")
        print(len(abbrevs), len(tokens))
        sys.exit(1)


abbrevs = {}
tokens = set([])
for line in sys.stdin:
    if "<token" not in line:
        continue
    line = line.strip()
    line = line.split('<token ', 1)[-1]
    line = line.split('"')[1]
    abbrevs[line] = abbr(line)
    tokens.add(line)

# Processes all files in lexers looking for token names
for fname in glob.glob("lexers/*.xml"):
    with open(fname) as f:
        for line in f:
            if "<token" not in line:
                continue
            line = line.strip()
            line = line.split('<token ', 1)[-1]
            line = line.split('"')[1]
            abbrevs[line] = abbr(line)
            tokens.add(line)
check_abbrevs()

# Processes all files in styles looking for token names too
for fname in glob.glob("styles/*.xml"):
    with open(fname) as f:
        for line in f:
            if "<entry" not in line:
                continue
            line = line.strip()
            line = line.split('type=', 1)[-1]
            line = line.split('"')[1]
            abbrevs[line] = abbr(line)
            tokens.add(line)
check_abbrevs()

with open("src/constants/token_abbrevs.cr", "w") as outf:
    outf.write("module Tartrazine\n")
    outf.write(" Abbreviations = {\n")
    for k in sorted(abbrevs.keys()):
        outf.write(f' "{k}" => "{abbrevs[k]}",\n')
    outf.write(" }\nend\n")

print("Abbreviations: {")
for k, v in abbrevs.items():
    print(f' "{k}" => "{v}",')
print("}")

shard.yml

@@ -1,5 +1,5 @@
name: tartrazine
version: 0.4.0
version: 0.1.0
authors:
- Roberto Alsina <roberto.alsina@gmail.com>
@@ -9,14 +9,11 @@ targets:
    main: src/main.cr
dependencies:
  baked_file_system:
    github: schovi/baked_file_system
  base58:
    github: crystal-china/base58.cr
  sixteen:
    github: ralsina/sixteen
  docopt:
    github: chenkovsky/docopt.cr
    branch: main
crystal: ">= 1.13.0"


@@ -14,18 +14,15 @@ unicode_problems = {
"#{__DIR__}/tests/java/test_string_literals.txt",
"#{__DIR__}/tests/json/test_strings.txt",
"#{__DIR__}/tests/systemd/example1.txt",
"#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
}
# These testcases fail because of differences in the way chroma and tartrazine tokenize
# but tartrazine is correct
bad_in_chroma = {
"#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt",
"#{__DIR__}/tests/html/javascript_backtracking.txt",
"#{__DIR__}/tests/java/test_default.txt",
"#{__DIR__}/tests/java/test_multiline_string.txt",
"#{__DIR__}/tests/java/test_numeric_literals.txt",
"#{__DIR__}/tests/octave/test_multilinecomment.txt",
"#{__DIR__}/tests/php/test_string_escaping_run.txt",
"#{__DIR__}/tests/python_2/test_cls_builtin.txt",
}
@@ -33,14 +30,19 @@ bad_in_chroma = {
known_bad = {
"#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/prompt_in_output.txt",
"#{__DIR__}/tests/bash_session/ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
"#{__DIR__}/tests/bash_session/ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_virtualenv.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
"#{__DIR__}/tests/c/test_string_resembling_decl_end.txt",
"#{__DIR__}/tests/html/css_backtracking.txt",
"#{__DIR__}/tests/mcfunction/data.txt",
"#{__DIR__}/tests/mcfunction/selectors.txt",
"#{__DIR__}/tests/php/anonymous_class.txt",
"#{__DIR__}/tests/html/javascript_unclosed.txt",
}
# Tests that fail because of a limitation in PCRE2

src/actions.cr

@@ -1,8 +1,4 @@
require "./actions"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "xml"
# These are Lexer actions. When a rule matches, it will
# perform a list of actions. These actions can emit tokens
@@ -30,11 +26,11 @@ module Tartrazine
end
# ameba:disable Metrics/CyclomaticComplexity
def emit(match : MatchData, lexer : Lexer, match_group = 0) : Array(Token)
def emit(matches : Pointer(LibCre2::StringPiece), lexer : Lexer, match_group = 0) : Array(Token)
case type
when "token"
raise Exception.new "Can't have a token without a match" if match.empty?
[Token.new(type: xml["type"], value: String.new(match[match_group].value))]
raise Exception.new "Can't have a token without a match" if matches[0].length == 0
[Token.new(type: xml["type"], value: String.new(Slice.new(matches[0].data, matches[0].length)))]
when "push"
states_to_push = xml.attributes.select { |attrib|
attrib.name == "state"
@@ -67,41 +63,37 @@ module Tartrazine
when "bygroups"
# FIXME: handle
# ><bygroups>
# <token type="Punctuation"/>
# <token type="Punctuation"/>https://github.com/google/re2/wiki/Syntax
# None
# <token type="LiteralStringRegex"/>
#
# where that None means skipping a group
#
raise Exception.new "Can't have a token without a match" if match.nil?
raise Exception.new "Can't have a bygroups without a match" if matches[0].length == 0
# Each group matches an action. If the group match is empty,
# the action is skipped.
result = [] of Token
@actions.each_with_index do |e, i|
begin
next if match[i + 1].size == 0
rescue IndexError
# FIXME: This should not actually happen
# No match for this group
next
end
result += e.emit(match, lexer, i + 1)
next if matches[i + 1].length == 0
result += e.emit(matches, lexer, i + 1)
end
result
when "using"
# Shunt to another lexer entirely
return [] of Token if match.empty?
return [] of Token if matches[0].length == 0
lexer_name = xml["lexer"].downcase
Log.trace { "to tokenize: #{match[match_group]}" }
Tartrazine.lexer(lexer_name).tokenize(String.new(match[match_group].value), usingself: true)
# Log.trace { "to tokenize: #{match[match_group]}" }
to_tokenize = String.new(Slice.new(matches[match_group].data, matches[match_group].length))
Tartrazine.lexer(lexer_name).tokenize(to_tokenize, usingself: true)
when "usingself"
# Shunt to another copy of this lexer
return [] of Token if match.empty?
return [] of Token if matches[0].length == 0
new_lexer = Lexer.from_xml(lexer.xml)
Log.trace { "to tokenize: #{match[match_group]}" }
new_lexer.tokenize(String.new(match[match_group].value), usingself: true)
# Log.trace { "to tokenize: #{match[match_group]}" }
to_tokenize = String.new(Slice.new(matches[match_group].data, matches[match_group].length))
new_lexer.tokenize(to_tokenize, usingself: true)
when "combined"
# Combine two states into one anonymous state
states = xml.attributes.select { |attrib|

src/bytes_regex.cr

@@ -1,75 +0,0 @@
module BytesRegex
extend self
class Regex
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
flags |= LibPCRE2::ANCHORED if anchored
if @re = LibPCRE2.compile(
pattern,
pattern.bytesize,
flags,
out errorcode,
out erroroffset,
nil)
else
msg = String.new(256) do |buffer|
bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
{bytesize, 0}
end
raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
end
end
def finalize
LibPCRE2.code_free(@re)
end
def match(str : Bytes, pos = 0) : Array(Match)
match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
match = [] of Match
rc = LibPCRE2.match(
@re,
str,
str.size,
pos,
LibPCRE2::NO_UTF_CHECK,
match_data,
nil)
if rc < 0
# No match, do nothing
else
ovector = LibPCRE2.get_ovector_pointer(match_data)
(0...rc).each do |i|
m_start = ovector[2 * i]
m_size = ovector[2 * i + 1] - m_start
if m_size == 0
m_value = Bytes.new(0)
else
m_value = str[m_start...m_start + m_size]
end
match << Match.new(m_value, m_start, m_size)
end
end
LibPCRE2.match_data_free(match_data)
match
end
end
class Match
property value : Bytes
property start : UInt64
property size : UInt64
def initialize(@value : Bytes, @start : UInt64, @size : UInt64)
end
end
end
# pattern = "foo"
# str = "foo bar"
# re = BytesRegex::Regex.new(pattern)
# p! String.new(re.match(str.to_slice)[0].value)

src/constants/token_abbrevs.cr

@@ -1,100 +1,92 @@
module Tartrazine
Abbreviations = {
"Background" => "b",
"CodeLine" => "cl",
"Comment" => "c",
"CommentHashbang" => "ch",
"CommentMultiline" => "cm",
"CommentPreproc" => "cp",
"CommentPreprocFile" => "cpf",
"Text" => "t",
"CommentSingle" => "cs",
"CommentSpecial" => "cs",
"Error" => "e",
"Generic" => "g",
"GenericDeleted" => "gd",
"GenericEmph" => "ge",
"GenericError" => "ge",
"GenericHeading" => "gh",
"GenericInserted" => "gi",
"GenericOutput" => "go",
"GenericPrompt" => "gp",
"GenericStrong" => "gs",
"GenericSubheading" => "gs",
"GenericTraceback" => "gt",
"GenericUnderline" => "gu",
"Highlight" => "hl",
"NameVariable" => "nv",
"Keyword" => "k",
"KeywordConstant" => "kc",
"KeywordDeclaration" => "kd",
"KeywordNamespace" => "kn",
"KeywordPseudo" => "kp",
"KeywordReserved" => "kr",
"KeywordType" => "kt",
"LineHighlight" => "lh",
"LineNumbers" => "ln",
"LineNumbersTable" => "lnt",
"LineTable" => "lt",
"LineTableTD" => "lttd",
"NameFunction" => "nf",
"Punctuation" => "p",
"Operator" => "o",
"LiteralNumberInteger" => "lni",
"NameBuiltin" => "nb",
"Name" => "n",
"OperatorWord" => "ow",
"LiteralStringSingle" => "lss",
"Literal" => "l",
"LiteralDate" => "ld",
"LiteralNumber" => "ln",
"LiteralNumberBin" => "lnb",
"NameClass" => "nc",
"CommentMultiline" => "cm",
"LiteralStringRegex" => "lsr",
"KeywordDeclaration" => "kd",
"KeywordConstant" => "kc",
"NameOther" => "no",
"LiteralNumberFloat" => "lnf",
"LiteralNumberHex" => "lnh",
"LiteralNumberInteger" => "lni",
"LiteralNumberIntegerLong" => "lnil",
"LiteralNumberOct" => "lno",
"LiteralOther" => "lo",
"LiteralString" => "ls",
"LiteralStringAffix" => "lsa",
"LiteralStringAtom" => "lsa",
"LiteralStringBacktick" => "lsb",
"LiteralStringBoolean" => "lsb",
"LiteralStringChar" => "lsc",
"LiteralStringDelimiter" => "lsd",
"LiteralStringDoc" => "lsd",
"LiteralStringDouble" => "lsd",
"LiteralStringEscape" => "lse",
"LiteralStringHeredoc" => "lsh",
"LiteralStringInterpol" => "lsi",
"LiteralStringName" => "lsn",
"LiteralStringOther" => "lso",
"LiteralStringRegex" => "lsr",
"LiteralStringSingle" => "lss",
"LiteralStringSymbol" => "lss",
"Name" => "n",
"NameAttribute" => "na",
"NameBuiltin" => "nb",
"NameBuiltinPseudo" => "nbp",
"NameClass" => "nc",
"NameConstant" => "nc",
"NameDecorator" => "nd",
"NameEntity" => "ne",
"NameException" => "ne",
"NameFunction" => "nf",
"NameFunctionMagic" => "nfm",
"NameKeyword" => "nk",
"NameLabel" => "nl",
"KeywordType" => "kt",
"NameNamespace" => "nn",
"NameOperator" => "no",
"NameOther" => "no",
"NameProperty" => "np",
"NamePseudo" => "np",
"NameTag" => "nt",
"NameVariable" => "nv",
"NameVariableAnonymous" => "nva",
"NameVariableClass" => "nvc",
"NameVariableGlobal" => "nvg",
"NameVariableInstance" => "nvi",
"NameVariableMagic" => "nvm",
"None" => "n",
"Operator" => "o",
"OperatorWord" => "ow",
"Other" => "o",
"Punctuation" => "p",
"Text" => "t",
"TextPunctuation" => "tp",
"TextSymbol" => "ts",
"NameAttribute" => "na",
"KeywordReserved" => "kr",
"CommentPreproc" => "cp",
"KeywordNamespace" => "kn",
"NameConstant" => "nc",
"NameLabel" => "nl",
"LiteralString" => "ls",
"LiteralStringChar" => "lsc",
"TextWhitespace" => "tw",
"LiteralStringEscape" => "lse",
"LiteralNumber" => "ln",
"Other" => "o",
"LiteralStringBoolean" => "lsb",
"NameProperty" => "np",
"Comment" => "c",
"NameTag" => "nt",
"LiteralStringOther" => "lso",
"NameVariableGlobal" => "nvg",
"NameBuiltinPseudo" => "nbp",
"LiteralNumberBin" => "lnb",
"KeywordPseudo" => "kp",
"CommentPreprocFile" => "cpf",
"LiteralStringAffix" => "lsa",
"LiteralStringDelimiter" => "lsd",
"LiteralNumberOct" => "lno",
"Error" => "e",
"Generic" => "g",
"LiteralNumberIntegerLong" => "lnil",
"NameDecorator" => "nd",
"LiteralStringInterpol" => "lsi",
"LiteralStringBacktick" => "lsb",
"GenericPrompt" => "gp",
"GenericOutput" => "go",
"LiteralStringName" => "lsn",
"LiteralStringHeredoc" => "lsh",
"LiteralStringSymbol" => "lss",
"NameVariableInstance" => "nvi",
"LiteralOther" => "lo",
"NameVariableClass" => "nvc",
"NameOperator" => "no",
"None" => "n",
"LiteralStringDoc" => "lsd",
"NameException" => "ne",
"GenericSubheading" => "gs",
"GenericStrong" => "gs",
"GenericDeleted" => "gd",
"GenericInserted" => "gi",
"GenericHeading" => "gh",
"NameEntity" => "ne",
"NamePseudo" => "np",
"CommentHashbang" => "ch",
"TextPunctuation" => "tp",
"NameVariableAnonymous" => "nva",
"NameVariableMagic" => "nvm",
"NameFunctionMagic" => "nfm",
"GenericEmph" => "ge",
"GenericUnderline" => "gu",
"LiteralStringAtom" => "lsa",
"LiteralDate" => "ld",
"GenericError" => "ge",
"TextSymbol" => "ts",
"NameKeyword" => "nk",
}
end

File diff suppressed because it is too large.

src/cre2/Makefile

@@ -0,0 +1,5 @@
all: cre2.o
clean:
	rm -f cre2.o
cre2.o: cre2.cpp cre2.h
	g++ -O3 -c -o cre2.o cre2.cpp

src/cre2/cre2.cpp

@@ -0,0 +1,122 @@
#include <re2/re2.h>
#include "cre2.h"
#define TO_OPT(opt) (reinterpret_cast<RE2::Options *>(opt))
cre2_options *cre2_opt_new(void) {
return reinterpret_cast<void*>(new RE2::Options());
}
void cre2_opt_delete(cre2_options *opt) {
delete TO_OPT(opt);
}
#define OPT_bool(name) \
void cre2_opt_##name(cre2_options *opt, int flag) { \
TO_OPT(opt)->set_##name(bool(flag)); \
}
OPT_bool(posix_syntax)
OPT_bool(longest_match)
OPT_bool(log_errors)
OPT_bool(literal)
OPT_bool(never_nl)
OPT_bool(dot_nl)
OPT_bool(case_sensitive)
OPT_bool(perl_classes)
OPT_bool(word_boundary)
OPT_bool(one_line)
#undef OPT_bool
void cre2_opt_encoding(cre2_options *opt, encoding_t enc) {
switch (enc) {
case CRE2_UTF8:
TO_OPT(opt)->set_encoding(RE2::Options::EncodingUTF8);
break;
case CRE2_Latin1:
TO_OPT(opt)->set_encoding(RE2::Options::EncodingLatin1);
break;
}
}
void cre2_opt_max_mem(cre2_options *opt, int m) {
TO_OPT(opt)->set_max_mem(m);
}
#define TO_RE2(re) (reinterpret_cast<RE2 *>(re))
#define TO_CONST_RE2(re) (reinterpret_cast<const RE2 *>(re))
cre2 *cre2_new(const char *pattern, int patternlen, const cre2_options *opt) {
re2::StringPiece pattern_re2(pattern, patternlen);
return reinterpret_cast<void*>(
new RE2(pattern_re2, *reinterpret_cast<const RE2::Options *>(opt)));
}
void cre2_delete(cre2 *re) {
delete TO_RE2(re);
}
int cre2_error_code(const cre2 *re) {
return int(TO_CONST_RE2(re)->error_code());
}
const char *cre2_error_string(const cre2 *re) {
return TO_CONST_RE2(re)->error().c_str();
}
void cre2_error_arg(const cre2 *re, struct string_piece *arg) {
const std::string &argstr = TO_CONST_RE2(re)->error_arg();
arg->data = argstr.data();
arg->length = argstr.length();
}
int cre2_num_capturing_groups(const cre2 *re) {
return TO_CONST_RE2(re)->NumberOfCapturingGroups();
}
int cre2_program_size(const cre2 *re) {
return TO_CONST_RE2(re)->ProgramSize();
}
int cre2_match(
const cre2 *re
, const char *text
, int textlen
, int startpos
, int endpos
, anchor_t anchor
, struct string_piece *match
, int nmatch) {
re2::StringPiece text_re2(text, textlen);
// FIXME: exceptions?
re2::StringPiece *match_re2 = new re2::StringPiece[nmatch];
RE2::Anchor anchor_re2 = RE2::UNANCHORED;
switch (anchor) {
case CRE2_ANCHOR_START:
anchor_re2 = RE2::ANCHOR_START; break;
case CRE2_ANCHOR_BOTH:
anchor_re2 = RE2::ANCHOR_BOTH; break;
}
bool ret = TO_CONST_RE2(re)
->Match(text_re2, startpos, endpos, anchor_re2, match_re2, nmatch);
if (ret) {
for (int i=0; i<nmatch; i++) {
match[i].data = match_re2[i].data();
match[i].length = match_re2[i].length();
}
}
delete [] match_re2;
return int(ret);
}
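// Note: the string_pieces written into `match` point into the caller's
// `text` buffer (RE2 does not copy submatches), so they are only valid
// while that buffer stays alive.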

src/cre2/cre2.cr

@@ -0,0 +1,70 @@
@[Link(ldflags: "#{__DIR__}/cre2.o -Wl,--copy-dt-needed-entries `pkg-config --libs re2`")]
lib LibCre2
  type Options = Void*

  fun opt_new = cre2_opt_new : Options
  fun opt_delete = cre2_opt_delete(op : Options) : Nil
  fun opt_posix_syntax = cre2_opt_posix_syntax(op : Options, flag : Bool) : Nil
  fun opt_longest_match = cre2_opt_longest_match(op : Options, flag : Bool) : Nil
  fun opt_log_errors = cre2_opt_log_errors(op : Options, flag : Bool) : Nil
  fun opt_literal = cre2_opt_literal(op : Options, flag : Bool) : Nil
  fun opt_never_nl = cre2_opt_never_nl(op : Options, flag : Bool) : Nil
  fun opt_case_sensitive = cre2_opt_case_sensitive(op : Options, flag : Bool) : Nil
  fun opt_perl_classes = cre2_opt_perl_classes(op : Options, flag : Bool) : Nil
  fun opt_word_boundary = cre2_opt_word_boundary(op : Options, flag : Bool) : Nil
  fun opt_one_line = cre2_opt_one_line(op : Options, flag : Bool) : Nil
  fun opt_dot_nl = cre2_opt_dot_nl(op : Options, flag : Bool) : Nil
  fun opt_encoding = cre2_opt_encoding(op : Options, encoding : Int32) : Nil
  fun opt_max_mem = cre2_opt_max_mem(op : Options, m : Int32) : Nil

  struct StringPiece
    data : LibC::Char*
    length : Int32
  end

  type CRe2 = Void*

  fun new = cre2_new(pattern : LibC::Char*, patternlen : UInt32, opt : Options) : CRe2
  fun del = cre2_delete(re : CRe2) : Nil
  fun error_code = cre2_error_code(re : CRe2) : Int32
  fun num_capturing_groups = cre2_num_capturing_groups(re : CRe2) : Int32
  fun program_size = cre2_program_size(re : CRe2) : Int32

  # Invalidated by further re use
  fun error_string = cre2_error_string(re : CRe2) : LibC::Char*
  fun error_arg = cre2_error_arg(re : CRe2, arg : StringPiece*) : Nil

  CRE2_UNANCHORED   = 1
  CRE2_ANCHOR_START = 2
  CRE2_ANCHOR_BOTH  = 3

  fun match = cre2_match(
    re : CRe2,
    text : LibC::Char*,
    textlen : UInt32,
    startpos : UInt32,
    endpos : UInt32,
    anchor : Int32,
    match : StringPiece*,
    nmatch : Int32
  ) : Int32
end

# match = Pointer(LibCre2::StringPiece).malloc(10)
# opts = LibCre2.opt_new
# LibCre2.opt_posix_syntax(opts, true)
# LibCre2.opt_longest_match(opts, true)
# LibCre2.opt_perl_classes(opts, true)
# LibCre2.opt_encoding(opts, 1)
# # LibCre2.opt_one_line(opts, false)
# # LibCre2.opt_never_nl(opts, false)
# pattern = "(\\s+)(foo)"
# text = " foo"
# re = LibCre2.new(pattern, pattern.size, opts)
# p! LibCre2.match(re, text, text.size, 0, text.size,
#   LibCre2::CRE2_ANCHOR_START, match, 10)
# (0...10).each do |i|
#   p! String.new(Slice.new(match[i].data, match[i].length))
# end

src/cre2/cre2.h

@@ -0,0 +1,67 @@
#ifdef __cplusplus
extern "C" {
#endif
typedef void cre2_options;
typedef int encoding_t;
#define CRE2_UTF8 1
#define CRE2_Latin1 2
cre2_options *cre2_opt_new(void);
void cre2_opt_delete(cre2_options *opt);
void cre2_opt_posix_syntax(cre2_options *opt, int flag);
void cre2_opt_longest_match(cre2_options *opt, int flag);
void cre2_opt_log_errors(cre2_options *opt, int flag);
void cre2_opt_literal(cre2_options *opt, int flag);
void cre2_opt_never_nl(cre2_options *opt, int flag);
void cre2_opt_case_sensitive(cre2_options *opt, int flag);
void cre2_opt_perl_classes(cre2_options *opt, int flag);
void cre2_opt_word_boundary(cre2_options *opt, int flag);
void cre2_opt_one_line(cre2_options *opt, int flag);
void cre2_opt_dot_nl(cre2_options *opt, int flag);
void cre2_opt_encoding(cre2_options *opt, encoding_t enc);
void cre2_opt_max_mem(cre2_options *opt, int m);
struct string_piece {
const char *data;
int length;
};
typedef void cre2;
cre2 *cre2_new(const char *pattern, int patternlen, const cre2_options *opt);
void cre2_delete(cre2 *re);
int cre2_error_code(const cre2 *re);
int cre2_num_capturing_groups(const cre2 *re);
int cre2_program_size(const cre2 *re);
// invalidated by further re use
const char *cre2_error_string(const cre2 *re);
void cre2_error_arg(const cre2 *re, struct string_piece *arg);
typedef int anchor_t;
#define CRE2_UNANCHORED 1
#define CRE2_ANCHOR_START 2
#define CRE2_ANCHOR_BOTH 3
int cre2_match(
const cre2 *re
, const char *text
, int textlen
, int startpos
, int endpos
, anchor_t anchor
, struct string_piece *match
, int nmatch);
#ifdef __cplusplus
} // extern "C"
#endif

src/cre2/cre2.o (binary file, not shown)

src/formatter.cr

@@ -1,24 +1,69 @@
require "./actions"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "colorize"
require "./constants.cr"
require "./styles.cr"
require "./tartrazine.cr"
module Tartrazine
# This is the base class for all formatters.
abstract class Formatter
property name : String = ""
property theme : Theme = Tartrazine.theme("default-dark")
# Format the text using the given lexer.
def format(text : String, lexer : Lexer) : String
def format(text : String, lexer : Lexer, theme : Theme) : String
raise Exception.new("Not implemented")
end
# Return the styles, if the formatter supports it.
def style_defs : String
raise Exception.new("Not implemented")
# ameba:disable Metrics/CyclomaticComplexity
def get_style_defs(theme : Theme) : String
output = String.build do |outp|
theme.styles.each do |token, style|
outp << ".#{get_css_class(token, theme)} {"
# These are set or nil
outp << "color: #{style.color};" if style.color
outp << "background-color: #{style.background};" if style.background
outp << "border: 1px solid #{style.border};" if style.border
# These are true/false/nil
outp << "border: none;" if style.border == false
outp << "font-weight: bold;" if style.bold
outp << "font-weight: 400;" if style.bold == false
outp << "font-style: italic;" if style.italic
outp << "font-style: normal;" if style.italic == false
outp << "text-decoration: underline;" if style.underline
outp << "text-decoration: none;" if style.underline == false
outp << "}"
end
end
output
end
end
class Html < Formatter
def format(text : String, lexer : Lexer, theme : Theme) : String
output = String.build do |outp|
outp << "<html><head><style>"
outp << get_style_defs(theme)
outp << "</style></head><body>"
outp << "<pre class=\"#{get_css_class("Background", theme)}\"><code class=\"#{get_css_class("Background", theme)}\">"
lexer.tokenize(text).each do |token|
fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
outp << fragment
end
outp << "</code></pre></body></html>"
end
output
end
# Given a token type, return the CSS class to use.
def get_css_class(token, theme)
return Abbreviations[token] if theme.styles.has_key?(token)
# Themes don't contain information for each specific
# token type. However, they may contain information
# for a parent style. Worst case, we go to the root
# (Background) style.
Abbreviations[theme.style_parents(token).reverse.find { |parent|
theme.styles.has_key?(parent)
}]
end
end
end
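# Sketch of the parent-style fallback above: when a theme has no entry for a
# specific token such as "LiteralStringRegex", get_css_class walks up the
# style parents (e.g. "LiteralString", then "Literal") and worst case lands
# on "Background", which every theme defines.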

src/formatters/ansi.cr

@@ -1,46 +0,0 @@
require "../formatter"
module Tartrazine
class Ansi < Formatter
property? line_numbers : Bool = false
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
end
def format(text : String, lexer : Lexer) : String
output = String.build do |outp|
lexer.group_tokens_in_lines(lexer.tokenize(text)).each_with_index do |line, i|
label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
outp << label
line.each do |token|
outp << colorize(token[:value], token[:type])
end
end
end
output
end
def colorize(text : String, token : String) : String
style = theme.styles.fetch(token, nil)
return text if style.nil?
if theme.styles.has_key?(token)
s = theme.styles[token]
else
# Themes don't contain information for each specific
# token type. However, they may contain information
# for a parent style. Worst case, we go to the root
# (Background) style.
s = theme.styles[theme.style_parents(token).reverse.find { |parent|
theme.styles.has_key?(parent)
}]
end
colorized = text.colorize
s.color.try { |col| colorized = colorized.fore(col.colorize) }
# Intentionally not setting background color
colorized.mode(:bold) if s.bold
colorized.mode(:italic) if s.italic
colorized.mode(:underline) if s.underline
colorized.to_s
end
end
end
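# Usage sketch for the terminal formatter above, removed in this diff
# (theme and file names illustrative):
#   ansi = Tartrazine::Ansi.new(theme: Tartrazine.theme("default-dark"), line_numbers: true)
#   puts ansi.format(File.read("hello.cr"), Tartrazine.lexer("crystal"))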

src/formatters/html.cr

@@ -1,123 +0,0 @@
require "../constants/token_abbrevs.cr"
require "../formatter"
module Tartrazine
class Html < Formatter
# property line_number_in_table : Bool = false
# property with_classes : Bool = true
property class_prefix : String = ""
property highlight_lines : Array(Range(Int32, Int32)) = [] of Range(Int32, Int32)
property line_number_id_prefix : String = "line-"
property line_number_start : Int32 = 1
property tab_width = 8
property? line_numbers : Bool = false
property? linkable_line_numbers : Bool = true
property? standalone : Bool = false
property? surrounding_pre : Bool = true
property? wrap_long_lines : Bool = false
property weight_of_bold : Int32 = 600
property theme : Theme
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), *,
@highlight_lines = [] of Range(Int32, Int32),
@class_prefix : String = "",
@line_number_id_prefix = "line-",
@line_number_start = 1,
@tab_width = 8,
@line_numbers : Bool = false,
@linkable_line_numbers : Bool = true,
@standalone : Bool = false,
@surrounding_pre : Bool = true,
@wrap_long_lines : Bool = false,
@weight_of_bold : Int32 = 600)
end
def format(text : String, lexer : Lexer) : String
text = format_text(text, lexer)
if standalone?
text = wrap_standalone(text)
end
text
end
# Wrap text into a full HTML document, including the CSS for the theme
def wrap_standalone(text) : String
output = String.build do |outp|
outp << "<!DOCTYPE html><html><head><style>"
outp << style_defs
outp << "</style></head><body>"
outp << text
outp << "</body></html>"
end
output
end
def format_text(text : String, lexer : Lexer) : String
lines = lexer.group_tokens_in_lines(lexer.tokenize(text))
output = String.build do |outp|
if surrounding_pre?
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
end
outp << "<code class=\"#{get_css_class("Background")}\">"
lines.each_with_index(offset: line_number_start - 1) do |line, i|
line_label = line_numbers? ? "#{i + 1}".rjust(4).ljust(5) : ""
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
outp << "<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
line.each do |token|
fragment = "<span class=\"#{get_css_class(token[:type])}\">#{token[:value]}</span>"
outp << fragment
end
end
outp << "</code></pre>"
end
output
end
# ameba:disable Metrics/CyclomaticComplexity
def style_defs : String
output = String.build do |outp|
theme.styles.each do |token, style|
outp << ".#{get_css_class(token)} {"
# These are set or nil
outp << "color: ##{style.color.try &.hex};" if style.color
outp << "background-color: ##{style.background.try &.hex};" if style.background
outp << "border: 1px solid ##{style.border.try &.hex};" if style.border
# These are true/false/nil
outp << "border: none;" if style.border == false
outp << "font-weight: bold;" if style.bold
outp << "font-weight: #{@weight_of_bold};" if style.bold == false
outp << "font-style: italic;" if style.italic
outp << "font-style: normal;" if style.italic == false
outp << "text-decoration: underline;" if style.underline
outp << "text-decoration: none;" if style.underline == false
outp << "tab-size: #{tab_width};" if token == "Background"
outp << "}"
end
end
output
end
# Given a token type, return the CSS class to use.
def get_css_class(token : String) : String
return class_prefix + Abbreviations[token] if theme.styles.has_key?(token)
# Themes don't contain information for each specific
# token type. However, they may contain information
# for a parent style. Worst case, we go to the root
# (Background) style.
class_prefix + Abbreviations[theme.style_parents(token).reverse.find { |parent|
theme.styles.has_key?(parent)
}]
end
# Is this line in the highlighted ranges?
def highlighted?(line : Int) : Bool
highlight_lines.any?(&.includes?(line))
end
end
end
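# Usage sketch for the HTML formatter above, removed in this diff; the
# options shown are keyword arguments of the initializer, values illustrative:
#   html = Tartrazine::Html.new(line_numbers: true, standalone: true,
#     highlight_lines: [3..5])
#   File.write("hello.html", html.format(File.read("hello.cr"), Tartrazine.lexer("crystal")))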

src/formatters/json.cr

@@ -1,11 +0,0 @@
require "../formatter"
module Tartrazine
class Json < Formatter
property name = "json"
def format(text : String, lexer : Lexer, _theme : Theme) : String
lexer.tokenize(text).to_json
end
end
end
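# Sketch of the output: this formatter serializes the token stream directly,
# so the result is a JSON array shaped like (illustrative):
#   [{"type":"Name","value":"puts"},{"type":"Text","value":" "}]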

src/lexer.cr

@@ -1,239 +0,0 @@
require "baked_file_system"
require "./constants/lexers"
module Tartrazine
class LexerFiles
extend BakedFileSystem
bake_folder "../lexers", __DIR__
end
# Get the lexer object for a language name
# FIXME: support mimetypes
def self.lexer(name : String? = nil, filename : String? = nil) : Lexer
if name.nil? && filename.nil?
lexer_file_name = LEXERS_BY_NAME["plaintext"]
elsif name && name != "autodetect"
lexer_file_name = LEXERS_BY_NAME[name.downcase]
else
# Guess by filename
candidates = Set(String).new
LEXERS_BY_FILENAME.each do |k, v|
candidates += v.to_set if File.match?(k, File.basename(filename.to_s))
end
case candidates.size
when 0
lexer_file_name = LEXERS_BY_NAME["plaintext"]
when 1
lexer_file_name = candidates.first
else
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
end
end
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
# Return a list of all lexers
def self.lexers : Array(String)
LEXERS_BY_NAME.keys.sort!
end
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations on what actions and states do
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
class Lexer
property config = {
name: "",
aliases: [] of String,
filenames: [] of String,
mime_types: [] of String,
priority: 0.0,
case_insensitive: false,
dot_all: false,
not_multiline: false,
ensure_nl: false,
}
property xml : String = ""
property states = {} of String => State
property state_stack = ["root"]
# Turn the text into a list of tokens. The `usingself` parameter
# is true when the lexer is being used to tokenize a string
# from a larger text that is already being tokenized.
# So, when it's true, we don't modify the text.
def tokenize(text : String, usingself = false) : Array(Token)
@state_stack = ["root"]
tokens = [] of Token
pos = 0
matched = false
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
text += "\n"
end
text_bytes = text.to_slice
# Loop through the text, applying rules
while pos < text_bytes.size
state = states[@state_stack.last]
# Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
state.rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text_bytes, pos, self)
if matched
# Move position forward, save the tokens,
# tokenize from the new position
# Log.trace { "MATCHED: #{rule.xml}" }
pos = new_pos
tokens += new_tokens
break
end
# Log.trace { "NOT MATCHED: #{rule.xml}" }
end
# If no rule matches, emit an error token
unless matched
if text_bytes[pos] == 10u8
# at EOL, reset state to "root"
tokens << {type: "Text", value: "\n"}
@state_stack = ["root"]
else
tokens << {type: "Error", value: String.new(text_bytes[pos..pos])}
end
pos += 1
end
end
Lexer.collapse_tokens(tokens)
end
# Collapse consecutive tokens of the same type for easier comparison
# and smaller output
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
result = [] of Tartrazine::Token
tokens = tokens.reject { |token| token[:value] == "" }
tokens.each do |token|
if result.empty?
result << token
next
end
last = result.last
if last[:type] == token[:type]
new_token = {type: last[:type], value: last[:value] + token[:value]}
result.pop
result << new_token
else
result << token
end
end
result
end
# Group tokens into lines, splitting them when a newline is found
def group_tokens_in_lines(tokens : Array(Token)) : Array(Array(Token))
split_tokens = [] of Token
tokens.each do |token|
if token[:value].includes?("\n")
values = token[:value].split("\n")
values.each_with_index do |value, index|
value += "\n" if index < values.size - 1
split_tokens << {type: token[:type], value: value}
end
else
split_tokens << token
end
end
lines = [Array(Token).new]
split_tokens.each do |token|
lines.last << token
if token[:value].includes?("\n")
lines << Array(Token).new
end
end
lines
end
# ameba:disable Metrics/CyclomaticComplexity
def self.from_xml(xml : String) : Lexer
l = Lexer.new
l.xml = xml
lexer = XML.parse(xml).first_element_child
if lexer
config = lexer.children.find { |node|
node.name == "config"
}
if config
l.config = {
name: xml_to_s(config, name) || "",
aliases: xml_to_a(config, _alias) || [] of String,
filenames: xml_to_a(config, filename) || [] of String,
mime_types: xml_to_a(config, mime_type) || [] of String,
priority: xml_to_f(config, priority) || 0.0,
not_multiline: xml_to_s(config, not_multiline) == "true",
dot_all: xml_to_s(config, dot_all) == "true",
case_insensitive: xml_to_s(config, case_insensitive) == "true",
ensure_nl: xml_to_s(config, ensure_nl) == "true",
}
end
rules = lexer.children.find { |node|
node.name == "rules"
}
if rules
# Rules contains states 🤷
rules.children.select { |node|
node.name == "state"
}.each do |state_node|
state = State.new
state.name = state_node["name"]
if l.states.has_key?(state.name)
raise Exception.new("Duplicate state: #{state.name}")
else
l.states[state.name] = state
end
# And states contain rules 🤷
state_node.children.select { |node|
node.name == "rule"
}.each do |rule_node|
case rule_node["pattern"]?
when nil
if rule_node.first_element_child.try &.name == "include"
rule = IncludeStateRule.new(rule_node)
else
rule = UnconditionalRule.new(rule_node)
end
else
rule = Rule.new(rule_node,
multiline: !l.config[:not_multiline],
dotall: l.config[:dot_all],
ignorecase: l.config[:case_insensitive])
end
state.rules << rule
end
end
end
end
l
end
end
# A Lexer state. A state has a name and a list of rules.
# The state machine has a state stack containing references
# to states to decide which rules to apply.
class State
property name : String = ""
property rules = [] of Rule
def +(other : State)
new_state = State.new
new_state.name = Random.base58(8)
new_state.rules = rules + other.rules
new_state
end
end
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
end
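# Tokenizing sketch for the Lexer above (the exact token types depend on the
# XML lexer definitions; these are illustrative):
#   lexer = Tartrazine.lexer(filename: "hello.py")
#   lexer.tokenize("x = 1").each do |token|
#     puts "#{token[:type]}: #{token[:value].inspect}"
#   end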

src/main.cr

@@ -1,97 +1,5 @@
require "docopt"
require "./**"
HELP = <<-HELP
tartrazine: a syntax highlighting tool
Usage:
tartrazine (-h, --help)
tartrazine FILE -f html [-t theme][--standalone][--line-numbers]
[-l lexer][-o output]
tartrazine -f html -t theme --css
tartrazine FILE -f terminal [-t theme][-l lexer][--line-numbers]
[-o output]
tartrazine FILE -f json [-o output]
tartrazine --list-themes
tartrazine --list-lexers
tartrazine --list-formatters
tartrazine --version
Options:
-f <formatter> Format to use (html, terminal, json)
-t <theme> Theme to use, see --list-themes [default: default-dark]
-l <lexer> Lexer (language) to use, see --list-lexers [default: autodetect]
-o <output> Output file. Default is stdout.
--standalone Generate a standalone HTML file, which includes
all style information. If not given, it will generate just
a HTML fragment ready to include in your own page.
--css Generate a CSS file for the theme called <theme>.css
--line-numbers Include line numbers in the output
-h, --help Show this screen
-v, --version Show version number
HELP
options = Docopt.docopt(HELP, ARGV)
# Handle version manually
if options["--version"]
puts "tartrazine #{Tartrazine::VERSION}"
exit 0
end
if options["--list-themes"]
puts Tartrazine.themes.join("\n")
exit 0
end
if options["--list-lexers"]
puts Tartrazine.lexers.join("\n")
exit 0
end
if options["--list-formatters"]
puts "html\njson\nterminal"
exit 0
end
theme = Tartrazine.theme(options["-t"].as(String))
if options["-f"]
formatter = options["-f"].as(String)
case formatter
when "html"
formatter = Tartrazine::Html.new
formatter.standalone = options["--standalone"] != nil
formatter.line_numbers = options["--line-numbers"] != nil
formatter.theme = theme
when "terminal"
formatter = Tartrazine::Ansi.new
formatter.line_numbers = options["--line-numbers"] != nil
formatter.theme = theme
when "json"
formatter = Tartrazine::Json.new
else
puts "Invalid formatter: #{formatter}"
exit 1
end
if formatter.is_a?(Tartrazine::Html) && options["--css"]
File.open("#{options["-t"].as(String)}.css", "w") do |outf|
outf.puts formatter.style_defs
end
exit 0
end
lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String))
input = File.open(options["FILE"].as(String)).gets_to_end
output = formatter.format(input, lexer)
if options["-o"].nil?
puts output
else
File.open(options["-o"].as(String), "w") do |outf|
outf.puts output
end
end
end
lexer = Tartrazine.lexer("crystal")
theme = Tartrazine.theme(ARGV[1])
puts Tartrazine::Html.new.format(File.read(ARGV[0]), lexer, theme)
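# With the simplified entry point above, the binary takes a file and a theme
# name positionally, with the lexer hardcoded to "crystal" (hypothetical
# invocation):
#   $ tartrazine hello.cr default-dark > hello.html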

src/re2.cr (new empty file)

src/rules.cr

@@ -1,9 +1,5 @@
require "./cre2/cre2"
require "./actions"
require "./bytes_regex"
require "./formatter"
require "./lexer"
require "./rules"
require "./styles"
# These are lexer rules. They match with the text being parsed
# and perform actions, either emitting tokens or changing the
@@ -11,43 +7,36 @@ require "./styles"
module Tartrazine
# This rule matches via a regex pattern
alias Regex = BytesRegex::Regex
alias Match = BytesRegex::Match
alias MatchData = Array(Match)
class Rule
property pattern : Regex = Regex.new ""
property pattern : Re3 = Re3.new ""
property actions : Array(Action) = [] of Action
property xml : String = "foo"
def match(text : Bytes, pos, lexer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos)
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
matched, matches = pattern.match(text, pos)
# We don't match if the match doesn't move the cursor
# because that causes infinite loops
return false, pos, [] of Token if match.empty? || match[0].size == 0
# p! match, String.new(text[pos..pos+20])
return false, pos, [] of Token if !matched || matches[0].length == 0
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
tokens = [] of Token
# Emit the tokens
actions.each do |action|
# Emit the token
tokens += action.emit(match, lexer)
tokens += action.emit(matches, lexer)
end
Log.trace { "#{xml}, #{pos + match[0].size}, #{tokens}" }
return true, pos + match[0].size, tokens
# Log.trace { "#{xml}, #{match.end}, #{tokens}" }
return true, pos + matches[0].length, tokens
end
def initialize(node : XML::Node, multiline, dotall, ignorecase)
@xml = node.to_s
pattern = node["pattern"]
# flags = Regex::Options::ANCHORED
# MULTILINE implies DOTALL which we don't want, so we
# use in-pattern flag (?m) instead
# flags |= Regex::Options::MULTILINE if multiline
pattern = "(?m)" + pattern if multiline
# flags |= Regex::Options::DOTALL if dotall
# flags |= Regex::Options::IGNORE_CASE if ignorecase
@pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
@pattern = Re3.new(
node["pattern"],
multiline,
dotall,
ignorecase,
anchored: true)
add_actions(node)
end
@@ -89,7 +78,7 @@ module Tartrazine
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
tokens = [] of Token
actions.each do |action|
tokens += action.emit([] of Match, lexer)
tokens += action.emit(Pointer(LibCre2::StringPiece).malloc(1), lexer)
end
return true, pos, tokens
end
@@ -99,4 +88,64 @@ module Tartrazine
add_actions(node)
end
end
# This is a hack to work around Crystal seemingly disallowing
# regexes that are multiline but not dot_all
class Re2 < Regex
@source = "fa"
@options = Regex::Options::None
@jit = true
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
LibPCRE2::UCP
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
flags |= LibPCRE2::ANCHORED if anchored
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
raise Exception.new(error_message)
end
end
end
class Re3
@matches = Pointer(LibCre2::StringPiece).malloc(50)
@opts : LibCre2::Options
@re : LibCre2::CRe2
def group_count
LibCre2.num_capturing_groups(@re)
end
def initialize(pattern : String, multiline = false, dotall = false,
ignorecase = false, anchored = false)
@opts = LibCre2.opt_new
LibCre2.opt_posix_syntax(@opts, false)
LibCre2.opt_longest_match(@opts, false)
# These 3 are ignored when posix_syntax is false
# LibCre2.opt_one_line(@opts, !multiline)
# LibCre2.opt_perl_classes(@opts, true)
# LibCre2.opt_word_boundary(@opts, true)
LibCre2.opt_encoding(@opts, 1)
LibCre2.opt_case_sensitive(@opts, !ignorecase)
LibCre2.opt_dot_nl(@opts, dotall)
pattern = "(?m)#{pattern}" if multiline
@re = LibCre2.new(pattern, pattern.size, @opts)
end
def match(text, pos)
matched = LibCre2.match(@re, text, text.size, pos, text.size,
LibCre2::CRE2_ANCHOR_START, @matches, 50)
return {matched != 0, @matches}
end
end
end
# re2 doesn't support this (should match "x")
# re = Tartrazine::Re3.new("x(?!foo)", multiline: true, dotall: false)
# m = re.match("xfoo", 0)
# p m[0], m[1][0]

src/styles.cr

@@ -1,42 +1,11 @@
require "./actions"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "sixteen"
require "xml"
module Tartrazine
alias Color = Sixteen::Color
class ThemeFiles
extend BakedFileSystem
bake_folder "../styles", __DIR__
end
def self.theme(name : String) : Theme
begin
return Theme.from_base16(name)
rescue ex : Exception
raise ex unless ex.message.try &.includes? "Theme not found"
end
begin
Theme.from_xml(ThemeFiles.get("/#{name}.xml").gets_to_end)
rescue
raise Exception.new("Theme #{name} not found")
end
end
# Return a list of all themes
def self.themes
themes = Set(String).new
ThemeFiles.files.each do |file|
themes << file.path.split("/").last.split(".").first
end
Sixteen::DataFiles.files.each do |file|
themes << file.path.split("/").last.split(".").first
end
themes.to_a.sort!
return Theme.from_base16(name[7..]) if name.starts_with? "base16_"
path = File.join("styles", "#{name}.xml")
Theme.from_xml(File.read(path))
end
class Style
@@ -50,9 +19,9 @@ module Tartrazine
# These properties are either set or nil
# (inherit from parent style)
property background : Color?
property border : Color?
property color : Color?
property background : String?
property border : String?
property color : String?
# Styles are incomplete by default and inherit
# from parents. If this is true, this style
@@ -122,34 +91,33 @@ module Tartrazine
# The color assignments are adapted from
# https://github.com/mohd-akram/base16-pygments/
theme.styles["Background"] = Style.new(color: t["base05"], background: t["base00"], bold: true)
theme.styles["LineHighlight"] = Style.new(color: t["base0D"], background: t["base01"])
theme.styles["Text"] = Style.new(color: t["base05"])
theme.styles["Error"] = Style.new(color: t["base08"])
theme.styles["Comment"] = Style.new(color: t["base03"])
theme.styles["CommentPreproc"] = Style.new(color: t["base0F"])
theme.styles["CommentPreprocFile"] = Style.new(color: t["base0B"])
theme.styles["Keyword"] = Style.new(color: t["base0E"])
theme.styles["KeywordType"] = Style.new(color: t["base08"])
theme.styles["NameAttribute"] = Style.new(color: t["base0D"])
theme.styles["NameBuiltin"] = Style.new(color: t["base08"])
theme.styles["NameBuiltinPseudo"] = Style.new(color: t["base08"])
theme.styles["NameClass"] = Style.new(color: t["base0D"])
theme.styles["NameConstant"] = Style.new(color: t["base09"])
theme.styles["NameDecorator"] = Style.new(color: t["base09"])
theme.styles["NameFunction"] = Style.new(color: t["base0D"])
theme.styles["NameNamespace"] = Style.new(color: t["base0D"])
theme.styles["NameTag"] = Style.new(color: t["base0E"])
theme.styles["NameVariable"] = Style.new(color: t["base0D"])
theme.styles["NameVariableInstance"] = Style.new(color: t["base08"])
theme.styles["LiteralNumber"] = Style.new(color: t["base09"])
theme.styles["Operator"] = Style.new(color: t["base0C"])
theme.styles["OperatorWord"] = Style.new(color: t["base0E"])
theme.styles["Literal"] = Style.new(color: t["base0B"])
theme.styles["LiteralString"] = Style.new(color: t["base0B"])
theme.styles["LiteralStringInterpol"] = Style.new(color: t["base0F"])
theme.styles["LiteralStringRegex"] = Style.new(color: t["base0C"])
theme.styles["LiteralStringSymbol"] = Style.new(color: t["base09"])
theme.styles["Background"] = Style.new(color: t.palette["base05"], background: t.palette["base00"])
theme.styles["Text"] = Style.new(color: t.palette["base05"])
theme.styles["Error"] = Style.new(color: t.palette["base08"])
theme.styles["Comment"] = Style.new(color: t.palette["base03"])
theme.styles["CommentPreproc"] = Style.new(color: t.palette["base0F"])
theme.styles["CommentPreprocFile"] = Style.new(color: t.palette["base0B"])
theme.styles["Keyword"] = Style.new(color: t.palette["base0E"])
theme.styles["KeywordType"] = Style.new(color: t.palette["base08"])
theme.styles["NameAttribute"] = Style.new(color: t.palette["base0D"])
theme.styles["NameBuiltin"] = Style.new(color: t.palette["base08"])
theme.styles["NameBuiltinPseudo"] = Style.new(color: t.palette["base08"])
theme.styles["NameClass"] = Style.new(color: t.palette["base0D"])
theme.styles["NameConstant"] = Style.new(color: t.palette["base09"])
theme.styles["NameDecorator"] = Style.new(color: t.palette["base09"])
theme.styles["NameFunction"] = Style.new(color: t.palette["base0D"])
theme.styles["NameNamespace"] = Style.new(color: t.palette["base0D"])
theme.styles["NameTag"] = Style.new(color: t.palette["base0E"])
theme.styles["NameVariable"] = Style.new(color: t.palette["base0D"])
theme.styles["NameVariableInstance"] = Style.new(color: t.palette["base08"])
theme.styles["LiteralNumber"] = Style.new(color: t.palette["base09"])
theme.styles["Operator"] = Style.new(color: t.palette["base0C"])
theme.styles["OperatorWord"] = Style.new(color: t.palette["base0E"])
theme.styles["Literal"] = Style.new(color: t.palette["base0B"])
theme.styles["LiteralString"] = Style.new(color: t.palette["base0B"])
theme.styles["LiteralStringInterpol"] = Style.new(color: t.palette["base0F"])
theme.styles["LiteralStringRegex"] = Style.new(color: t.palette["base0C"])
theme.styles["LiteralStringSymbol"] = Style.new(color: t.palette["base09"])
theme
end
@@ -176,32 +144,13 @@ module Tartrazine
s.underline = true if style.includes?("underline")
s.underline = false if style.includes?("nounderline")
s.color = style.find(&.starts_with?("#")).try { |v| Color.new v.split("#").last }
s.background = style.find(&.starts_with?("bg:#")).try { |v| Color.new v.split("#").last }
s.border = style.find(&.starts_with?("border:#")).try { |v| Color.new v.split("#").last }
s.color = style.find(&.starts_with?("#")).try &.split("#").last
s.background = style.find(&.starts_with?("bg:#")).try &.split("#").last
s.border = style.find(&.starts_with?("border:#")).try &.split("#").last
theme.styles[node["type"]] = s
end
# We really want a LineHighlight class
if !theme.styles.has_key?("LineHighlight")
theme.styles["LineHighlight"] = Style.new
theme.styles["LineHighlight"].background = make_highlight_color(theme.styles["Background"].background)
theme.styles["LineHighlight"].bold = true
end
theme
end
# If the color is dark, make it brighter and vice versa
def self.make_highlight_color(base_color)
if base_color.nil?
# Who knows
return Color.new(127, 127, 127)
end
if base_color.dark?
base_color.lighter(0.2)
else
base_color.darker(0.2)
end
end
end
end
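# Lookup sketch for Tartrazine.theme above: a name starting with "base16_"
# is routed to Sixteen's base16 data, anything else is read from
# styles/<name>.xml (theme names illustrative):
#   t1 = Tartrazine.theme("base16_solarized-dark")
#   t2 = Tartrazine.theme("default-dark")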

src/tartrazine.cr

@@ -1,9 +1,5 @@
require "./actions"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "baked_file_system"
require "base58"
require "json"
require "log"
@@ -11,9 +7,183 @@ require "xml"
module Tartrazine
extend self
VERSION = {{ `shards version #{__DIR__}`.chomp.stringify }}
VERSION = "0.1.0"
Log = ::Log.for("tartrazine")
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations on what actions and states do
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
# A lexer state. A state has a name and a list of rules.
# The state machine keeps a stack of state names; the state
# on top of the stack decides which rules are applied.
class State
property name : String = ""
property rules = [] of Rule
def +(other : State)
new_state = State.new
new_state.name = Random.base58(8)
new_state.rules = rules + other.rules
new_state
end
end
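# Combining states: `a + b` builds an anonymous state (random 8-char base58
# name) whose rule list is a's rules followed by b's. A quick sketch, assuming
# two states already parsed from a lexer definition:
#
#   merged = states["root"] + states["string"]
#   merged.rules.size # => states["root"].rules.size + states["string"].rules.size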
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
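# e.g. {type: "Keyword", value: "def"}. `Lexer.collapse_tokens` below merges
# runs of tokens that share a type.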
class Lexer
property config = {
name: "",
aliases: [] of String,
filenames: [] of String,
mime_types: [] of String,
priority: 0.0,
case_insensitive: false,
dot_all: false,
not_multiline: false,
ensure_nl: false,
}
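# The three regex-ish options map directly onto how rules are compiled: see
# `from_xml` below, where Rule.new receives multiline: !not_multiline,
# dotall: dot_all and ignorecase: case_insensitive.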
property xml : String = ""
property states = {} of String => State
property state_stack = ["root"]
# Turn the text into a list of tokens. The `usingself` parameter
# is true when the lexer is tokenizing a substring of a larger
# text that is already being tokenized, in which case we leave
# the text unmodified (e.g. no trailing newline is appended).
def tokenize(text, usingself = false) : Array(Token)
@state_stack = ["root"]
tokens = [] of Token
pos = 0
matched = false
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
text += "\n"
end
# Loop through the text, applying rules
while pos < text.size
state = states[@state_stack.last]
Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
state.rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, self)
if matched
# Move position forward, save the tokens,
# tokenize from the new position
Log.trace { "MATCHED: #{rule.xml}" }
pos = new_pos
tokens += new_tokens
break
end
Log.trace { "NOT MATCHED: #{rule.xml}" }
end
# If no rule matches, emit an error token
unless matched
Log.trace { "Error at #{pos}" }
tokens << {type: "Error", value: "#{text[pos]}"}
pos += 1
end
end
Lexer.collapse_tokens(tokens)
end
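# Usage sketch (hypothetical input; `Tartrazine.lexer` at the bottom of this
# file loads lexers/<name>.xml from disk):
#
#   lexer  = Tartrazine.lexer("crystal")
#   tokens = lexer.tokenize(%(puts "hello"))
#   tokens.each { |t| puts "#{t[:type]} -> #{t[:value].inspect}" }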
# Collapse consecutive tokens of the same type for easier comparison
# and smaller output
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
result = [] of Tartrazine::Token
tokens = tokens.reject { |token| token[:value] == "" }
tokens.each do |token|
if result.empty?
result << token
next
end
last = result.last
if last[:type] == token[:type]
new_token = {type: last[:type], value: last[:value] + token[:value]}
result.pop
result << new_token
else
result << token
end
end
result
end
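# For example: empty values are dropped and same-type runs merge.
#
#   collapse_tokens([{type: "Text", value: "a"}, {type: "Text", value: "b"},
#                    {type: "Keyword", value: "def"}])
#   # => [{type: "Text", value: "ab"}, {type: "Keyword", value: "def"}]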
# ameba:disable Metrics/CyclomaticComplexity
def self.from_xml(xml : String) : Lexer
l = Lexer.new
l.xml = xml
lexer = XML.parse(xml).first_element_child
if lexer
config = lexer.children.find { |node|
node.name == "config"
}
if config
l.config = {
name: xml_to_s(config, name) || "",
aliases: xml_to_a(config, _alias) || [] of String,
filenames: xml_to_a(config, filename) || [] of String,
mime_types: xml_to_a(config, mime_type) || [] of String,
priority: xml_to_f(config, priority) || 0.0,
not_multiline: xml_to_s(config, not_multiline) == "true",
dot_all: xml_to_s(config, dot_all) == "true",
case_insensitive: xml_to_s(config, case_insensitive) == "true",
ensure_nl: xml_to_s(config, ensure_nl) == "true",
}
end
rules = lexer.children.find { |node|
node.name == "rules"
}
if rules
# Rules contains states 🤷
rules.children.select { |node|
node.name == "state"
}.each do |state_node|
state = State.new
state.name = state_node["name"]
if l.states.has_key?(state.name)
raise Exception.new("Duplicate state: #{state.name}")
else
l.states[state.name] = state
end
# And states contain rules 🤷
state_node.children.select { |node|
node.name == "rule"
}.each do |rule_node|
case rule_node["pattern"]?
when nil
if rule_node.first_element_child.try &.name == "include"
rule = IncludeStateRule.new(rule_node)
else
rule = UnconditionalRule.new(rule_node)
end
else
rule = Rule.new(rule_node,
multiline: !l.config[:not_multiline],
dotall: l.config[:dot_all],
ignorecase: l.config[:case_insensitive])
end
state.rules << rule
end
end
end
end
l
end
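# The XML this parses mirrors Chroma's lexer serialization; a minimal
# hand-written sketch (not a real lexer file) might look like:
#
#   <lexer>
#     <config><name>mini</name><filename>*.mini</filename></config>
#     <rules>
#       <state name="root">
#         <rule pattern="\d+"><token type="LiteralNumber"/></rule>
#       </state>
#     </rules>
#   </lexer>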
end
def self.lexer(name : String) : Lexer
Lexer.from_xml(File.read("lexers/#{name}.xml"))
end
end
# Convenience macros to parse XML
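# Their definitions are cut off in this diff; purely as a guess from call
# sites like `xml_to_s(config, name)`, one of them might look roughly like:
#
#   macro xml_to_s(node, name)
#     {{node}}.children.find { |n| n.name == {{name.stringify}} }.try &.content.to_s
#   end
#
# (The real macros presumably also handle names such as `_alias`, where the
# leading underscore works around the `alias` keyword.)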

74
styles/base16-snazzy.xml Normal file
View File

@@ -0,0 +1,74 @@
<style name="base16-snazzy">
<entry type="Other" style="#e2e4e5"/>
<entry type="Error" style="#ff5c57"/>
<entry type="Background" style="bg:#282a36"/>
<entry type="Keyword" style="#ff6ac1"/>
<entry type="KeywordConstant" style="#ff6ac1"/>
<entry type="KeywordDeclaration" style="#ff5c57"/>
<entry type="KeywordNamespace" style="#ff6ac1"/>
<entry type="KeywordPseudo" style="#ff6ac1"/>
<entry type="KeywordReserved" style="#ff6ac1"/>
<entry type="KeywordType" style="#9aedfe"/>
<entry type="Name" style="#e2e4e5"/>
<entry type="NameAttribute" style="#57c7ff"/>
<entry type="NameBuiltin" style="#ff5c57"/>
<entry type="NameBuiltinPseudo" style="#e2e4e5"/>
<entry type="NameClass" style="#f3f99d"/>
<entry type="NameConstant" style="#ff9f43"/>
<entry type="NameDecorator" style="#ff9f43"/>
<entry type="NameEntity" style="#e2e4e5"/>
<entry type="NameException" style="#e2e4e5"/>
<entry type="NameFunction" style="#57c7ff"/>
<entry type="NameLabel" style="#ff5c57"/>
<entry type="NameNamespace" style="#e2e4e5"/>
<entry type="NameOther" style="#e2e4e5"/>
<entry type="NameTag" style="#ff6ac1"/>
<entry type="NameVariable" style="#ff5c57"/>
<entry type="NameVariableClass" style="#ff5c57"/>
<entry type="NameVariableGlobal" style="#ff5c57"/>
<entry type="NameVariableInstance" style="#ff5c57"/>
<entry type="Literal" style="#e2e4e5"/>
<entry type="LiteralDate" style="#e2e4e5"/>
<entry type="LiteralString" style="#5af78e"/>
<entry type="LiteralStringBacktick" style="#5af78e"/>
<entry type="LiteralStringChar" style="#5af78e"/>
<entry type="LiteralStringDoc" style="#5af78e"/>
<entry type="LiteralStringDouble" style="#5af78e"/>
<entry type="LiteralStringEscape" style="#5af78e"/>
<entry type="LiteralStringHeredoc" style="#5af78e"/>
<entry type="LiteralStringInterpol" style="#5af78e"/>
<entry type="LiteralStringOther" style="#5af78e"/>
<entry type="LiteralStringRegex" style="#5af78e"/>
<entry type="LiteralStringSingle" style="#5af78e"/>
<entry type="LiteralStringSymbol" style="#5af78e"/>
<entry type="LiteralNumber" style="#ff9f43"/>
<entry type="LiteralNumberBin" style="#ff9f43"/>
<entry type="LiteralNumberFloat" style="#ff9f43"/>
<entry type="LiteralNumberHex" style="#ff9f43"/>
<entry type="LiteralNumberInteger" style="#ff9f43"/>
<entry type="LiteralNumberIntegerLong" style="#ff9f43"/>
<entry type="LiteralNumberOct" style="#ff9f43"/>
<entry type="Operator" style="#ff6ac1"/>
<entry type="OperatorWord" style="#ff6ac1"/>
<entry type="Punctuation" style="#e2e4e5"/>
<entry type="Comment" style="#78787e"/>
<entry type="CommentHashbang" style="#78787e"/>
<entry type="CommentMultiline" style="#78787e"/>
<entry type="CommentSingle" style="#78787e"/>
<entry type="CommentSpecial" style="#78787e"/>
<entry type="CommentPreproc" style="#78787e"/>
<entry type="Generic" style="#e2e4e5"/>
<entry type="GenericDeleted" style="#ff5c57"/>
<entry type="GenericEmph" style="underline #e2e4e5"/>
<entry type="GenericError" style="#ff5c57"/>
<entry type="GenericHeading" style="bold #e2e4e5"/>
<entry type="GenericInserted" style="bold #e2e4e5"/>
<entry type="GenericOutput" style="#43454f"/>
<entry type="GenericPrompt" style="#e2e4e5"/>
<entry type="GenericStrong" style="italic #e2e4e5"/>
<entry type="GenericSubheading" style="bold #e2e4e5"/>
<entry type="GenericTraceback" style="#e2e4e5"/>
<entry type="GenericUnderline" style="underline"/>
<entry type="Text" style="#e2e4e5"/>
<entry type="TextWhitespace" style="#e2e4e5"/>
</style>
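<!-- Style strings mix word flags and colors, matching the style parser earlier
     in this diff: "underline #e2e4e5" sets underline plus foreground #e2e4e5,
     "bg:#282a36" sets the background, and "bold #e2e4e5" combines weight and
     color. -->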