6 Commits

33 changed files with 583 additions and 16317 deletions


@@ -1,5 +1,5 @@
# This configuration file was generated by `ameba --gen-config`
# on 2024-08-12 22:00:49 UTC using Ameba version 1.6.1.
# on 2024-08-04 23:09:09 UTC using Ameba version 1.6.1.
# The point is for the user to remove these configuration records
# one by one as the reported problems are removed from the code base.
@@ -9,7 +9,7 @@ Documentation/DocumentationAdmonition:
Description: Reports documentation admonitions
Timezone: UTC
Excluded:
- src/lexer.cr
- src/tartrazine.cr
- src/actions.cr
Admonitions:
- TODO
@@ -17,105 +17,3 @@ Documentation/DocumentationAdmonition:
- BUG
Enabled: true
Severity: Warning
# Problems found: 22
# Run `ameba --only Lint/MissingBlockArgument` for details
Lint/MissingBlockArgument:
Description: Disallows yielding method definitions without block argument
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 1
# Run `ameba --only Lint/NotNil` for details
Lint/NotNil:
Description: Identifies usage of `not_nil!` calls
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 34
# Run `ameba --only Lint/ShadowingOuterLocalVar` for details
Lint/ShadowingOuterLocalVar:
Description: Disallows the usage of the same name as outer local variables for block
or proc arguments
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 1
# Run `ameba --only Lint/UnreachableCode` for details
Lint/UnreachableCode:
Description: Reports unreachable code
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 6
# Run `ameba --only Lint/UselessAssign` for details
Lint/UselessAssign:
Description: Disallows useless variable assignments
ExcludeTypeDeclarations: false
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Warning
# Problems found: 3
# Run `ameba --only Naming/BlockParameterName` for details
Naming/BlockParameterName:
Description: Disallows non-descriptive block parameter names
MinNameLength: 3
AllowNamesEndingInNumbers: true
Excluded:
- pygments/tests/examplefiles/cr/test.cr
AllowedNames:
- _
- e
- i
- j
- k
- v
- x
- y
- ex
- io
- ws
- op
- tx
- id
- ip
- k1
- k2
- v1
- v2
ForbiddenNames: []
Enabled: true
Severity: Convention
# Problems found: 1
# Run `ameba --only Naming/RescuedExceptionsVariableName` for details
Naming/RescuedExceptionsVariableName:
Description: Makes sure that rescued exceptions variables are named as expected
Excluded:
- pygments/tests/examplefiles/cr/test.cr
AllowedNames:
- e
- ex
- exception
- error
Enabled: true
Severity: Convention
# Problems found: 6
# Run `ameba --only Naming/TypeNames` for details
Naming/TypeNames:
Description: Enforces type names in camelcase manner
Excluded:
- pygments/tests/examplefiles/cr/test.cr
Enabled: true
Severity: Convention

.gitignore (vendored)

@@ -6,6 +6,3 @@
chroma/
pygments/
shard.lock
.vscode/
.crystal/
venv/


@@ -1,15 +0,0 @@
FROM --platform=${TARGETPLATFORM:-linux/amd64} alpine:3.20 AS build
RUN apk add --no-cache \
crystal \
shards \
yaml-dev \
yaml-static \
openssl-dev \
openssl-libs-static \
libxml2-dev \
libxml2-static \
zlib-dev \
zlib-static \
xz-dev \
xz-static \
make


@@ -1,7 +0,0 @@
build: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
shards build -Dstrict_multi_assign -Dno_number_autocast -d --error-trace
release: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
shards build --release
static: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
shards build --release --static
strip bin/tartrazine


@@ -4,17 +4,17 @@ Tartrazine is a library to syntax-highlight code. It is
a port of [Pygments](https://pygments.org/) to
[Crystal](https://crystal-lang.org/). Kind of.
The CLI tool can be used to highlight many things in many styles.
It's not currently usable because it's not finished, but:
* The lexers work for the implemented languages
* The provided styles work
* There is a very very simple HTML formatter
# A port of what? Why "kind of"?
Pygments is a staple of the Python ecosystem, and it's great.
It lets you highlight code in many languages, and it has many
themes. Chroma is "Pygments for Go", it's actually a port of
Pygments to Go, and it's great too.
I wanted that in Crystal, so I started this project. But I did
not read much of the Pygments code. Or much of Chroma's.
Because I did not read the Pygments code. And this is actually
based on [Chroma](https://github.com/alecthomas/chroma) ...
although I did not read that code either.
Chroma has taken most of the Pygments lexers and turned them into
XML descriptions. What I did was take those XML files from Chroma
@@ -31,30 +31,16 @@ is a subset of Pygments'.
Currently Tartrazine supports ... 241 languages.
It has 331 themes (63 from Chroma, the rest are base16 themes via
It has 332 themes (64 from Chroma, the rest are base16 themes via
[Sixteen](https://github.com/ralsina/sixteen))
## Installation
From prebuilt binaries:
This will have a CLI tool that can be installed, but it's not
there yet.
Each release provides statically-linked binaries that should
work on any Linux. Get them from the [releases page](https://github.com/ralsina/tartrazine/releases) and put them in your PATH.
To build from source:
1. Clone this repo
2. Run `make` to build the `tartrazine` binary
3. Copy the binary somewhere in your PATH.
## Usage as a CLI tool
```shell
$ tartrazine whatever.c -l c -t catppuccin-macchiato --line-numbers \
--standalone -o whatever.html
```
## Usage as a Library
## Usage
This works:
@@ -63,9 +49,7 @@ require "tartrazine"
lexer = Tartrazine.lexer("crystal")
theme = Tartrazine.theme("catppuccin-macchiato")
formatter = Tartrazine::Html.new
formatter.theme = theme
puts formatter.format(File.read(ARGV[0]), lexer)
puts Tartrazine::Html.new.format(File.read(ARGV[0]), lexer, theme)
```
## Contributing

TODO.md

@@ -2,12 +2,6 @@
## TODO
* Implement styles
* Implement formatters
* Implement CLI
* ✅ Implement lexer loader that respects aliases
* ✅ Implement lexer loader by file extension
* ✅ Add --line-numbers to terminal formatter
* Implement lexer loader by mime type
* ✅ Implement Delegating lexers
* Add RstLexer maybe others?
* Implement styles
* Implement formatters
* Implement lexer loader that respects aliases, etc


@@ -1,16 +0,0 @@
#!/bin/bash
set -e
docker run --rm --privileged \
multiarch/qemu-user-static \
--reset -p yes
# Build for AMD64
docker build . -f Dockerfile.static -t tartrazine-builder
docker run -ti --rm -v "$PWD":/app --user="$UID" tartrazine-builder /bin/sh -c "cd /app && rm -rf lib shard.lock && make static"
mv bin/tartrazine bin/tartrazine-static-linux-amd64
# Build for ARM64
docker build . -f Dockerfile.static --platform linux/arm64 -t tartrazine-builder
docker run -ti --rm -v "$PWD":/app --platform linux/arm64 --user="$UID" tartrazine-builder /bin/sh -c "cd /app && rm -rf lib shard.lock && make static"
mv bin/tartrazine bin/tartrazine-static-linux-arm64


@@ -1,130 +0,0 @@
<lexer>
<config>
<name>liquid</name>
<alias>liquid</alias>
<filename>*.liquid</filename>
</config>
<rules>
<state name="root">
<rule pattern="[^{]+"><token type="Text"/></rule>
<rule pattern="(\{%)(\s*)"><bygroups><token type="Punctuation"/><token type="TextWhitespace"/></bygroups><push state="tag-or-block"/></rule>
<rule pattern="(\{\{)(\s*)([^\s}]+)"><bygroups><token type="Punctuation"/><token type="TextWhitespace"/><usingself state="generic"/></bygroups><push state="output"/></rule>
<rule pattern="\{"><token type="Text"/></rule>
</state>
<state name="tag-or-block">
<rule pattern="(if|unless|elsif|case)(?=\s+)"><token type="KeywordReserved"/><push state="condition"/></rule>
<rule pattern="(when)(\s+)"><bygroups><token type="KeywordReserved"/><token type="TextWhitespace"/></bygroups><combined state="end-of-block" state="whitespace" state="generic"/></rule>
<rule pattern="(else)(\s*)(%\})"><bygroups><token type="KeywordReserved"/><token type="TextWhitespace"/><token type="Punctuation"/></bygroups><pop depth="1"/></rule>
<rule pattern="(capture)(\s+)([^\s%]+)(\s*)(%\})"><bygroups><token type="NameTag"/><token type="TextWhitespace"/><usingself state="variable"/><token type="TextWhitespace"/><token type="Punctuation"/></bygroups><pop depth="1"/></rule>
<rule pattern="(comment)(\s*)(%\})"><bygroups><token type="NameTag"/><token type="TextWhitespace"/><token type="Punctuation"/></bygroups><push state="comment"/></rule>
<rule pattern="(raw)(\s*)(%\})"><bygroups><token type="NameTag"/><token type="TextWhitespace"/><token type="Punctuation"/></bygroups><push state="raw"/></rule>
<rule pattern="(end(case|unless|if))(\s*)(%\})"><bygroups><token type="KeywordReserved"/>None<token type="TextWhitespace"/><token type="Punctuation"/></bygroups><pop depth="1"/></rule>
<rule pattern="(end([^\s%]+))(\s*)(%\})"><bygroups><token type="NameTag"/>None<token type="TextWhitespace"/><token type="Punctuation"/></bygroups><pop depth="1"/></rule>
<rule pattern="(cycle)(\s+)(?:([^\s:]*)(:))?(\s*)"><bygroups><token type="NameTag"/><token type="TextWhitespace"/><usingself state="generic"/><token type="Punctuation"/><token type="TextWhitespace"/></bygroups><push state="variable-tag-markup"/></rule>
<rule pattern="([^\s%]+)(\s*)"><bygroups><token type="NameTag"/><token type="TextWhitespace"/></bygroups><push state="tag-markup"/></rule>
</state>
<state name="output">
<rule><include state="whitespace"/></rule>
<rule pattern="\}\}"><token type="Punctuation"/><pop depth="1"/></rule>
<rule pattern="\|"><token type="Punctuation"/><push state="filters"/></rule>
</state>
<state name="filters">
<rule><include state="whitespace"/></rule>
<rule pattern="\}\}"><token type="Punctuation"/><push state="#pop" state="#pop"/></rule>
<rule pattern="([^\s|:]+)(:?)(\s*)"><bygroups><token type="NameFunction"/><token type="Punctuation"/><token type="TextWhitespace"/></bygroups><push state="filter-markup"/></rule>
</state>
<state name="filter-markup">
<rule pattern="\|"><token type="Punctuation"/><pop depth="1"/></rule>
<rule><include state="end-of-tag"/></rule>
<rule><include state="default-param-markup"/></rule>
</state>
<state name="condition">
<rule><include state="end-of-block"/></rule>
<rule><include state="whitespace"/></rule>
<rule pattern="([^\s=!&gt;&lt;]+)(\s*)([=!&gt;&lt;]=?)(\s*)(\S+)(\s*)(%\})"><bygroups><usingself state="generic"/><token type="TextWhitespace"/><token type="Operator"/><token type="TextWhitespace"/><usingself state="generic"/><token type="TextWhitespace"/><token type="Punctuation"/></bygroups></rule>
<rule pattern="\b!"><token type="Operator"/></rule>
<rule pattern="\bnot\b"><token type="OperatorWord"/></rule>
<rule pattern="([\w.\&#x27;&quot;]+)(\s+)(contains)(\s+)([\w.\&#x27;&quot;]+)"><bygroups><usingself state="generic"/><token type="TextWhitespace"/><token type="OperatorWord"/><token type="TextWhitespace"/><usingself state="generic"/></bygroups></rule>
<rule><include state="generic"/></rule>
<rule><include state="whitespace"/></rule>
</state>
<state name="generic-value">
<rule><include state="generic"/></rule>
<rule><include state="end-at-whitespace"/></rule>
</state>
<state name="operator">
<rule pattern="(\s*)((=|!|&gt;|&lt;)=?)(\s*)"><bygroups><token type="TextWhitespace"/><token type="Operator"/>None<token type="TextWhitespace"/></bygroups><pop depth="1"/></rule>
<rule pattern="(\s*)(\bcontains\b)(\s*)"><bygroups><token type="TextWhitespace"/><token type="OperatorWord"/><token type="TextWhitespace"/></bygroups><pop depth="1"/></rule>
</state>
<state name="end-of-tag">
<rule pattern="\}\}"><token type="Punctuation"/><pop depth="1"/></rule>
</state>
<state name="end-of-block">
<rule pattern="%\}"><token type="Punctuation"/><push state="#pop" state="#pop"/></rule>
</state>
<state name="end-at-whitespace">
<rule pattern="\s+"><token type="TextWhitespace"/><pop depth="1"/></rule>
</state>
<state name="param-markup">
<rule><include state="whitespace"/></rule>
<rule pattern="([^\s=:]+)(\s*)(=|:)"><bygroups><token type="NameAttribute"/><token type="TextWhitespace"/><token type="Operator"/></bygroups></rule>
<rule pattern="(\{\{)(\s*)([^\s}])(\s*)(\}\})"><bygroups><token type="Punctuation"/><token type="TextWhitespace"/><usingself state="variable"/><token type="TextWhitespace"/><token type="Punctuation"/></bygroups></rule>
<rule><include state="string"/></rule>
<rule><include state="number"/></rule>
<rule><include state="keyword"/></rule>
<rule pattern=","><token type="Punctuation"/></rule>
</state>
<state name="default-param-markup">
<rule><include state="param-markup"/></rule>
<rule pattern="."><token type="Text"/></rule>
</state>
<state name="variable-param-markup">
<rule><include state="param-markup"/></rule>
<rule><include state="variable"/></rule>
<rule pattern="."><token type="Text"/></rule>
</state>
<state name="tag-markup">
<rule pattern="%\}"><token type="Punctuation"/><push state="#pop" state="#pop"/></rule>
<rule><include state="default-param-markup"/></rule>
</state>
<state name="variable-tag-markup">
<rule pattern="%\}"><token type="Punctuation"/><push state="#pop" state="#pop"/></rule>
<rule><include state="variable-param-markup"/></rule>
</state>
<state name="keyword">
<rule pattern="\b(false|true)\b"><token type="KeywordConstant"/></rule>
</state>
<state name="variable">
<rule pattern="[a-zA-Z_]\w*"><token type="NameVariable"/></rule>
<rule pattern="(?&lt;=\w)\.(?=\w)"><token type="Punctuation"/></rule>
</state>
<state name="string">
<rule pattern="&#x27;[^&#x27;]*&#x27;"><token type="LiteralStringSingle"/></rule>
<rule pattern="&quot;[^&quot;]*&quot;"><token type="LiteralStringDouble"/></rule>
</state>
<state name="number">
<rule pattern="\d+\.\d+"><token type="LiteralNumberFloat"/></rule>
<rule pattern="\d+"><token type="LiteralNumberInteger"/></rule>
</state>
<state name="generic">
<rule><include state="keyword"/></rule>
<rule><include state="string"/></rule>
<rule><include state="number"/></rule>
<rule><include state="variable"/></rule>
</state>
<state name="whitespace">
<rule pattern="[ \t]+"><token type="TextWhitespace"/></rule>
</state>
<state name="comment">
<rule pattern="(\{%)(\s*)(endcomment)(\s*)(%\})"><bygroups><token type="Punctuation"/><token type="TextWhitespace"/><token type="NameTag"/><token type="TextWhitespace"/><token type="Punctuation"/></bygroups><push state="#pop" state="#pop"/></rule>
<rule pattern="."><token type="Comment"/></rule>
</state>
<state name="raw">
<rule pattern="[^{]+"><token type="Text"/></rule>
<rule pattern="(\{%)(\s*)(endraw)(\s*)(%\})"><bygroups><token type="Punctuation"/><token type="TextWhitespace"/><token type="NameTag"/><token type="TextWhitespace"/><token type="Punctuation"/></bygroups><pop depth="1"/></rule>
<rule pattern="\{"><token type="Text"/></rule>
</state>
</rules>
</lexer>


@@ -1,47 +0,0 @@
<lexer>
<config>
<name>Twig</name>
<alias>twig</alias>
<mime_type>application/x-twig</mime_type>
<dot_all>true</dot_all>
</config>
<rules>
<state name="root">
<rule pattern="[^{]+"><token type="Other"/></rule>
<rule pattern="\{\{"><token type="CommentPreproc"/><push state="var"/></rule>
<rule pattern="\{\#.*?\#\}"><token type="Comment"/></rule>
<rule pattern="(\{%)(-?\s*)(raw)(\s*-?)(%\})(.*?)(\{%)(-?\s*)(endraw)(\s*-?)(%\})"><bygroups><token type="CommentPreproc"/><token type="Text"/><token type="Keyword"/><token type="Text"/><token type="CommentPreproc"/><token type="Other"/><token type="CommentPreproc"/><token type="Text"/><token type="Keyword"/><token type="Text"/><token type="CommentPreproc"/></bygroups></rule>
<rule pattern="(\{%)(-?\s*)(verbatim)(\s*-?)(%\})(.*?)(\{%)(-?\s*)(endverbatim)(\s*-?)(%\})"><bygroups><token type="CommentPreproc"/><token type="Text"/><token type="Keyword"/><token type="Text"/><token type="CommentPreproc"/><token type="Other"/><token type="CommentPreproc"/><token type="Text"/><token type="Keyword"/><token type="Text"/><token type="CommentPreproc"/></bygroups></rule>
<rule pattern="(\{%)(-?\s*)(filter)(\s+)((?:[\\_a-z]|[^\x00-\x7f])(?:[\\\w-]|[^\x00-\x7f])*)"><bygroups><token type="CommentPreproc"/><token type="Text"/><token type="Keyword"/><token type="Text"/><token type="NameFunction"/></bygroups><push state="tag"/></rule>
<rule pattern="(\{%)(-?\s*)([a-zA-Z_]\w*)"><bygroups><token type="CommentPreproc"/><token type="Text"/><token type="Keyword"/></bygroups><push state="tag"/></rule>
<rule pattern="\{"><token type="Other"/></rule>
</state>
<state name="varnames">
<rule pattern="(\|)(\s*)((?:[\\_a-z]|[^\x00-\x7f])(?:[\\\w-]|[^\x00-\x7f])*)"><bygroups><token type="Operator"/><token type="Text"/><token type="NameFunction"/></bygroups></rule>
<rule pattern="(is)(\s+)(not)?(\s*)((?:[\\_a-z]|[^\x00-\x7f])(?:[\\\w-]|[^\x00-\x7f])*)"><bygroups><token type="Keyword"/><token type="Text"/><token type="Keyword"/><token type="Text"/><token type="NameFunction"/></bygroups></rule>
<rule pattern="(?i)(true|false|none|null)\b"><token type="KeywordPseudo"/></rule>
<rule pattern="(in|not|and|b-and|or|b-or|b-xor|isif|elseif|else|importconstant|defined|divisibleby|empty|even|iterable|odd|sameasmatches|starts\s+with|ends\s+with)\b"><token type="Keyword"/></rule>
<rule pattern="(loop|block|parent)\b"><token type="NameBuiltin"/></rule>
<rule pattern="(?:[\\_a-z]|[^\x00-\x7f])(?:[\\\w-]|[^\x00-\x7f])*"><token type="NameVariable"/></rule>
<rule pattern="\.(?:[\\_a-z]|[^\x00-\x7f])(?:[\\\w-]|[^\x00-\x7f])*"><token type="NameVariable"/></rule>
<rule pattern="\.[0-9]+"><token type="LiteralNumber"/></rule>
<rule pattern=":?&quot;(\\\\|\\[^\\]|[^&quot;\\])*&quot;"><token type="LiteralStringDouble"/></rule>
<rule pattern=":?&#x27;(\\\\|\\[^\\]|[^&#x27;\\])*&#x27;"><token type="LiteralStringSingle"/></rule>
<rule pattern="([{}()\[\]+\-*/,:~%]|\.\.|\?|:|\*\*|\/\/|!=|[&gt;&lt;=]=?)"><token type="Operator"/></rule>
<rule pattern="[0-9](\.[0-9]*)?(eE[+-][0-9])?[flFLdD]?|0[xX][0-9a-fA-F]+[Ll]?"><token type="LiteralNumber"/></rule>
</state>
<state name="var">
<rule pattern="\s+"><token type="Text"/></rule>
<rule pattern="(-?)(\}\})"><bygroups><token type="Text"/><token type="CommentPreproc"/></bygroups><pop depth="1"/></rule>
<rule><include state="varnames"/></rule>
</state>
<state name="tag">
<rule pattern="\s+"><token type="Text"/></rule>
<rule pattern="(-?)(%\})"><bygroups><token type="Text"/><token type="CommentPreproc"/></bygroups><pop depth="1"/></rule>
<rule><include state="varnames"/></rule>
<rule pattern="."><token type="Punctuation"/></rule>
</state>
</rules>
</lexer>


@@ -1,55 +0,0 @@
<lexer>
<config>
<name>Velocity</name>
<alias>velocity</alias>
<filename>*.vm</filename>
<filename>*.fhtml</filename>
<dot_all>true</dot_all>
</config>
<rules>
<state name="root">
<rule pattern="[^{#$]+"><token type="Other"/></rule>
<rule pattern="(#)(\*.*?\*)(#)"><bygroups><token type="CommentPreproc"/><token type="Comment"/><token type="CommentPreproc"/></bygroups></rule>
<rule pattern="(##)(.*?$)"><bygroups><token type="CommentPreproc"/><token type="Comment"/></bygroups></rule>
<rule pattern="(#\{?)([a-zA-Z_]\w*)(\}?)(\s?\()"><bygroups><token type="CommentPreproc"/><token type="NameFunction"/><token type="CommentPreproc"/><token type="Punctuation"/></bygroups><push state="directiveparams"/></rule>
<rule pattern="(#\{?)([a-zA-Z_]\w*)(\}|\b)"><bygroups><token type="CommentPreproc"/><token type="NameFunction"/><token type="CommentPreproc"/></bygroups></rule>
<rule pattern="\$!?\{?"><token type="Punctuation"/><push state="variable"/></rule>
</state>
<state name="variable">
<rule pattern="[a-zA-Z_]\w*"><token type="NameVariable"/></rule>
<rule pattern="\("><token type="Punctuation"/><push state="funcparams"/></rule>
<rule pattern="(\.)([a-zA-Z_]\w*)"><bygroups><token type="Punctuation"/><token type="NameVariable"/></bygroups><push/></rule>
<rule pattern="\}"><token type="Punctuation"/><pop depth="1"/></rule>
<rule><pop depth="1"/></rule>
</state>
<state name="directiveparams">
<rule pattern="(&amp;&amp;|\|\||==?|!=?|[-&lt;&gt;+*%&amp;|^/])|\b(eq|ne|gt|lt|ge|le|not|in)\b"><token type="Operator"/></rule>
<rule pattern="\["><token type="Operator"/><push state="rangeoperator"/></rule>
<rule pattern="\b[a-zA-Z_]\w*\b"><token type="NameFunction"/></rule>
<rule><include state="funcparams"/></rule>
</state>
<state name="rangeoperator">
<rule pattern="\.\."><token type="Operator"/></rule>
<rule><include state="funcparams"/></rule>
<rule pattern="\]"><token type="Operator"/><pop depth="1"/></rule>
</state>
<state name="funcparams">
<rule pattern="\$!?\{?"><token type="Punctuation"/><push state="variable"/></rule>
<rule pattern="\s+"><token type="Text"/></rule>
<rule pattern="[,:]"><token type="Punctuation"/></rule>
<rule pattern="&quot;(\\\\|\\[^\\]|[^&quot;\\])*&quot;"><token type="LiteralStringDouble"/></rule>
<rule pattern="&#x27;(\\\\|\\[^\\]|[^&#x27;\\])*&#x27;"><token type="LiteralStringSingle"/></rule>
<rule pattern="0[xX][0-9a-fA-F]+[Ll]?"><token type="LiteralNumber"/></rule>
<rule pattern="\b[0-9]+\b"><token type="LiteralNumber"/></rule>
<rule pattern="(true|false|null)\b"><token type="KeywordConstant"/></rule>
<rule pattern="\("><token type="Punctuation"/><push/></rule>
<rule pattern="\)"><token type="Punctuation"/><pop depth="1"/></rule>
<rule pattern="\{"><token type="Punctuation"/><push/></rule>
<rule pattern="\}"><token type="Punctuation"/><pop depth="1"/></rule>
<rule pattern="\["><token type="Punctuation"/><push/></rule>
<rule pattern="\]"><token type="Punctuation"/><pop depth="1"/></rule>
</state>
</rules>
</lexer>


@@ -1,22 +0,0 @@
<lexer>
<config>
<name>BBCode</name>
<alias>bbcode</alias>
<mime_type>text/x-bbcode</mime_type>
</config>
<rules>
<state name="root">
<rule pattern="[^[]+"><token type="Text"/></rule>
<rule pattern="\[/?\w+"><token type="Keyword"/><push state="tag"/></rule>
<rule pattern="\["><token type="Text"/></rule>
</state>
<state name="tag">
<rule pattern="\s+"><token type="Text"/></rule>
<rule pattern="(\w+)(=)(&quot;?[^\s&quot;\]]+&quot;?)"><bygroups><token type="NameAttribute"/><token type="Operator"/><token type="LiteralString"/></bygroups></rule>
<rule pattern="(=)(&quot;?[^\s&quot;\]]+&quot;?)"><bygroups><token type="Operator"/><token type="LiteralString"/></bygroups></rule>
<rule pattern="\]"><token type="Keyword"/><pop depth="1"/></rule>
</state>
</rules>
</lexer>


@@ -1,56 +0,0 @@
<lexer>
<config>
<name>Markdown</name>
<alias>markdown</alias>
<alias>md</alias>
<filename>*.md</filename>
<filename>*.markdown</filename>
<mime_type>text/x-markdown</mime_type>
</config>
<rules>
<state name="root">
<rule pattern="(^#[^#].+)(\n)"><bygroups><token type="GenericHeading"/><token type="Text"/></bygroups></rule>
<rule pattern="(^#{2,6}[^#].+)(\n)"><bygroups><token type="GenericSubheading"/><token type="Text"/></bygroups></rule>
<rule pattern="^(.+)(\n)(=+)(\n)"><bygroups><token type="GenericHeading"/><token type="Text"/><token type="GenericHeading"/><token type="Text"/></bygroups></rule>
<rule pattern="^(.+)(\n)(-+)(\n)"><bygroups><token type="GenericSubheading"/><token type="Text"/><token type="GenericSubheading"/><token type="Text"/></bygroups></rule>
<rule pattern="^(\s*)([*-] )(\[[ xX]\])( .+\n)"><bygroups><token type="TextWhitespace"/><token type="Keyword"/><token type="Keyword"/><usingself state="inline"/></bygroups></rule>
<rule pattern="^(\s*)([*-])(\s)(.+\n)"><bygroups><token type="TextWhitespace"/><token type="Keyword"/><token type="TextWhitespace"/><usingself state="inline"/></bygroups></rule>
<rule pattern="^(\s*)([0-9]+\.)( .+\n)"><bygroups><token type="TextWhitespace"/><token type="Keyword"/><usingself state="inline"/></bygroups></rule>
<rule pattern="^(\s*&gt;\s)(.+\n)"><bygroups><token type="Keyword"/><token type="GenericEmph"/></bygroups></rule>
<rule pattern="^(```\n)([\w\W]*?)(^```$)">
<bygroups>
<token type="LiteralStringBacktick"/>
<token type="Text"/>
<token type="LiteralStringBacktick"/>
</bygroups>
</rule>
<rule pattern="^(```)(\w+)(\n)([\w\W]*?)(^```$)">
<bygroups>
<token type="LiteralStringBacktick"/>
<token type="NameLabel"/>
<token type="TextWhitespace"/>
<UsingByGroup lexer="2" content="4"/>
<token type="LiteralStringBacktick"/>
</bygroups>
</rule>
<rule><include state="inline"/></rule>
</state>
<state name="inline">
<rule pattern="\\."><token type="Text"/></rule>
<rule pattern="([^`]?)(`[^`\n]+`)"><bygroups><token type="Text"/><token type="LiteralStringBacktick"/></bygroups></rule>
<rule pattern="([^\*]?)(\*\*[^* \n][^*\n]*\*\*)"><bygroups><token type="Text"/><token type="GenericStrong"/></bygroups></rule>
<rule pattern="([^_]?)(__[^_ \n][^_\n]*__)"><bygroups><token type="Text"/><token type="GenericStrong"/></bygroups></rule>
<rule pattern="([^\*]?)(\*[^* \n][^*\n]*\*)"><bygroups><token type="Text"/><token type="GenericEmph"/></bygroups></rule>
<rule pattern="([^_]?)(_[^_ \n][^_\n]*_)"><bygroups><token type="Text"/><token type="GenericEmph"/></bygroups></rule>
<rule pattern="([^~]?)(~~[^~ \n][^~\n]*~~)"><bygroups><token type="Text"/><token type="GenericDeleted"/></bygroups></rule>
<rule pattern="[@#][\w/:]+"><token type="NameEntity"/></rule>
<rule pattern="(!?\[)([^]]+)(\])(\()([^)]+)(\))"><bygroups><token type="Text"/><token type="NameTag"/><token type="Text"/><token type="Text"/><token type="NameAttribute"/><token type="Text"/></bygroups></rule>
<rule pattern="(\[)([^]]+)(\])(\[)([^]]*)(\])"><bygroups><token type="Text"/><token type="NameTag"/><token type="Text"/><token type="Text"/><token type="NameLabel"/><token type="Text"/></bygroups></rule>
<rule pattern="^(\s*\[)([^]]*)(\]:\s*)(.+)"><bygroups><token type="Text"/><token type="NameLabel"/><token type="Text"/><token type="NameAttribute"/></bygroups></rule>
<rule pattern="[^\\\s]+"><token type="Text"/></rule>
<rule pattern="."><token type="Text"/></rule>
</state>
</rules>
</lexer>


@@ -1,34 +0,0 @@
<lexer>
<config>
<name>MoinMoin/Trac Wiki markup</name>
<alias>trac-wiki</alias>
<alias>moin</alias>
<mime_type>text/x-trac-wiki</mime_type>
<case_insensitive>true</case_insensitive>
</config>
<rules>
<state name="root">
<rule pattern="^#.*$"><token type="Comment"/></rule>
<rule pattern="(!)(\S+)"><bygroups><token type="Keyword"/><token type="Text"/></bygroups></rule>
<rule pattern="^(=+)([^=]+)(=+)(\s*#.+)?$"><bygroups><token type="GenericHeading"/><usingself state="root"/><token type="GenericHeading"/><token type="LiteralString"/></bygroups></rule>
<rule pattern="(\{\{\{)(\n#!.+)?"><bygroups><token type="NameBuiltin"/><token type="NameNamespace"/></bygroups><push state="codeblock"/></rule>
<rule pattern="(\&#x27;\&#x27;\&#x27;?|\|\||`|__|~~|\^|,,|::)"><token type="Comment"/></rule>
<rule pattern="^( +)([.*-])( )"><bygroups><token type="Text"/><token type="NameBuiltin"/><token type="Text"/></bygroups></rule>
<rule pattern="^( +)([a-z]{1,5}\.)( )"><bygroups><token type="Text"/><token type="NameBuiltin"/><token type="Text"/></bygroups></rule>
<rule pattern="\[\[\w+.*?\]\]"><token type="Keyword"/></rule>
<rule pattern="(\[[^\s\]]+)(\s+[^\]]+?)?(\])"><bygroups><token type="Keyword"/><token type="LiteralString"/><token type="Keyword"/></bygroups></rule>
<rule pattern="^----+$"><token type="Keyword"/></rule>
<rule pattern="[^\n\&#x27;\[{!_~^,|]+"><token type="Text"/></rule>
<rule pattern="\n"><token type="Text"/></rule>
<rule pattern="."><token type="Text"/></rule>
</state>
<state name="codeblock">
<rule pattern="\}\}\}"><token type="NameBuiltin"/><pop depth="1"/></rule>
<rule pattern="\{\{\{"><token type="Text"/><push/></rule>
<rule pattern="[^{}]+"><token type="CommentPreproc"/></rule>
<rule pattern="."><token type="CommentPreproc"/></rule>
</state>
</rules>
</lexer>


@@ -1,57 +0,0 @@
# This script parses the metadata of all the lexers and generates
# a datafile with all the information so we don't have to instantiate
# all the lexers to get the information.
import glob
from collections import defaultdict

lexer_by_name = {}
lexer_by_mimetype = defaultdict(set)
lexer_by_filename = defaultdict(set)

for fname in glob.glob("lexers/*.xml"):
    aliases = set([])
    mimetypes = set([])
    filenames = set([])
    print(fname)
    with open(fname) as f:
        lexer_name = fname.split("/")[-1].split(".")[0]
        for line in f:
            if "</config" in line:
                break
            if "<filename>" in line:
                filenames.add(line.split(">")[1].split("<")[0].lower())
            if "<mime_type>" in line:
                mimetypes.add(line.split(">")[1].split("<")[0].lower())
            if "<alias>" in line:
                aliases.add(line.split(">")[1].split("<")[0].lower())
            if "<name>" in line:
                aliases.add(line.split(">")[1].split("<")[0].lower())
    for alias in aliases:
        if alias in lexer_by_name and alias != lexer_by_name[alias]:
            raise Exception(f"Alias {alias} already in use by {lexer_by_name[alias]}")
        lexer_by_name[alias] = lexer_name
    for mimetype in mimetypes:
        lexer_by_mimetype[mimetype] = lexer_name
    for filename in filenames:
        lexer_by_filename[filename].add(lexer_name)

with open("src/constants/lexers.cr", "w") as f:
    f.write("module Tartrazine\n")
    f.write(" LEXERS_BY_NAME = {\n")
    for k in sorted(lexer_by_name.keys()):
        v = lexer_by_name[k]
        f.write(f'"{k}" => "{v}", \n')
    f.write("}\n")
    f.write(" LEXERS_BY_MIMETYPE = {\n")
    for k in sorted(lexer_by_mimetype.keys()):
        v = lexer_by_mimetype[k]
        f.write(f'"{k}" => "{v}", \n')
    f.write("}\n")
    f.write(" LEXERS_BY_FILENAME = {\n")
    for k in sorted(lexer_by_filename.keys()):
        v = lexer_by_filename[k]
        # Quote values for Crystal; a backslash can't appear inside an
        # f-string expression, so do the replace outside the f-string.
        v_str = str(list(v)).replace("'", '"')
        f.write(f'"{k}" => {v_str}, \n')
    f.write("}\n")
    f.write("end\n")


@@ -1,55 +1,24 @@
# Script to generate abbreviations for tokens. Parses all lexers
# and styles files to find all token names and generate a unique
# abbreviation for each one. The abbreviations are generated by
# taking the uppercase letters of the token name and converting
# them to lowercase. If the abbreviation is not unique, the script
# will print a warning and exit.
import sys
import string
import glob

tokens = {"Highlight"}
abbrevs = {"Highlight": "hl"}

# Run it as grep token lexers/* | python scripts/token_abbrevs.py


def abbr(line):
    return "".join(c for c in line if c in string.ascii_uppercase).lower()


def check_abbrevs():
    if len(abbrevs) != len(tokens):
        print("Warning: Abbreviations are not unique")
        print(len(abbrevs), len(tokens))
        sys.exit(1)


abbrevs = {}
tokens = set([])
for line in sys.stdin:
    if "<token" not in line:
        continue
    line = line.strip()
    line = line.split('<token ', 1)[-1]
    line = line.split('"')[1]
    abbrevs[line] = abbr(line)
    tokens.add(line)

# Processes all files in lexers looking for token names
for fname in glob.glob("lexers/*.xml"):
    with open(fname) as f:
        for line in f:
            if "<token" not in line:
                continue
            line = line.strip()
            line = line.split('<token ', 1)[-1]
            line = line.split('"')[1]
            abbrevs[line] = abbr(line)
            tokens.add(line)
check_abbrevs()

# Processes all files in styles looking for token names too
for fname in glob.glob("styles/*.xml"):
    with open(fname) as f:
        for line in f:
            if "<entry" not in line:
                continue
            line = line.strip()
            line = line.split('type=', 1)[-1]
            line = line.split('"')[1]
            abbrevs[line] = abbr(line)
            tokens.add(line)
check_abbrevs()

with open("src/constants/token_abbrevs.cr", "w") as outf:
    outf.write("module Tartrazine\n")
    outf.write(" Abbreviations = {\n")
    for k in sorted(abbrevs.keys()):
        outf.write(f' "{k}" => "{abbrevs[k]}",\n')
    outf.write(" }\nend\n")

print("Abbreviations: {")
for k, v in abbrevs.items():
    print(f' "{k}" => "{v}",')
print("}")


@@ -1,5 +1,5 @@
name: tartrazine
version: 0.6.0
version: 0.1.0
authors:
- Roberto Alsina <roberto.alsina@gmail.com>
@@ -9,14 +9,13 @@ targets:
main: src/main.cr
dependencies:
baked_file_system:
github: schovi/baked_file_system
base58:
github: crystal-china/base58.cr
sixteen:
github: ralsina/sixteen
docopt:
github: chenkovsky/docopt.cr
branch: main
cre2:
git: "https://git.ralsina.me/ralsina/cre2.git"
crystal: ">= 1.13.0"


@@ -14,18 +14,15 @@ unicode_problems = {
"#{__DIR__}/tests/java/test_string_literals.txt",
"#{__DIR__}/tests/json/test_strings.txt",
"#{__DIR__}/tests/systemd/example1.txt",
"#{__DIR__}/tests/c++/test_unicode_identifiers.txt",
}
# These testcases fail because of differences in the way chroma and tartrazine tokenize
# but tartrazine is correct
bad_in_chroma = {
"#{__DIR__}/tests/bash_session/test_comment_after_prompt.txt",
"#{__DIR__}/tests/html/javascript_backtracking.txt",
"#{__DIR__}/tests/java/test_default.txt",
"#{__DIR__}/tests/java/test_multiline_string.txt",
"#{__DIR__}/tests/java/test_numeric_literals.txt",
"#{__DIR__}/tests/octave/test_multilinecomment.txt",
"#{__DIR__}/tests/php/test_string_escaping_run.txt",
"#{__DIR__}/tests/python_2/test_cls_builtin.txt",
}
@@ -33,14 +30,19 @@ bad_in_chroma = {
known_bad = {
"#{__DIR__}/tests/bash_session/fake_ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/prompt_in_output.txt",
"#{__DIR__}/tests/bash_session/ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_ps2.txt",
"#{__DIR__}/tests/bash_session/ps2_prompt.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_ls_no_ps2.txt",
"#{__DIR__}/tests/bash_session/test_virtualenv.txt",
"#{__DIR__}/tests/bash_session/test_newline_in_echo_ps2.txt",
"#{__DIR__}/tests/c/test_string_resembling_decl_end.txt",
"#{__DIR__}/tests/html/css_backtracking.txt",
"#{__DIR__}/tests/mcfunction/data.txt",
"#{__DIR__}/tests/mcfunction/selectors.txt",
"#{__DIR__}/tests/php/anonymous_class.txt",
"#{__DIR__}/tests/html/javascript_unclosed.txt",
}
# Tests that fail because of a limitation in PCRE2
@@ -72,8 +74,8 @@ end
# Helper that creates lexer and tokenizes
def tokenize(lexer_name, text)
tokenizer = Tartrazine.lexer(lexer_name).tokenizer(text)
Tartrazine::Lexer.collapse_tokens(tokenizer.to_a)
lexer = Tartrazine.lexer(lexer_name)
lexer.tokenize(text)
end
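
As an illustration of the `tokenize` helper above (the spec name and input snippet are made up, not taken from this repo's specs):

```crystal
it "tokenizes a crystal snippet" do
  tokens = tokenize("crystal", "puts 1\n")
  # Each token is a NamedTuple(type: String, value: String)
  tokens.should_not be_empty
end
```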
# Helper that tokenizes using chroma to validate the lexer


@@ -1,40 +1,15 @@
require "./actions"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "xml"
# These are Lexer actions. When a rule matches, it will
# perform a list of actions. These actions can emit tokens
# or change the state machine.
module Tartrazine
enum ActionType
Bygroups
Combined
Include
Pop
Push
Token
Using
Usingbygroup
Usingself
end
struct Action
class Action
property type : String
property xml : XML::Node
property actions : Array(Action) = [] of Action
@content_index : Int32 = 0
@depth : Int32 = 0
@lexer_index : Int32 = 0
@lexer_name : String = ""
@states : Array(String) = [] of String
@states_to_push : Array(String) = [] of String
@token_type : String = ""
@type : ActionType = ActionType::Token
def initialize(t : String, xml : XML::Node?)
@type = ActionType.parse(t.capitalize)
def initialize(@type : String, @xml : XML::Node?)
# Some actions may have actions in them, like this:
# <bygroups>
# <token type="GenericPrompt"/>
@@ -44,110 +19,97 @@ module Tartrazine
#
# The token actions match with the first 2 groups in the regex
# the using action matches the 3rd and shunts it to another lexer
xml.children.each do |node|
@xml.children.each do |node|
next unless node.element?
@actions << Action.new(node.name, node)
end
# Prefetch the attributes we need from the XML and keep them
case @type
when ActionType::Token
@token_type = xml["type"]
when ActionType::Push
@states_to_push = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
when ActionType::Pop
@depth = xml["depth"].to_i
when ActionType::Using
@lexer_name = xml["lexer"].downcase
when ActionType::Combined
@states = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
when ActionType::Usingbygroup
@lexer_index = xml["lexer"].to_i
@content_index = xml["content"].to_i
end
end
# ameba:disable Metrics/CyclomaticComplexity
def emit(match : MatchData, tokenizer : Tokenizer, match_group = 0) : Array(Token)
case @type
when ActionType::Token
raise Exception.new "Can't have a token without a match" if match.empty?
[Token.new(type: @token_type, value: String.new(match[match_group].value))]
when ActionType::Push
to_push = @states_to_push.empty? ? [tokenizer.state_stack.last] : @states_to_push
to_push.each do |state|
if state == "#pop" && tokenizer.state_stack.size > 1
def emit(match : MatchData,
lexer : Lexer, match_group = 0) : Array(Token)
case type
when "token"
raise Exception.new "Can't have a token without a match" if match.nil? || match[0].size == 0
[Token.new(type: xml["type"], value: match[0])]
when "push"
states_to_push = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
if states_to_push.empty?
# Push without a state means push the current state
states_to_push = [lexer.state_stack.last]
end
states_to_push.each do |state|
if state == "#pop"
# Pop the state
tokenizer.state_stack.pop
Log.trace { "Popping state" }
lexer.state_stack.pop
else
# Really push
tokenizer.state_stack << state
lexer.state_stack << state
Log.trace { "Pushed #{lexer.state_stack}" }
end
end
[] of Token
when ActionType::Pop
to_pop = [@depth, tokenizer.state_stack.size - 1].min
tokenizer.state_stack.pop(to_pop)
when "pop"
depth = xml["depth"].to_i
Log.trace { "Popping #{depth} states" }
if lexer.state_stack.size <= depth
Log.trace { "Can't pop #{depth} states, only have #{lexer.state_stack.size}" }
else
lexer.state_stack.pop(depth)
end
[] of Token
when ActionType::Bygroups
when "bygroups"
# FIXME: handle
# ><bygroups>
# <token type="Punctuation"/>
# <token type="Punctuation"/>https://github.com/google/re2/wiki/Syntax
# None
# <token type="LiteralStringRegex"/>
#
# where that None means skipping a group
#
raise Exception.new "Can't have a token without a match" if match.nil?
raise Exception.new "Can't have a bygroups without a match" if match.nil? || match[0].size == 0
# Each group matches an action. If the group match is empty,
# the action is skipped.
result = [] of Token
@actions.each_with_index do |e, i|
begin
next if match[i + 1].size == 0
rescue IndexError
# FIXME: This should not actually happen
# No match for this group
next
end
result += e.emit(match, tokenizer, i + 1)
next if match[i].size == 0
result += e.emit(match, lexer, i)
end
result
when ActionType::Using
when "using"
# Shunt to another lexer entirely
return [] of Token if match.empty?
Tartrazine.lexer(@lexer_name).tokenizer(
String.new(match[match_group].value),
secondary: true).to_a
when ActionType::Usingself
return [] of Token if match.nil? || match[0].size == 0
lexer_name = xml["lexer"].downcase
# Log.trace { "to tokenize: #{match[match_group]}" }
to_tokenize = match[match_group]
Tartrazine.lexer(lexer_name).tokenize(to_tokenize, usingself: true)
when "usingself"
# Shunt to another copy of this lexer
return [] of Token if match.empty?
tokenizer.lexer.tokenizer(
String.new(match[match_group].value),
secondary: true).to_a
when ActionType::Combined
# Combine two or more states into one anonymous state
new_state = @states.map { |name|
tokenizer.lexer.states[name]
return [] of Token if match.nil? || match[0].size == 0
new_lexer = Lexer.from_xml(lexer.xml)
# Log.trace { "to tokenize: #{match[match_group]}" }
to_tokenize = match[match_group]
new_lexer.tokenize(to_tokenize, usingself: true)
when "combined"
# Combine two states into one anonymous state
states = xml.attributes.select { |attrib|
attrib.name == "state"
}.map &.content
new_state = states.map { |name|
lexer.states[name]
}.reduce { |state1, state2|
state1 + state2
}
tokenizer.lexer.states[new_state.name] = new_state
tokenizer.state_stack << new_state.name
lexer.states[new_state.name] = new_state
lexer.state_stack << new_state.name
[] of Token
when ActionType::Usingbygroup
# Shunt to content-specified lexer
return [] of Token if match.empty?
Tartrazine.lexer(String.new(match[@lexer_index].value)).tokenizer(
String.new(match[@content_index].value),
secondary: true).to_a
else
raise Exception.new("Unknown action type: #{@type}")
raise Exception.new("Unknown action type: #{type}: #{xml}")
end
end
end


@@ -1,73 +0,0 @@
module BytesRegex
extend self
class Regex
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
flags = LibPCRE2::UTF | LibPCRE2::UCP | LibPCRE2::NO_UTF_CHECK
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
flags |= LibPCRE2::ANCHORED if anchored
if @re = LibPCRE2.compile(
pattern,
pattern.bytesize,
flags,
out errorcode,
out erroroffset,
nil)
else
msg = String.new(256) do |buffer|
bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
{bytesize, 0}
end
raise Exception.new "Error #{msg} compiling regex at offset #{erroroffset}"
end
@match_data = LibPCRE2.match_data_create_from_pattern(@re, nil)
end
def finalize
LibPCRE2.match_data_free(@match_data)
LibPCRE2.code_free(@re)
end
def match(str : Bytes, pos = 0) : Array(Match)
rc = LibPCRE2.match(
@re,
str,
str.size,
pos,
LibPCRE2::NO_UTF_CHECK,
@match_data,
nil)
if rc > 0
ovector = LibPCRE2.get_ovector_pointer(@match_data)
(0...rc).map do |i|
m_start = ovector[2 * i]
m_end = ovector[2 * i + 1]
if m_start == m_end
m_value = Bytes.new(0)
else
m_value = str[m_start...m_end]
end
Match.new(m_value, m_start, m_end - m_start)
end
else
[] of Match
end
end
end
struct Match
property value : Bytes
property start : UInt64
property size : UInt64
def initialize(@value : Bytes, @start : UInt64, @size : UInt64)
end
end
end
# pattern = "foo"
# str = "foo bar"
# re = BytesRegex::Regex.new(pattern)
# p! String.new(re.match(str.to_slice)[0].value)


@@ -1,100 +1,92 @@
module Tartrazine
Abbreviations = {
"Background" => "b",
"CodeLine" => "cl",
"Comment" => "c",
"CommentHashbang" => "ch",
"CommentMultiline" => "cm",
"CommentPreproc" => "cp",
"CommentPreprocFile" => "cpf",
"Text" => "t",
"CommentSingle" => "cs",
"CommentSpecial" => "cs",
"Error" => "e",
"Generic" => "g",
"GenericDeleted" => "gd",
"GenericEmph" => "ge",
"GenericError" => "ge",
"GenericHeading" => "gh",
"GenericInserted" => "gi",
"GenericOutput" => "go",
"GenericPrompt" => "gp",
"GenericStrong" => "gs",
"GenericSubheading" => "gs",
"GenericTraceback" => "gt",
"GenericUnderline" => "gu",
"Highlight" => "hl",
"NameVariable" => "nv",
"Keyword" => "k",
"KeywordConstant" => "kc",
"KeywordDeclaration" => "kd",
"KeywordNamespace" => "kn",
"KeywordPseudo" => "kp",
"KeywordReserved" => "kr",
"KeywordType" => "kt",
"LineHighlight" => "lh",
"LineNumbers" => "ln",
"LineNumbersTable" => "lnt",
"LineTable" => "lt",
"LineTableTD" => "lttd",
"NameFunction" => "nf",
"Punctuation" => "p",
"Operator" => "o",
"LiteralNumberInteger" => "lni",
"NameBuiltin" => "nb",
"Name" => "n",
"OperatorWord" => "ow",
"LiteralStringSingle" => "lss",
"Literal" => "l",
"LiteralDate" => "ld",
"LiteralNumber" => "ln",
"LiteralNumberBin" => "lnb",
"NameClass" => "nc",
"CommentMultiline" => "cm",
"LiteralStringRegex" => "lsr",
"KeywordDeclaration" => "kd",
"KeywordConstant" => "kc",
"NameOther" => "no",
"LiteralNumberFloat" => "lnf",
"LiteralNumberHex" => "lnh",
"LiteralNumberInteger" => "lni",
"LiteralNumberIntegerLong" => "lnil",
"LiteralNumberOct" => "lno",
"LiteralOther" => "lo",
"LiteralString" => "ls",
"LiteralStringAffix" => "lsa",
"LiteralStringAtom" => "lsa",
"LiteralStringBacktick" => "lsb",
"LiteralStringBoolean" => "lsb",
"LiteralStringChar" => "lsc",
"LiteralStringDelimiter" => "lsd",
"LiteralStringDoc" => "lsd",
"LiteralStringDouble" => "lsd",
"LiteralStringEscape" => "lse",
"LiteralStringHeredoc" => "lsh",
"LiteralStringInterpol" => "lsi",
"LiteralStringName" => "lsn",
"LiteralStringOther" => "lso",
"LiteralStringRegex" => "lsr",
"LiteralStringSingle" => "lss",
"LiteralStringSymbol" => "lss",
"Name" => "n",
"NameAttribute" => "na",
"NameBuiltin" => "nb",
"NameBuiltinPseudo" => "nbp",
"NameClass" => "nc",
"NameConstant" => "nc",
"NameDecorator" => "nd",
"NameEntity" => "ne",
"NameException" => "ne",
"NameFunction" => "nf",
"NameFunctionMagic" => "nfm",
"NameKeyword" => "nk",
"NameLabel" => "nl",
"KeywordType" => "kt",
"NameNamespace" => "nn",
"NameOperator" => "no",
"NameOther" => "no",
"NameProperty" => "np",
"NamePseudo" => "np",
"NameTag" => "nt",
"NameVariable" => "nv",
"NameVariableAnonymous" => "nva",
"NameVariableClass" => "nvc",
"NameVariableGlobal" => "nvg",
"NameVariableInstance" => "nvi",
"NameVariableMagic" => "nvm",
"None" => "n",
"Operator" => "o",
"OperatorWord" => "ow",
"Other" => "o",
"Punctuation" => "p",
"Text" => "t",
"TextPunctuation" => "tp",
"TextSymbol" => "ts",
"NameAttribute" => "na",
"KeywordReserved" => "kr",
"CommentPreproc" => "cp",
"KeywordNamespace" => "kn",
"NameConstant" => "nc",
"NameLabel" => "nl",
"LiteralString" => "ls",
"LiteralStringChar" => "lsc",
"TextWhitespace" => "tw",
"LiteralStringEscape" => "lse",
"LiteralNumber" => "ln",
"Other" => "o",
"LiteralStringBoolean" => "lsb",
"NameProperty" => "np",
"Comment" => "c",
"NameTag" => "nt",
"LiteralStringOther" => "lso",
"NameVariableGlobal" => "nvg",
"NameBuiltinPseudo" => "nbp",
"LiteralNumberBin" => "lnb",
"KeywordPseudo" => "kp",
"CommentPreprocFile" => "cpf",
"LiteralStringAffix" => "lsa",
"LiteralStringDelimiter" => "lsd",
"LiteralNumberOct" => "lno",
"Error" => "e",
"Generic" => "g",
"LiteralNumberIntegerLong" => "lnil",
"NameDecorator" => "nd",
"LiteralStringInterpol" => "lsi",
"LiteralStringBacktick" => "lsb",
"GenericPrompt" => "gp",
"GenericOutput" => "go",
"LiteralStringName" => "lsn",
"LiteralStringHeredoc" => "lsh",
"LiteralStringSymbol" => "lss",
"NameVariableInstance" => "nvi",
"LiteralOther" => "lo",
"NameVariableClass" => "nvc",
"NameOperator" => "no",
"None" => "n",
"LiteralStringDoc" => "lsd",
"NameException" => "ne",
"GenericSubheading" => "gs",
"GenericStrong" => "gs",
"GenericDeleted" => "gd",
"GenericInserted" => "gi",
"GenericHeading" => "gh",
"NameEntity" => "ne",
"NamePseudo" => "np",
"CommentHashbang" => "ch",
"TextPunctuation" => "tp",
"NameVariableAnonymous" => "nva",
"NameVariableMagic" => "nvm",
"NameFunctionMagic" => "nfm",
"GenericEmph" => "ge",
"GenericUnderline" => "gu",
"LiteralStringAtom" => "lsa",
"LiteralDate" => "ld",
"GenericError" => "ge",
"TextSymbol" => "ts",
"NameKeyword" => "nk",
}
end

File diff suppressed because it is too large.


@@ -1,28 +1,69 @@
require "./actions"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "colorize"
require "./constants.cr"
require "./styles.cr"
require "./tartrazine.cr"
module Tartrazine
# This is the base class for all formatters.
abstract class Formatter
property name : String = ""
property theme : Theme = Tartrazine.theme("default-dark")
# Format the text using the given lexer.
def format(text : String, lexer : Lexer, io : IO = nil) : Nil
def format(text : String, lexer : Lexer, theme : Theme) : String
raise Exception.new("Not implemented")
end
def format(text : String, lexer : Lexer) : String
raise Exception.new("Not implemented")
# ameba:disable Metrics/CyclomaticComplexity
def get_style_defs(theme : Theme) : String
output = String.build do |outp|
theme.styles.each do |token, style|
outp << ".#{get_css_class(token, theme)} {"
# These are set or nil
outp << "color: #{style.color};" if style.color
outp << "background-color: #{style.background};" if style.background
outp << "border: 1px solid #{style.border};" if style.border
# These are true/false/nil
outp << "border: none;" if style.border == false
outp << "font-weight: bold;" if style.bold
outp << "font-weight: 400;" if style.bold == false
outp << "font-style: italic;" if style.italic
outp << "font-style: normal;" if style.italic == false
outp << "text-decoration: underline;" if style.underline
outp << "text-decoration: none;" if style.underline == false
outp << "}"
end
end
output
end
end
class Html < Formatter
def format(text : String, lexer : Lexer, theme : Theme) : String
output = String.build do |outp|
outp << "<html><head><style>"
outp << get_style_defs(theme)
outp << "</style></head><body>"
outp << "<pre class=\"#{get_css_class("Background", theme)}\"><code class=\"#{get_css_class("Background", theme)}\">"
lexer.tokenize(text).each do |token|
fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
outp << fragment
end
outp << "</code></pre></body></html>"
end
output
end
# Return the styles, if the formatter supports it.
def style_defs : String
raise Exception.new("Not implemented")
# Given a token type, return the CSS class to use.
def get_css_class(token, theme)
return Abbreviations[token] if theme.styles.has_key?(token)
# Themes don't contain information for each specific
# token type. However, they may contain information
# for a parent style. Worst case, we go to the root
# (Background) style.
Abbreviations[theme.style_parents(token).reverse.find { |parent|
theme.styles.has_key?(parent)
}]
end
end
end
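
Putting the new single-call API from this diff together (same shape as the README example above; the input path is a placeholder):

```crystal
require "tartrazine"

lexer = Tartrazine.lexer("crystal")
theme = Tartrazine.theme("catppuccin-macchiato")
# "example.cr" is a placeholder input file
puts Tartrazine::Html.new.format(File.read("example.cr"), lexer, theme)
```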


@@ -1,56 +0,0 @@
require "../formatter"
module Tartrazine
class Ansi < Formatter
property? line_numbers : Bool = false
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), @line_numbers : Bool = false)
end
private def line_label(i : Int32) : String
"#{i + 1}".rjust(4).ljust(5)
end
def format(text : String, lexer : Lexer) : String
outp = String::Builder.new("")
format(text, lexer, outp)
outp.to_s
end
def format(text : String, lexer : BaseLexer, outp : IO) : Nil
tokenizer = lexer.tokenizer(text)
i = 0
outp << line_label(i) if line_numbers?
tokenizer.each do |token|
outp << colorize(token[:value], token[:type])
if token[:value].includes?("\n")
i += 1
outp << line_label(i) if line_numbers?
end
end
end
def colorize(text : String, token : String) : String
style = theme.styles.fetch(token, nil)
return text if style.nil?
if theme.styles.has_key?(token)
s = theme.styles[token]
else
# Themes don't contain information for each specific
# token type. However, they may contain information
# for a parent style. Worst case, we go to the root
# (Background) style.
s = theme.styles[theme.style_parents(token).reverse.find { |parent|
theme.styles.has_key?(parent)
}]
end
colorized = text.colorize
s.color.try { |col| colorized = colorized.fore(col.colorize) }
# Intentionally not setting background color
colorized.mode(:bold) if s.bold
colorized.mode(:italic) if s.italic
colorized.mode(:underline) if s.underline
colorized.to_s
end
end
end
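
A sketch of how the Ansi formatter deleted above was used (the file name is a placeholder):

```crystal
# Terminal output with line numbers; the theme defaults to "default-dark"
formatter = Tartrazine::Ansi.new(line_numbers: true)
puts formatter.format(File.read("example.cr"), Tartrazine.lexer("crystal"))
```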


@@ -1,132 +0,0 @@
require "../constants/token_abbrevs.cr"
require "../formatter"
require "html"
module Tartrazine
class Html < Formatter
# property line_number_in_table : Bool = false
# property with_classes : Bool = true
property class_prefix : String = ""
property highlight_lines : Array(Range(Int32, Int32)) = [] of Range(Int32, Int32)
property line_number_id_prefix : String = "line-"
property line_number_start : Int32 = 1
property tab_width = 8
property? line_numbers : Bool = false
property? linkable_line_numbers : Bool = true
property? standalone : Bool = false
property? surrounding_pre : Bool = true
property? wrap_long_lines : Bool = false
property weight_of_bold : Int32 = 600
property theme : Theme
def initialize(@theme : Theme = Tartrazine.theme("default-dark"), *,
@highlight_lines = [] of Range(Int32, Int32),
@class_prefix : String = "",
@line_number_id_prefix = "line-",
@line_number_start = 1,
@tab_width = 8,
@line_numbers : Bool = false,
@linkable_line_numbers : Bool = true,
@standalone : Bool = false,
@surrounding_pre : Bool = true,
@wrap_long_lines : Bool = false,
@weight_of_bold : Int32 = 600)
end
def format(text : String, lexer : Lexer) : String
outp = String::Builder.new("")
format(text, lexer, outp)
outp.to_s
end
def format(text : String, lexer : BaseLexer, io : IO) : Nil
pre, post = wrap_standalone
io << pre if standalone?
format_text(text, lexer, io)
io << post if standalone?
end
# Wrap text into a full HTML document, including the CSS for the theme
def wrap_standalone
output = String.build do |outp|
outp << "<!DOCTYPE html><html><head><style>"
outp << style_defs
outp << "</style></head><body>"
end
{output.to_s, "</body></html>"}
end
private def line_label(i : Int32) : String
line_label = "#{i + 1}".rjust(4).ljust(5)
line_class = highlighted?(i + 1) ? "class=\"#{get_css_class("LineHighlight")}\"" : ""
line_id = linkable_line_numbers? ? "id=\"#{line_number_id_prefix}#{i + 1}\"" : ""
"<span #{line_id} #{line_class} style=\"user-select: none;\">#{line_label} </span>"
end
def format_text(text : String, lexer : BaseLexer, outp : IO)
tokenizer = lexer.tokenizer(text)
i = 0
if surrounding_pre?
pre_style = wrap_long_lines? ? "style=\"white-space: pre-wrap; word-break: break-word;\"" : ""
outp << "<pre class=\"#{get_css_class("Background")}\" #{pre_style}>"
end
outp << "<code class=\"#{get_css_class("Background")}\">"
outp << line_label(i) if line_numbers?
tokenizer.each do |token|
outp << "<span class=\"#{get_css_class(token[:type])}\">#{HTML.escape(token[:value])}</span>"
if token[:value].ends_with? "\n"
i += 1
outp << line_label(i) if line_numbers?
end
end
outp << "</code></pre>"
end
# ameba:disable Metrics/CyclomaticComplexity
def style_defs : String
output = String.build do |outp|
theme.styles.each do |token, style|
outp << ".#{get_css_class(token)} {"
# These are set or nil
outp << "color: ##{style.color.try &.hex};" if style.color
outp << "background-color: ##{style.background.try &.hex};" if style.background
outp << "border: 1px solid ##{style.border.try &.hex};" if style.border
# These are true/false/nil
outp << "border: none;" if style.border == false
outp << "font-weight: bold;" if style.bold
outp << "font-weight: #{@weight_of_bold};" if style.bold == false
outp << "font-style: italic;" if style.italic
outp << "font-style: normal;" if style.italic == false
outp << "text-decoration: underline;" if style.underline
outp << "text-decoration: none;" if style.underline == false
outp << "tab-size: #{tab_width};" if token == "Background"
outp << "}"
end
end
output
end
# Given a token type, return the CSS class to use.
def get_css_class(token : String) : String
if !theme.styles.has_key? token
# Themes don't contain information for each specific
# token type. However, they may contain information
# for a parent style. Worst case, we go to the root
# (Background) style.
parent = theme.style_parents(token).reverse.find { |dad|
theme.styles.has_key?(dad)
}
theme.styles[token] = theme.styles[parent]
end
class_prefix + Abbreviations[token]
end
# Is this line in the highlighted ranges?
def highlighted?(line : Int) : Bool
highlight_lines.any?(&.includes?(line))
end
end
end
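
A sketch exercising the options of the Html formatter deleted above (all values are illustrative):

```crystal
formatter = Tartrazine::Html.new(
  theme: Tartrazine.theme("catppuccin-macchiato"),
  line_numbers: true,
  standalone: true,
  highlight_lines: [3..5],
)
# "example.cr" and "out.html" are placeholders
File.write("out.html", formatter.format(File.read("example.cr"), Tartrazine.lexer("crystal")))
```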


@@ -1,18 +0,0 @@
require "../formatter"
module Tartrazine
class Json < Formatter
property name = "json"
def format(text : String, lexer : BaseLexer) : String
outp = String::Builder.new("")
format(text, lexer, outp)
outp.to_s
end
def format(text : String, lexer : BaseLexer, io : IO) : Nil
tokenizer = lexer.tokenizer(text)
io << Tartrazine::Lexer.collapse_tokens(tokenizer.to_a).to_json
end
end
end
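
And a sketch for the Json formatter deleted above, which emits the collapsed token stream as JSON (the file name is a placeholder):

```crystal
puts Tartrazine::Json.new.format(File.read("example.cr"), Tartrazine.lexer("crystal"))
```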


@@ -1,301 +0,0 @@
require "baked_file_system"
require "./constants/lexers"
module Tartrazine
class LexerFiles
extend BakedFileSystem
bake_folder "../lexers", __DIR__
end
# Get the lexer object for a language name
# FIXME: support mimetypes
def self.lexer(name : String? = nil, filename : String? = nil) : BaseLexer
return lexer_by_name(name) if name && name != "autodetect"
return lexer_by_filename(filename) if filename
Lexer.from_xml(LexerFiles.get("/#{LEXERS_BY_NAME["plaintext"]}.xml").gets_to_end)
end
private def self.lexer_by_name(name : String) : BaseLexer
lexer_file_name = LEXERS_BY_NAME.fetch(name.downcase, nil)
return create_delegating_lexer(name) if lexer_file_name.nil? && name.includes? "+"
raise Exception.new("Unknown lexer: #{name}") if lexer_file_name.nil?
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
private def self.lexer_by_filename(filename : String) : BaseLexer
candidates = Set(String).new
LEXERS_BY_FILENAME.each do |k, v|
candidates += v.to_set if File.match?(k, File.basename(filename))
end
case candidates.size
when 0
lexer_file_name = LEXERS_BY_NAME["plaintext"]
when 1
lexer_file_name = candidates.first
else
raise Exception.new("Multiple lexers match the filename: #{candidates.to_a.join(", ")}")
end
Lexer.from_xml(LexerFiles.get("/#{lexer_file_name}.xml").gets_to_end)
end
private def self.create_delegating_lexer(name : String) : BaseLexer
language, root = name.split("+", 2)
language_lexer = lexer(language)
root_lexer = lexer(root)
DelegatingLexer.new(language_lexer, root_lexer)
end
# Return a list of all lexers
def self.lexers : Array(String)
LEXERS_BY_NAME.keys.sort!
end
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
abstract class BaseTokenizer
end
class Tokenizer < BaseTokenizer
include Iterator(Token)
property lexer : BaseLexer
property text : Bytes
property pos : Int32 = 0
@dq = Deque(Token).new
property state_stack = ["root"]
def initialize(@lexer : BaseLexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
end
@text = text.to_slice
end
def next : Iterator::Stop | Token
if @dq.size > 0
return @dq.shift
end
if pos == @text.size
return stop
end
matched = false
while @pos < @text.size
@lexer.states[@state_stack.last].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(@text, @pos, self)
if matched
@pos = new_pos
split_tokens(new_tokens).each { |token| @dq << token }
break
end
end
if !matched
if @text[@pos] == 10u8
@dq << {type: "Text", value: "\n"}
@state_stack = ["root"]
else
@dq << {type: "Error", value: String.new(@text[@pos..@pos])}
end
@pos += 1
break
end
end
self.next
end
# If a token contains a newline, split it into two tokens
def split_tokens(tokens : Array(Token)) : Array(Token)
split_tokens = [] of Token
tokens.each do |token|
if token[:value].includes?("\n")
values = token[:value].split("\n")
values.each_with_index do |value, index|
value += "\n" if index < values.size - 1
split_tokens << {type: token[:type], value: value}
end
else
split_tokens << token
end
end
split_tokens
end
end
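Because of `split_tokens`, every token the iterator yields spans at most one line, with any newline at the end of the value; that is what lets formatters do per-line work such as line numbers and highlights. A sketch:

tokenizer = Tartrazine.lexer("crystal").tokenizer("a = 1\nb = 2\n")
tokenizer.each { |token| p token } # each value holds at most one, trailing, "\n"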
abstract class BaseLexer
property config = {
name: "",
priority: 0.0,
case_insensitive: false,
dot_all: false,
not_multiline: false,
ensure_nl: false,
}
property states = {} of String => State
def tokenizer(text : String, secondary = false) : BaseTokenizer
Tokenizer.new(self, text, secondary)
end
end
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations of what actions and states do,
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
class Lexer < BaseLexer
# Collapse consecutive tokens of the same type for easier comparison
# and smaller output
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
result = [] of Tartrazine::Token
tokens = tokens.reject { |token| token[:value] == "" }
tokens.each do |token|
if result.empty?
result << token
next
end
last = result.last
if last[:type] == token[:type]
new_token = {type: last[:type], value: last[:value] + token[:value]}
result.pop
result << new_token
else
result << token
end
end
result
end
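# For example (hypothetical input):
#   collapse_tokens([{type: "Text", value: "a"}, {type: "Text", value: "b"},
#                    {type: "Text", value: ""}])
#   # => [{type: "Text", value: "ab"}] -- empties dropped, equal types merged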
def self.from_xml(xml : String) : Lexer
l = Lexer.new
lexer = XML.parse(xml).first_element_child
if lexer
config = lexer.children.find { |node|
node.name == "config"
}
if config
l.config = {
name: xml_to_s(config, name) || "",
priority: xml_to_f(config, priority) || 0.0,
not_multiline: xml_to_s(config, not_multiline) == "true",
dot_all: xml_to_s(config, dot_all) == "true",
case_insensitive: xml_to_s(config, case_insensitive) == "true",
ensure_nl: xml_to_s(config, ensure_nl) == "true",
}
end
rules = lexer.children.find { |node|
node.name == "rules"
}
if rules
# Rules contains states 🤷
rules.children.select { |node|
node.name == "state"
}.each do |state_node|
state = State.new
state.name = state_node["name"]
if l.states.has_key?(state.name)
raise Exception.new("Duplicate state: #{state.name}")
else
l.states[state.name] = state
end
# And states contain rules 🤷
state_node.children.select { |node|
node.name == "rule"
}.each do |rule_node|
case rule_node["pattern"]?
when nil
if rule_node.first_element_child.try &.name == "include"
rule = IncludeStateRule.new(rule_node)
else
rule = UnconditionalRule.new(rule_node)
end
else
rule = Rule.new(rule_node,
multiline: !l.config[:not_multiline],
dotall: l.config[:dot_all],
ignorecase: l.config[:case_insensitive])
end
state.rules << rule
end
end
end
end
l
end
end
# A lexer that takes two lexers as arguments: a root lexer
# and a language lexer. Everything is scanned using the
# language lexer; afterwards, all `Other` tokens are lexed
# using the root lexer.
#
# This is useful for things like template languages, where
# you have Jinja + HTML or Jinja + CSS and so on.
class DelegatingLexer < BaseLexer
property language_lexer : BaseLexer
property root_lexer : BaseLexer
def initialize(@language_lexer : BaseLexer, @root_lexer : BaseLexer)
end
def tokenizer(text : String, secondary = false) : DelegatingTokenizer
DelegatingTokenizer.new(self, text, secondary)
end
end
# This Tokenizer works with a DelegatingLexer. It first tokenizes
# using the language lexer, and "Other" tokens are tokenized using
# the root lexer.
class DelegatingTokenizer < BaseTokenizer
include Iterator(Token)
@dq = Deque(Token).new
@language_tokenizer : BaseTokenizer
def initialize(@lexer : DelegatingLexer, text : String, secondary = false)
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && @lexer.config[:ensure_nl] && !secondary
text += "\n"
end
@language_tokenizer = @lexer.language_lexer.tokenizer(text, true)
end
def next : Iterator::Stop | Token
if @dq.size > 0
return @dq.shift
end
token = @language_tokenizer.next
if token.is_a? Iterator::Stop
return stop
elsif token.as(Token).[:type] == "Other"
root_tokenizer = @lexer.root_lexer.tokenizer(token.as(Token).[:value], true)
root_tokenizer.each do |root_token|
@dq << root_token
end
else
@dq << token.as(Token)
end
self.next
end
end
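A hedged sketch of the delegation flow; the "jinja+html" combination is hypothetical but matches the `language+root` naming handled by `create_delegating_lexer` above:

lexer = Tartrazine.lexer("jinja+html")
lexer.tokenizer("{{ name }}<b>hi</b>").each do |token|
  p token # template tokens; "Other" spans re-lexed by the HTML root lexer
end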
# A Lexer state. A state has a name and a list of rules.
# The state machine has a state stack containing references
# to states to decide which rules to apply.
struct State
property name : String = ""
property rules = [] of BaseRule
def +(other : State)
new_state = State.new
new_state.name = Random.base58(8)
new_state.rules = rules + other.rules
new_state
end
end
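# Example: states["root"] + states["string"] yields a state holding both rule
# lists under a fresh Random.base58(8) name (presumably for combined states).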
end


@@ -1,96 +1,5 @@
require "docopt"
require "./**"
HELP = <<-HELP
tartrazine: a syntax highlighting tool
Usage:
tartrazine (-h, --help)
tartrazine FILE -f html [-t theme][--standalone][--line-numbers]
[-l lexer][-o output]
tartrazine -f html -t theme --css
tartrazine FILE -f terminal [-t theme][-l lexer][--line-numbers]
[-o output]
tartrazine FILE -f json [-o output]
tartrazine --list-themes
tartrazine --list-lexers
tartrazine --list-formatters
tartrazine --version
Options:
-f <formatter> Format to use (html, terminal, json)
-t <theme> Theme to use, see --list-themes [default: default-dark]
-l <lexer> Lexer (language) to use, see --list-lexers [default: autodetect]
-o <output> Output file. Default is stdout.
--standalone Generate a standalone HTML file, which includes
all style information. If not given, it will generate just
an HTML fragment ready to include in your own page.
--css Generate a CSS file for the theme called <theme>.css
--line-numbers Include line numbers in the output
-h, --help Show this screen
-v, --version Show version number
HELP
options = Docopt.docopt(HELP, ARGV)
# Handle version manually
if options["--version"]
puts "tartrazine #{Tartrazine::VERSION}"
exit 0
end
if options["--list-themes"]
puts Tartrazine.themes.join("\n")
exit 0
end
if options["--list-lexers"]
puts Tartrazine.lexers.join("\n")
exit 0
end
if options["--list-formatters"]
puts "html\njson\nterminal"
exit 0
end
theme = Tartrazine.theme(options["-t"].as(String))
if options["-f"]
formatter = options["-f"].as(String)
case formatter
when "html"
formatter = Tartrazine::Html.new
formatter.standalone = options["--standalone"] != nil
formatter.line_numbers = options["--line-numbers"] != nil
formatter.theme = theme
when "terminal"
formatter = Tartrazine::Ansi.new
formatter.line_numbers = options["--line-numbers"] != nil
formatter.theme = theme
when "json"
formatter = Tartrazine::Json.new
else
puts "Invalid formatter: #{formatter}"
exit 1
end
if formatter.is_a?(Tartrazine::Html) && options["--css"]
File.open("#{options["-t"].as(String)}.css", "w") do |outf|
outf << formatter.style_defs
end
exit 0
end
lexer = Tartrazine.lexer(name: options["-l"].as(String), filename: options["FILE"].as(String))
input = File.open(options["FILE"].as(String)).gets_to_end
if options["-o"].nil?
outf = STDOUT
else
outf = File.open(options["-o"].as(String), "w")
end
formatter.format(input, lexer, outf)
outf.close
end
lexer = Tartrazine.lexer("crystal")
theme = Tartrazine.theme(ARGV[1])
puts Tartrazine::Html.new.format(File.read(ARGV[0]), lexer, theme)

src/re2.cr Normal file

@@ -1,9 +1,5 @@
require "./actions"
require "./bytes_regex"
require "./formatter"
require "./lexer"
require "./rules"
require "./styles"
# require "cre2"
# These are lexer rules. They match with the text being parsed
# and perform actions, either emitting tokens or changing the
@@ -11,15 +7,41 @@ require "./styles"
module Tartrazine
# This rule matches via a regex pattern
alias Regex = BytesRegex::Regex
alias Match = BytesRegex::Match
alias MatchData = Array(Match)
# alias Regex = CRe2::Regex
# alias MatchData = CRe2::MatchDataLike | Regex::MatchData | Nil
alias MatchData = Regex::MatchData | Nil
abstract struct BaseRule
abstract def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
abstract def initialize(node : XML::Node)
class Rule
property pattern : Regex = Regex.new ""
property actions : Array(Action) = [] of Action
property xml : String = "foo"
@actions : Array(Action) = [] of Action
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos)
# We don't match if the match doesn't move the cursor
# because that causes infinite loops
return false, pos, [] of Token if match.nil?
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
tokens = [] of Token
# Emit the tokens
actions.each do |action|
# Emit the token
tokens += action.emit(match, lexer)
end
# Log.trace { "#{xml}, #{match.end}, #{tokens}" }
return true, match[0].size, tokens
end
def initialize(node : XML::Node, multiline, dotall, ignorecase)
@xml = node.to_s
options = Regex::Options::ANCHORED
options |= Regex::Options::MULTILINE if multiline
options |= Regex::Options::DOTALL if dotall
options |= Regex::Options::IGNORE_CASE if ignorecase
@pattern = Regex.new(node["pattern"], options)
add_actions(node)
end
def add_actions(node : XML::Node)
node.children.each do |child|
@@ -29,42 +51,23 @@ module Tartrazine
end
end
struct Rule < BaseRule
property pattern : Regex = Regex.new ""
def match(text : Bytes, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
match = pattern.match(text, pos)
# No match
return false, pos, [] of Token if match.size == 0
return true, pos + match[0].size, @actions.flat_map(&.emit(match, tokenizer))
end
def initialize(node : XML::Node)
end
def initialize(node : XML::Node, multiline, dotall, ignorecase)
pattern = node["pattern"]
pattern = "(?m)" + pattern if multiline
@pattern = Regex.new(pattern, multiline, dotall, ignorecase, true)
add_actions(node)
end
end
# This rule includes another state. If any of the rules of the
# included state matches, this rule matches.
struct IncludeStateRule < BaseRule
@state : String = ""
class IncludeStateRule < Rule
property state : String = ""
def match(text : Bytes, pos : Int32, tokenizer : Tokenizer) : Tuple(Bool, Int32, Array(Token))
tokenizer.@lexer.states[@state].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, tokenizer)
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
Log.trace { "Including state #{state} from #{lexer.state_stack.last}" }
lexer.states[state].rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, lexer)
Log.trace { "#{xml}, #{new_pos}, #{new_tokens}" } if matched
return true, new_pos, new_tokens if matched
end
return false, pos, [] of Token
end
def initialize(node : XML::Node)
@xml = node.to_s
include_node = node.children.find { |child|
child.name == "include"
}
@@ -74,14 +77,17 @@ module Tartrazine
end
# This rule always matches, unconditionally
struct UnconditionalRule < BaseRule
NO_MATCH = [] of Match
def match(text, pos, tokenizer) : Tuple(Bool, Int32, Array(Token))
return true, pos, @actions.flat_map(&.emit(NO_MATCH, tokenizer))
class UnconditionalRule < Rule
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
tokens = [] of Token
actions.each do |action|
tokens += action.emit(nil, lexer)
end
return true, pos, tokens
end
def initialize(node : XML::Node)
@xml = node.to_s
add_actions(node)
end
end
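# Illustrative (element names assumed from Chroma's XML serialization): a node
# like <rule pattern="\d+"><token type="LiteralNumber"/></rule> becomes a Rule;
# an UnconditionalRule has no pattern and only runs its actions.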


@@ -1,45 +1,14 @@
require "./actions"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "sixteen"
require "xml"
module Tartrazine
alias Color = Sixteen::Color
struct ThemeFiles
extend BakedFileSystem
bake_folder "../styles", __DIR__
end
def self.theme(name : String) : Theme
begin
return Theme.from_base16(name)
rescue ex : Exception
raise ex unless ex.message.try &.includes? "Theme not found"
end
begin
Theme.from_xml(ThemeFiles.get("/#{name}.xml").gets_to_end)
rescue
raise Exception.new("Theme #{name} not found")
end
return Theme.from_base16(name[7..]) if name.starts_with? "base16_"
path = File.join("styles", "#{name}.xml")
Theme.from_xml(File.read(path))
end
# Return a list of all themes
def self.themes
themes = Set(String).new
ThemeFiles.files.each do |file|
themes << file.path.split("/").last.split(".").first
end
Sixteen::DataFiles.files.each do |file|
themes << file.path.split("/").last.split(".").first
end
themes.to_a.sort!
end
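# Sketch of the lookup above (theme names are assumptions):
#   Tartrazine.theme("base16-snazzy") # tries Sixteen's base16 set, then baked XML
#   Tartrazine.themes                 # names from both sources, sorted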
struct Style
class Style
# These properties are tri-state.
# true means it's set
# false means it's not set
@@ -50,9 +19,9 @@ module Tartrazine
# These properties are either set or nil
# (inherit from parent style)
property background : Color?
property border : Color?
property color : Color?
property background : String?
property border : String?
property color : String?
# Styles are incomplete by default and inherit
# from parents. If this is true, this style
@@ -79,7 +48,7 @@ module Tartrazine
end
end
struct Theme
class Theme
property name : String = ""
property styles = {} of String => Style
@@ -122,34 +91,33 @@ module Tartrazine
# The color assignments are adapted from
# https://github.com/mohd-akram/base16-pygments/
theme.styles["Background"] = Style.new(color: t["base05"], background: t["base00"], bold: true)
theme.styles["LineHighlight"] = Style.new(color: t["base0D"], background: t["base01"])
theme.styles["Text"] = Style.new(color: t["base05"])
theme.styles["Error"] = Style.new(color: t["base08"])
theme.styles["Comment"] = Style.new(color: t["base03"])
theme.styles["CommentPreproc"] = Style.new(color: t["base0F"])
theme.styles["CommentPreprocFile"] = Style.new(color: t["base0B"])
theme.styles["Keyword"] = Style.new(color: t["base0E"])
theme.styles["KeywordType"] = Style.new(color: t["base08"])
theme.styles["NameAttribute"] = Style.new(color: t["base0D"])
theme.styles["NameBuiltin"] = Style.new(color: t["base08"])
theme.styles["NameBuiltinPseudo"] = Style.new(color: t["base08"])
theme.styles["NameClass"] = Style.new(color: t["base0D"])
theme.styles["NameConstant"] = Style.new(color: t["base09"])
theme.styles["NameDecorator"] = Style.new(color: t["base09"])
theme.styles["NameFunction"] = Style.new(color: t["base0D"])
theme.styles["NameNamespace"] = Style.new(color: t["base0D"])
theme.styles["NameTag"] = Style.new(color: t["base0E"])
theme.styles["NameVariable"] = Style.new(color: t["base0D"])
theme.styles["NameVariableInstance"] = Style.new(color: t["base08"])
theme.styles["LiteralNumber"] = Style.new(color: t["base09"])
theme.styles["Operator"] = Style.new(color: t["base0C"])
theme.styles["OperatorWord"] = Style.new(color: t["base0E"])
theme.styles["Literal"] = Style.new(color: t["base0B"])
theme.styles["LiteralString"] = Style.new(color: t["base0B"])
theme.styles["LiteralStringInterpol"] = Style.new(color: t["base0F"])
theme.styles["LiteralStringRegex"] = Style.new(color: t["base0C"])
theme.styles["LiteralStringSymbol"] = Style.new(color: t["base09"])
theme.styles["Background"] = Style.new(color: t.palette["base05"], background: t.palette["base00"])
theme.styles["Text"] = Style.new(color: t.palette["base05"])
theme.styles["Error"] = Style.new(color: t.palette["base08"])
theme.styles["Comment"] = Style.new(color: t.palette["base03"])
theme.styles["CommentPreproc"] = Style.new(color: t.palette["base0F"])
theme.styles["CommentPreprocFile"] = Style.new(color: t.palette["base0B"])
theme.styles["Keyword"] = Style.new(color: t.palette["base0E"])
theme.styles["KeywordType"] = Style.new(color: t.palette["base08"])
theme.styles["NameAttribute"] = Style.new(color: t.palette["base0D"])
theme.styles["NameBuiltin"] = Style.new(color: t.palette["base08"])
theme.styles["NameBuiltinPseudo"] = Style.new(color: t.palette["base08"])
theme.styles["NameClass"] = Style.new(color: t.palette["base0D"])
theme.styles["NameConstant"] = Style.new(color: t.palette["base09"])
theme.styles["NameDecorator"] = Style.new(color: t.palette["base09"])
theme.styles["NameFunction"] = Style.new(color: t.palette["base0D"])
theme.styles["NameNamespace"] = Style.new(color: t.palette["base0D"])
theme.styles["NameTag"] = Style.new(color: t.palette["base0E"])
theme.styles["NameVariable"] = Style.new(color: t.palette["base0D"])
theme.styles["NameVariableInstance"] = Style.new(color: t.palette["base08"])
theme.styles["LiteralNumber"] = Style.new(color: t.palette["base09"])
theme.styles["Operator"] = Style.new(color: t.palette["base0C"])
theme.styles["OperatorWord"] = Style.new(color: t.palette["base0E"])
theme.styles["Literal"] = Style.new(color: t.palette["base0B"])
theme.styles["LiteralString"] = Style.new(color: t.palette["base0B"])
theme.styles["LiteralStringInterpol"] = Style.new(color: t.palette["base0F"])
theme.styles["LiteralStringRegex"] = Style.new(color: t.palette["base0C"])
theme.styles["LiteralStringSymbol"] = Style.new(color: t.palette["base09"])
theme
end
@@ -176,32 +144,13 @@ module Tartrazine
s.underline = true if style.includes?("underline")
s.underline = false if style.includes?("nounderline")
s.color = style.find(&.starts_with?("#")).try { |v| Color.new v.split("#").last }
s.background = style.find(&.starts_with?("bg:#")).try { |v| Color.new v.split("#").last }
s.border = style.find(&.starts_with?("border:#")).try { |v| Color.new v.split("#").last }
s.color = style.find(&.starts_with?("#")).try &.split("#").last
s.background = style.find(&.starts_with?("bg:#")).try &.split("#").last
s.border = style.find(&.starts_with?("border:#")).try &.split("#").last
theme.styles[node["type"]] = s
end
# We really want a LineHighlight class
if !theme.styles.has_key?("LineHighlight")
theme.styles["LineHighlight"] = Style.new
theme.styles["LineHighlight"].background = make_highlight_color(theme.styles["Background"].background)
theme.styles["LineHighlight"].bold = true
end
theme
end
# If the color is dark, make it brighter and vice versa
def self.make_highlight_color(base_color)
if base_color.nil?
# Who knows; fall back to a neutral gray
return Color.new(127, 127, 127)
end
if base_color.dark?
base_color.lighter(0.2)
else
base_color.darker(0.2)
end
end
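# e.g. a dark background like #282a36 gains a slightly lighter highlight band,
# a light one is darkened by the same 0.2 factor, and nil falls back to gray.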
end
end


@@ -1,9 +1,5 @@
require "./actions"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "baked_file_system"
require "base58"
require "json"
require "log"
@@ -11,9 +7,183 @@ require "xml"
module Tartrazine
extend self
VERSION = {{ `shards version #{__DIR__}`.chomp.stringify }}
VERSION = "0.1.0"
Log = ::Log.for("tartrazine")
# This implements a lexer for Pygments RegexLexers as expressed
# in Chroma's XML serialization.
#
# For explanations of what actions and states do,
# the Pygments documentation is a good place to start.
# https://pygments.org/docs/lexerdevelopment/
# A Lexer state. A state has a name and a list of rules.
# The state machine has a state stack containing references
# to states to decide which rules to apply.
class State
property name : String = ""
property rules = [] of Rule
def +(other : State)
new_state = State.new
new_state.name = Random.base58(8)
new_state.rules = rules + other.rules
new_state
end
end
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
class Lexer
property config = {
name: "",
aliases: [] of String,
filenames: [] of String,
mime_types: [] of String,
priority: 0.0,
case_insensitive: false,
dot_all: false,
not_multiline: false,
ensure_nl: false,
}
property xml : String = ""
property states = {} of String => State
property state_stack = ["root"]
# Turn the text into a list of tokens. The `usingself` parameter
# is true when the lexer is being used to tokenize a string
# from a larger text that is already being tokenized.
# So, when it's true, we don't modify the text.
def tokenize(text, usingself = false) : Array(Token)
@state_stack = ["root"]
tokens = [] of Token
pos = 0
matched = false
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
text += "\n"
end
# Loop through the text, applying rules
while pos < text.size
state = states[@state_stack.last]
Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
state.rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, self)
if matched
# Move position forward, save the tokens,
# tokenize from the new position
Log.trace { "MATCHED: #{rule.xml}" }
pos = new_pos
tokens += new_tokens
break
end
Log.trace { "NOT MATCHED: #{rule.xml}" }
end
# If no rule matches, emit an error token
unless matched
Log.trace { "Error at #{pos}" }
tokens << {type: "Error", value: "#{text[pos]}"}
pos += 1
end
end
Lexer.collapse_tokens(tokens)
end
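# Illustrative output shape (actual token types depend on the lexer's rules):
#   tokenize("x = 1") # => [{type: "Name", value: "x"}, {type: "Text", value: " "}, ...]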
# Collapse consecutive tokens of the same type for easier comparison
# and smaller output
def self.collapse_tokens(tokens : Array(Tartrazine::Token)) : Array(Tartrazine::Token)
result = [] of Tartrazine::Token
tokens = tokens.reject { |token| token[:value] == "" }
tokens.each do |token|
if result.empty?
result << token
next
end
last = result.last
if last[:type] == token[:type]
new_token = {type: last[:type], value: last[:value] + token[:value]}
result.pop
result << new_token
else
result << token
end
end
result
end
# ameba:disable Metrics/CyclomaticComplexity
def self.from_xml(xml : String) : Lexer
l = Lexer.new
l.xml = xml
lexer = XML.parse(xml).first_element_child
if lexer
config = lexer.children.find { |node|
node.name == "config"
}
if config
l.config = {
name: xml_to_s(config, name) || "",
aliases: xml_to_a(config, _alias) || [] of String,
filenames: xml_to_a(config, filename) || [] of String,
mime_types: xml_to_a(config, mime_type) || [] of String,
priority: xml_to_f(config, priority) || 0.0,
not_multiline: xml_to_s(config, not_multiline) == "true",
dot_all: xml_to_s(config, dot_all) == "true",
case_insensitive: xml_to_s(config, case_insensitive) == "true",
ensure_nl: xml_to_s(config, ensure_nl) == "true",
}
end
rules = lexer.children.find { |node|
node.name == "rules"
}
if rules
# Rules contains states 🤷
rules.children.select { |node|
node.name == "state"
}.each do |state_node|
state = State.new
state.name = state_node["name"]
if l.states.has_key?(state.name)
raise Exception.new("Duplicate state: #{state.name}")
else
l.states[state.name] = state
end
# And states contain rules 🤷
state_node.children.select { |node|
node.name == "rule"
}.each do |rule_node|
case rule_node["pattern"]?
when nil
if rule_node.first_element_child.try &.name == "include"
rule = IncludeStateRule.new(rule_node)
else
rule = UnconditionalRule.new(rule_node)
end
else
rule = Rule.new(rule_node,
multiline: !l.config[:not_multiline],
dotall: l.config[:dot_all],
ignorecase: l.config[:case_insensitive])
end
state.rules << rule
end
end
end
end
l
end
end
def self.lexer(name : String) : Lexer
Lexer.from_xml(File.read("lexers/#{name}.xml"))
end
end
# Convenience macros to parse XML

styles/base16-snazzy.xml Normal file

@@ -0,0 +1,74 @@
<style name="base16-snazzy">
<entry type="Other" style="#e2e4e5"/>
<entry type="Error" style="#ff5c57"/>
<entry type="Background" style="bg:#282a36"/>
<entry type="Keyword" style="#ff6ac1"/>
<entry type="KeywordConstant" style="#ff6ac1"/>
<entry type="KeywordDeclaration" style="#ff5c57"/>
<entry type="KeywordNamespace" style="#ff6ac1"/>
<entry type="KeywordPseudo" style="#ff6ac1"/>
<entry type="KeywordReserved" style="#ff6ac1"/>
<entry type="KeywordType" style="#9aedfe"/>
<entry type="Name" style="#e2e4e5"/>
<entry type="NameAttribute" style="#57c7ff"/>
<entry type="NameBuiltin" style="#ff5c57"/>
<entry type="NameBuiltinPseudo" style="#e2e4e5"/>
<entry type="NameClass" style="#f3f99d"/>
<entry type="NameConstant" style="#ff9f43"/>
<entry type="NameDecorator" style="#ff9f43"/>
<entry type="NameEntity" style="#e2e4e5"/>
<entry type="NameException" style="#e2e4e5"/>
<entry type="NameFunction" style="#57c7ff"/>
<entry type="NameLabel" style="#ff5c57"/>
<entry type="NameNamespace" style="#e2e4e5"/>
<entry type="NameOther" style="#e2e4e5"/>
<entry type="NameTag" style="#ff6ac1"/>
<entry type="NameVariable" style="#ff5c57"/>
<entry type="NameVariableClass" style="#ff5c57"/>
<entry type="NameVariableGlobal" style="#ff5c57"/>
<entry type="NameVariableInstance" style="#ff5c57"/>
<entry type="Literal" style="#e2e4e5"/>
<entry type="LiteralDate" style="#e2e4e5"/>
<entry type="LiteralString" style="#5af78e"/>
<entry type="LiteralStringBacktick" style="#5af78e"/>
<entry type="LiteralStringChar" style="#5af78e"/>
<entry type="LiteralStringDoc" style="#5af78e"/>
<entry type="LiteralStringDouble" style="#5af78e"/>
<entry type="LiteralStringEscape" style="#5af78e"/>
<entry type="LiteralStringHeredoc" style="#5af78e"/>
<entry type="LiteralStringInterpol" style="#5af78e"/>
<entry type="LiteralStringOther" style="#5af78e"/>
<entry type="LiteralStringRegex" style="#5af78e"/>
<entry type="LiteralStringSingle" style="#5af78e"/>
<entry type="LiteralStringSymbol" style="#5af78e"/>
<entry type="LiteralNumber" style="#ff9f43"/>
<entry type="LiteralNumberBin" style="#ff9f43"/>
<entry type="LiteralNumberFloat" style="#ff9f43"/>
<entry type="LiteralNumberHex" style="#ff9f43"/>
<entry type="LiteralNumberInteger" style="#ff9f43"/>
<entry type="LiteralNumberIntegerLong" style="#ff9f43"/>
<entry type="LiteralNumberOct" style="#ff9f43"/>
<entry type="Operator" style="#ff6ac1"/>
<entry type="OperatorWord" style="#ff6ac1"/>
<entry type="Punctuation" style="#e2e4e5"/>
<entry type="Comment" style="#78787e"/>
<entry type="CommentHashbang" style="#78787e"/>
<entry type="CommentMultiline" style="#78787e"/>
<entry type="CommentSingle" style="#78787e"/>
<entry type="CommentSpecial" style="#78787e"/>
<entry type="CommentPreproc" style="#78787e"/>
<entry type="Generic" style="#e2e4e5"/>
<entry type="GenericDeleted" style="#ff5c57"/>
<entry type="GenericEmph" style="underline #e2e4e5"/>
<entry type="GenericError" style="#ff5c57"/>
<entry type="GenericHeading" style="bold #e2e4e5"/>
<entry type="GenericInserted" style="bold #e2e4e5"/>
<entry type="GenericOutput" style="#43454f"/>
<entry type="GenericPrompt" style="#e2e4e5"/>
<entry type="GenericStrong" style="italic #e2e4e5"/>
<entry type="GenericSubheading" style="bold #e2e4e5"/>
<entry type="GenericTraceback" style="#e2e4e5"/>
<entry type="GenericUnderline" style="underline"/>
<entry type="Text" style="#e2e4e5"/>
<entry type="TextWhitespace" style="#e2e4e5"/>
</style>
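Styles like this one are baked into the binary via `ThemeFiles`, so the theme is loadable by name. A minimal sketch mirroring the CLI's `--css` path above (`style_defs` is the call the CLI uses; its availability on a fresh formatter is assumed):

formatter = Tartrazine::Html.new
formatter.theme = Tartrazine.theme("base16-snazzy")
File.write("base16-snazzy.css", formatter.style_defs)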

x2.html

File diff suppressed because it is too large.