23 Commits

Author SHA1 Message Date
a0ff4e0118 0.1.1 2024-08-09 11:11:17 -03:00
ece3d4163a Bug 2024-08-09 11:03:32 -03:00
3180168261 Added helper files 2024-08-09 10:32:15 -03:00
5c074344d5 Added helper files 2024-08-09 10:30:20 -03:00
d3439563f2 Use new sixteen api 2024-08-09 10:25:24 -03:00
8167af78f0 Remove JIT flag 2024-08-08 10:25:43 -03:00
ba50934005 Set more flags in regex 2024-08-08 08:37:23 -03:00
d293ec8d76 Set more flags in regex 2024-08-08 08:31:01 -03:00
b43501da98 Nicer ansi rendering 2024-08-07 20:47:02 -03:00
9824431317 tweak includes 2024-08-07 17:47:11 -03:00
2ad3cde7f1 tweak includes 2024-08-07 17:10:20 -03:00
aa1044ed22 ANSI formatter 2024-08-07 17:00:50 -03:00
f0d6b01362 add requires 2024-08-07 16:44:05 -03:00
e1048abe33 add requires 2024-08-07 16:42:38 -03:00
d5581a356e Baked fs 2024-08-07 16:28:26 -03:00
916ab86f60 ignore bench 2024-08-07 15:37:47 -03:00
e0f697f1f9 refactor 2024-08-06 23:34:14 -03:00
0c86e91b0b Use abbreviated token names in css 2024-08-06 21:28:33 -03:00
499cf7f623 Use token abbrevs in HTML 2024-08-06 20:17:26 -03:00
a9ff9bc8ac Added constants for token abbrev 2024-08-06 20:17:26 -03:00
82db232511 Updated README 2024-08-06 18:31:29 -03:00
420b68993c Made base16 work 2024-08-06 18:27:58 -03:00
94bc221545 Added test binary, code to read base16 themes 2024-08-06 18:03:05 -03:00
14 changed files with 371 additions and 70 deletions

1
.gitignore vendored
View File

@ -6,3 +6,4 @@
chroma/
pygments/
shard.lock
.vscode/

15
Dockerfile.static Normal file
View File

@ -0,0 +1,15 @@
# Builder image for tartrazine: Alpine 3.20 with the Crystal toolchain.
# The platform comes from TARGETPLATFORM and falls back to linux/amd64
# when that variable is unset or empty.
FROM --platform=${TARGETPLATFORM:-linux/amd64} alpine:3.20 AS build
# Install the compiler (crystal), the dependency manager (shards), make,
# and the -dev / -static packages for yaml, openssl, libxml2, zlib and xz.
# NOTE(review): the -static packages are presumably needed so that
# `shards build --static` can link a fully static binary -- confirm.
RUN apk add --no-cache \
crystal \
shards \
yaml-dev \
yaml-static \
openssl-dev \
openssl-libs-static \
libxml2-dev \
libxml2-static \
zlib-dev \
zlib-static \
xz-dev \
xz-static \
make

7
Makefile Normal file
View File

@ -0,0 +1,7 @@
# All three targets list the Crystal sources, the lexer and style XML files
# and shard.yml as prerequisites.
# NOTE(review): GNU make's $(wildcard) does not appear to treat `**` as a
# recursive glob, so `src/**/*.cr` may not match files directly under src/
# -- confirm. None of the targets creates a file named after itself, so the
# recipes presumably run on every invocation anyway.

# Development build with the strict_multi_assign and no_number_autocast flags.
build: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
shards build -Dstrict_multi_assign -Dno_number_autocast
# Optimized build.
release: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
shards build --release
# Optimized, statically linked build; the resulting bin/tartrazine is stripped.
static: $(wildcard src/**/*.cr) $(wildcard lexers/*xml) $(wildcard styles/*xml) shard.yml
shards build --release --static
strip bin/tartrazine

View File

@ -31,6 +31,9 @@ is a subset of Pygments'.
Currently Tartrazine supports ... 241 languages.
It has 332 themes (64 from Chroma, the rest are base16 themes via
[Sixteen](https://github.com/ralsina/sixteen)).
## Installation
This will have a CLI tool that can be installed, but it's not

16
build_static.sh Executable file
View File

@ -0,0 +1,16 @@
#!/bin/bash
# Build statically linked tartrazine binaries for linux/amd64 and
# linux/arm64 inside containers built from Dockerfile.static.
# Results are written to bin/tartrazine-static-linux-<arch>.
set -e

# NOTE(review): this presumably registers QEMU binfmt handlers so that
# containers for a foreign architecture can run on this host -- confirm
# against the multiarch/qemu-user-static documentation.
docker run --rm --privileged \
  multiarch/qemu-user-static \
  --reset -p yes

# build_for ARCH
# Builds the builder image for linux/ARCH, runs `make static` in the
# current directory (mounted at /app) and renames the produced binary.
# The platform is always passed explicitly: without --platform the build
# targets the host's own platform, so on a non-amd64 host the "amd64"
# artifact could silently be built for the wrong architecture.
build_for() {
  local arch="$1"
  local platform="linux/${arch}"

  docker build . -f Dockerfile.static --platform "$platform" -t tartrazine-builder
  docker run -ti --rm -v "$PWD":/app --platform "$platform" --user="$UID" tartrazine-builder /bin/sh -c "cd /app && rm -rf lib shard.lock && make static"
  mv bin/tartrazine "bin/tartrazine-static-linux-${arch}"
}

# Build for AMD64
build_for amd64
# Build for ARM64
build_for arm64

24
scripts/token_abbrevs.py Normal file
View File

@ -0,0 +1,24 @@
"""Generate the token-name -> abbreviation table from lexer XML lines.

Run it as: grep token lexers/* | python scripts/token_abbrevs.py

The table is printed to stdout. If two different token names end up with
the same abbreviation, a warning per collision is printed to stderr so the
clash can be resolved by hand.
"""
import string
import sys


def abbr(line):
    """Return the upper-case ASCII letters of `line`, lowercased and joined.

    Example: "CommentSingle" -> "cs".
    """
    return "".join(c for c in line if c in string.ascii_uppercase).lower()


def token_name(line):
    """Return the first double-quoted value after `<token ` in `line`.

    Returns None when the line has no `<token` or no quoted value, so that
    malformed input lines are skipped instead of raising IndexError.
    """
    if "<token" not in line:
        return None
    parts = line.strip().split('<token ', 1)[-1].split('"')
    return parts[1] if len(parts) > 1 else None


def main():
    """Read lines from stdin and print the abbreviation table to stdout."""
    abbrevs = {}
    for line in sys.stdin:
        name = token_name(line)
        if name is not None:
            abbrevs[name] = abbr(name)

    print("Abbreviations: {")
    for k, v in abbrevs.items():
        print(f' "{k}" => "{v}",')
    print("}")

    # Different token names can reduce to the same initials (for example
    # NameClass and NameConstant both give "nc"); report those on stderr.
    by_abbrev = {}
    for name, short in abbrevs.items():
        by_abbrev.setdefault(short, []).append(name)
    for short, names in by_abbrev.items():
        if len(names) > 1:
            print(
                f"warning: abbreviation {short!r} is shared by {', '.join(names)}",
                file=sys.stderr,
            )


if __name__ == "__main__":
    main()

View File

@ -1,16 +1,20 @@
name: tartrazine
version: 0.1.0
version: 0.1.1
authors:
- Roberto Alsina <roberto.alsina@gmail.com>
targets:
tartrazine:
main: src/tartrazine.cr
main: src/main.cr
dependencies:
baked_file_system:
github: schovi/baked_file_system
base58:
github: crystal-china/base58.cr
sixteen:
github: ralsina/sixteen
crystal: ">= 1.13.0"

View File

@ -1,3 +1,10 @@
require "./actions"
require "./constants"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
# These are Lexer actions. When a rule matches, it will
# perform a list of actions. These actions can emit tokens
# or change the state machine.

92
src/constants.cr Normal file
View File

@ -0,0 +1,92 @@
module Tartrazine
  # Maps every token type name to the short abbreviation used for it.
  #
  # The HTML formatter uses these abbreviations as CSS class names, both
  # in the generated stylesheet and on the emitted <span> elements, so
  # every value MUST be unique. If two token types shared an abbreviation
  # they would collapse into a single CSS class and the rule written last
  # would apply to both.
  #
  # Most values are the lowercased capital letters of the token name.
  # Where that would clash with an earlier entry, the later entry carries
  # extra letters (e.g. "NameClass" => "nc", "NameConstant" => "nco").
  Abbreviations = {
    "Background"               => "b",
    "Text"                     => "t",
    "CommentSingle"            => "cs",
    "CommentSpecial"           => "csp",
    "NameVariable"             => "nv",
    "Keyword"                  => "k",
    "NameFunction"             => "nf",
    "Punctuation"              => "p",
    "Operator"                 => "o",
    "LiteralNumberInteger"     => "lni",
    "NameBuiltin"              => "nb",
    "Name"                     => "n",
    "OperatorWord"             => "ow",
    "LiteralStringSingle"      => "lss",
    "Literal"                  => "l",
    "NameClass"                => "nc",
    "CommentMultiline"         => "cm",
    "LiteralStringRegex"       => "lsr",
    "KeywordDeclaration"       => "kd",
    "KeywordConstant"          => "kc",
    "NameOther"                => "no",
    "LiteralNumberFloat"       => "lnf",
    "LiteralNumberHex"         => "lnh",
    "LiteralStringDouble"      => "lsd",
    "KeywordType"              => "kt",
    "NameNamespace"            => "nn",
    "NameAttribute"            => "na",
    "KeywordReserved"          => "kr",
    "CommentPreproc"           => "cp",
    "KeywordNamespace"         => "kn",
    "NameConstant"             => "nco",
    "NameLabel"                => "nl",
    "LiteralString"            => "ls",
    "LiteralStringChar"        => "lsc",
    "TextWhitespace"           => "tw",
    "LiteralStringEscape"      => "lse",
    "LiteralNumber"            => "ln",
    "Other"                    => "ot",
    "LiteralStringBoolean"     => "lsb",
    "NameProperty"             => "np",
    "Comment"                  => "c",
    "NameTag"                  => "nt",
    "LiteralStringOther"       => "lso",
    "NameVariableGlobal"       => "nvg",
    "NameBuiltinPseudo"        => "nbp",
    "LiteralNumberBin"         => "lnb",
    "KeywordPseudo"            => "kp",
    "CommentPreprocFile"       => "cpf",
    "LiteralStringAffix"       => "lsa",
    "LiteralStringDelimiter"   => "lsde",
    "LiteralNumberOct"         => "lno",
    "Error"                    => "e",
    "Generic"                  => "g",
    "LiteralNumberIntegerLong" => "lnil",
    "NameDecorator"            => "nd",
    "LiteralStringInterpol"    => "lsi",
    "LiteralStringBacktick"    => "lsba",
    "GenericPrompt"            => "gp",
    "GenericOutput"            => "go",
    "LiteralStringName"        => "lsn",
    "LiteralStringHeredoc"     => "lsh",
    "LiteralStringSymbol"      => "lssy",
    "NameVariableInstance"     => "nvi",
    "LiteralOther"             => "lo",
    "NameVariableClass"        => "nvc",
    "NameOperator"             => "nop",
    "None"                     => "non",
    "LiteralStringDoc"         => "lsdo",
    "NameException"            => "ne",
    "GenericSubheading"        => "gs",
    "GenericStrong"            => "gst",
    "GenericDeleted"           => "gd",
    "GenericInserted"          => "gi",
    "GenericHeading"           => "gh",
    "NameEntity"               => "nen",
    "NamePseudo"               => "nps",
    "CommentHashbang"          => "ch",
    "TextPunctuation"          => "tp",
    "NameVariableAnonymous"    => "nva",
    "NameVariableMagic"        => "nvm",
    "NameFunctionMagic"        => "nfm",
    "GenericEmph"              => "ge",
    "GenericUnderline"         => "gu",
    "LiteralStringAtom"        => "lsat",
    "LiteralDate"              => "ld",
    "GenericError"             => "ger",
    "TextSymbol"               => "ts",
    "NameKeyword"              => "nk",
  }
end

View File

@ -1,5 +1,10 @@
require "./tartrazine.cr"
require "./styles.cr"
require "./actions"
require "./constants"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "colorize"
module Tartrazine
# This is the base class for all formatters.
@ -10,15 +15,70 @@ module Tartrazine
raise Exception.new("Not implemented")
end
def get_style_defs(theme : Theme) : String
raise Exception.new("Not implemented")
end
end
# Formatter that renders the tokenized text for a terminal, decorating
# each token with the `colorize` library.
class Ansi < Formatter
  # Tokenizes *text* with *lexer* and returns the concatenation of every
  # token value after it has been passed through `#colorize` with *theme*.
  def format(text : String, lexer : Lexer, theme : Theme) : String
    String.build do |outp|
      lexer.tokenize(text).each do |token|
        outp << colorize(token[:value], token[:type], theme)
      end
    end
  end

  # Returns *text* decorated with the foreground color and the
  # bold / italic / underline modes of the style that applies to *token*.
  #
  # If the theme has no style for *token* itself, the token's parent
  # styles are searched instead. *text* is returned unchanged only when
  # neither the token nor any of its parents has a style in *theme*.
  def colorize(text : String, token : String, theme : Theme) : String
    # Themes don't contain information for each specific
    # token type. However, they may contain information
    # for a parent style. Worst case, we go to the root
    # (Background) style.
    style_key = if theme.styles.has_key?(token)
                  token
                else
                  theme.style_parents(token).reverse.find { |parent|
                    theme.styles.has_key?(parent)
                  }
                end
    # Previously this method bailed out as soon as the token had no
    # style of its own, which made the parent lookup above unreachable.
    return text if style_key.nil?

    s = theme.styles[style_key]
    colorized = text.colorize
    s.color.try { |c| colorized = colorized.fore(c.colorize) }
    # Intentionally not setting background color
    colorized.mode(:bold) if s.bold
    colorized.mode(:italic) if s.italic
    colorized.mode(:underline) if s.underline
    colorized.to_s
  end
end
class Html < Formatter
def format(text : String, lexer : Lexer, theme : Theme) : String
output = String.build do |outp|
outp << "<html><head><style>"
outp << get_style_defs(theme)
outp << "</style></head><body>"
outp << "<pre class=\"#{get_css_class("Background", theme)}\"><code class=\"#{get_css_class("Background", theme)}\">"
lexer.tokenize(text).each do |token|
fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
outp << fragment
end
outp << "</code></pre></body></html>"
end
output
end
# ameba:disable Metrics/CyclomaticComplexity
def get_style_defs(theme : Theme) : String
output = String.build do |outp|
theme.styles.each do |token, style|
outp << ".#{token} {"
outp << ".#{get_css_class(token, theme)} {"
# These are set or nil
outp << "color: #{style.color};" if style.color
outp << "background-color: #{style.background};" if style.background
outp << "border: 1px solid #{style.border};" if style.border
outp << "color: #{style.color.try &.hex};" if style.color
outp << "background-color: #{style.background.try &.hex};" if style.background
outp << "border: 1px solid #{style.border.try &.hex};" if style.border
# These are true/false/nil
outp << "border: none;" if style.border == false
@ -34,39 +94,18 @@ module Tartrazine
end
output
end
end
class Html < Formatter
def format(text : String, lexer : Lexer, theme : Theme) : String
output = String.build do |outp|
outp << "<html><head><style>"
outp << get_style_defs(theme)
outp << "</style></head><body>"
outp << "<pre class=\"Background\"><code class=\"Background\">"
lexer.tokenize(text).each do |token|
fragment = "<span class=\"#{get_css_class(token[:type], theme)}\">#{token[:value]}</span>"
outp << fragment
end
outp << "</code></pre></body></html>"
end
output
end
# Given a token type, return the CSS class to use.
def get_css_class(token, theme)
return token if theme.styles.has_key?(token)
return Abbreviations[token] if theme.styles.has_key?(token)
# Themes don't contain information for each specific
# token type. However, they may contain information
# for a parent style. Worst case, we go to the root
# (Background) style.
theme.style_parents(token).reverse.find { |parent|
Abbreviations[theme.style_parents(token).reverse.find { |parent|
theme.styles.has_key?(parent)
}
}]
end
end
end
lexer = Tartrazine.lexer("crystal")
theme = Tartrazine.theme("catppuccin-macchiato")
puts Tartrazine::Html.new.format(File.read(ARGV[0]), lexer, theme)

5
src/main.cr Normal file
View File

@ -0,0 +1,5 @@
require "./**"

# Command-line entry point.
#
# Usage: tartrazine FILE THEME
#
# Reads FILE, tokenizes it with the "crystal" lexer and prints it to
# standard output as HTML styled with THEME.

# Both arguments are required; fail with a usage message instead of an
# out-of-bounds error on ARGV when one is missing.
if ARGV.size < 2
  STDERR.puts "Usage: tartrazine FILE THEME"
  exit 1
end

lexer = Tartrazine.lexer("crystal")
theme = Tartrazine.theme(ARGV[1])
puts Tartrazine::Html.new.format(File.read(ARGV[0]), lexer, theme)

View File

@ -1,22 +1,28 @@
require "./actions"
require "./constants"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
# These are lexer rules. They match with the text being parsed
# and perform actions, either emitting tokens or changing the
# state of the lexer.
module Tartrazine
# This rule matches via a regex pattern
class Rule
property pattern : Regex = Re2.new ""
property actions : Array(Action) = [] of Action
property xml : String = "foo"
def match(text, pos, lexer) : Tuple(Bool, Int32, Array(Token))
tokens = [] of Token
match = pattern.match(text, pos)
# We don't match if the match doesn't move the cursor
# because that causes infinite loops
Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
return false, pos, [] of Token if match.nil? || match.end == 0
# Log.trace { "#{match}, #{pattern.inspect}, #{text}, #{pos}" }
tokens = [] of Token
# Emit the tokens
actions.each do |action|
# Emit the token
@ -28,7 +34,12 @@ module Tartrazine
def initialize(node : XML::Node, multiline, dotall, ignorecase)
@xml = node.to_s
@pattern = Re2.new(node["pattern"], multiline, dotall, ignorecase)
@pattern = Re2.new(
node["pattern"],
multiline,
dotall,
ignorecase,
anchored: true)
add_actions(node)
end
@ -80,4 +91,25 @@ module Tartrazine
add_actions(node)
end
end
# This is a hack to work around the fact that Crystal seems to disallow
# regexes that are multiline but not dot_all.
#
# Re2 subclasses Regex and compiles the pattern itself through
# Regex::PCRE2.compile, choosing each PCRE2 flag independently.
class Re2 < Regex
# NOTE(review): these look like placeholder values for instance variables
# that Regex expects to be initialized -- confirm against the Regex
# implementation of the Crystal version in use.
@source = "fa"
@options = Regex::Options::None
@jit = true
# Compiles *pattern* with PCRE2.
#
# UTF, DUPNAMES, UCP and NO_UTF_CHECK are always set. MULTILINE, DOTALL,
# CASELESS and ANCHORED are added only when the matching argument
# (*multiline*, *dotall*, *ignorecase*, *anchored*) is true.
#
# Raises Exception with PCRE2's error message if compilation fails.
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false, anchored = false)
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
LibPCRE2::UCP
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
flags |= LibPCRE2::ANCHORED if anchored
flags |= LibPCRE2::NO_UTF_CHECK
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
raise Exception.new(error_message)
end
end
end
end

View File

@ -1,9 +1,23 @@
require "./actions"
require "./constants"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "sixteen"
require "xml"
module Tartrazine
alias Color = Sixteen::Color
def self.theme(name : String) : Theme
path = File.join("styles", "#{name}.xml")
Theme.from_xml(File.read(path))
return Theme.from_base16(name[7..]) if name.starts_with? "base16_"
Theme.from_xml(ThemeFiles.get("/#{name}.xml").gets_to_end)
end
# Holds the contents of the "../styles" folder (relative to this file's
# directory) through BakedFileSystem. `Tartrazine.theme` reads theme XML
# from here with `ThemeFiles.get("/<name>.xml")`.
# NOTE(review): presumably the files are embedded into the binary at
# compile time -- confirm against the baked_file_system documentation.
class ThemeFiles
extend BakedFileSystem
bake_folder "../styles", __DIR__
end
class Style
@ -17,9 +31,9 @@ module Tartrazine
# These properties are either set or nil
# (inherit from parent style)
property background : String?
property border : String?
property color : String?
property background : Color?
property border : Color?
property color : Color?
# Styles are incomplete by default and inherit
# from parents. If this is true, this style
@ -27,6 +41,9 @@ module Tartrazine
# anything
property? complete : Bool = false
def initialize(@color = nil, @background = nil, @border = nil, @bold = nil, @italic = nil, @underline = nil)
end
macro merge_prop(prop)
new.{{prop}} = other.{{prop}}.nil? ? self.{{prop}} : other.{{prop}}
end
@ -78,6 +95,44 @@ module Tartrazine
parents
end
# Build a Theme from the base16 theme called *name*, as provided by Sixteen.
#
# Only the token types listed below get a style; each one receives a
# single foreground color from the base16 palette. "Background" also
# gets a background color.
def self.from_base16(name : String) : Theme
  palette = Sixteen.theme(name)
  theme = Theme.new
  theme.name = name

  # The color assignments are adapted from
  # https://github.com/mohd-akram/base16-pygments/
  theme.styles["Background"] = Style.new(color: palette["base05"], background: palette["base00"])

  # Token type => base16 slot used as its foreground color. The styles
  # are inserted in the order in which they are listed here.
  {
    "Text"                  => "base05",
    "Error"                 => "base08",
    "Comment"               => "base03",
    "CommentPreproc"        => "base0F",
    "CommentPreprocFile"    => "base0B",
    "Keyword"               => "base0E",
    "KeywordType"           => "base08",
    "NameAttribute"         => "base0D",
    "NameBuiltin"           => "base08",
    "NameBuiltinPseudo"     => "base08",
    "NameClass"             => "base0D",
    "NameConstant"          => "base09",
    "NameDecorator"         => "base09",
    "NameFunction"          => "base0D",
    "NameNamespace"         => "base0D",
    "NameTag"               => "base0E",
    "NameVariable"          => "base0D",
    "NameVariableInstance"  => "base08",
    "LiteralNumber"         => "base09",
    "Operator"              => "base0C",
    "OperatorWord"          => "base0E",
    "Literal"               => "base0B",
    "LiteralString"         => "base0B",
    "LiteralStringInterpol" => "base0F",
    "LiteralStringRegex"    => "base0C",
    "LiteralStringSymbol"   => "base09",
  }.each do |token_type, slot|
    theme.styles[token_type] = Style.new(color: palette[slot])
  end

  theme
end
# Load from a Chroma XML file
def self.from_xml(xml : String) : Theme
document = XML.parse(xml)
@ -101,9 +156,9 @@ module Tartrazine
s.underline = true if style.includes?("underline")
s.underline = false if style.includes?("nounderline")
s.color = style.find(&.starts_with?("#")).try &.split("#").last
s.background = style.find(&.starts_with?("bg:#")).try &.split("#").last
s.border = style.find(&.starts_with?("border:#")).try &.split("#").last
s.color = style.find(&.starts_with?("#")).try { |v| Color.new v.split("#").last }
s.background = style.find(&.starts_with?("bg:#")).try { |v| Color.new v.split("#").last }
s.border = style.find(&.starts_with?("border:#")).try { |v| Color.new v.split("#").last }
theme.styles[node["type"]] = s
end

View File

@ -1,5 +1,10 @@
require "./actions"
require "./constants"
require "./formatter"
require "./rules"
require "./styles"
require "./tartrazine"
require "baked_file_system"
require "base58"
require "json"
require "log"
@ -7,7 +12,7 @@ require "xml"
module Tartrazine
extend self
VERSION = "0.1.0"
VERSION = "0.1.1"
Log = ::Log.for("tartrazine")
@ -33,6 +38,12 @@ module Tartrazine
end
end
# Holds the contents of the "../lexers" folder (relative to this file's
# directory) through BakedFileSystem. `Tartrazine.lexer` reads lexer XML
# from here with `LexerFiles.get("/<name>.xml")`.
# NOTE(review): presumably the files are embedded into the binary at
# compile time -- confirm against the baked_file_system documentation.
class LexerFiles
extend BakedFileSystem
bake_folder "../lexers", __DIR__
end
# A token, the output of the tokenizer
alias Token = NamedTuple(type: String, value: String)
@ -54,31 +65,40 @@ module Tartrazine
property state_stack = ["root"]
# Turn the text into a list of tokens.
# Turn the text into a list of tokens. The `usingself` parameter
# is true when the lexer is being used to tokenize a string
# from a larger text that is already being tokenized.
# So, when it's true, we don't modify the text.
def tokenize(text, usingself = false) : Array(Token)
@state_stack = ["root"]
tokens = [] of Token
pos = 0
matched = false
# Respect the `ensure_nl` config option
if text.size > 0 && text[-1] != '\n' && config[:ensure_nl] && !usingself
text += "\n"
end
# Loop through the text, applying rules
while pos < text.size
state = states[@state_stack.last]
Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
# Log.trace { "Stack is #{@state_stack} State is #{state.name}, pos is #{pos}, text is #{text[pos..pos + 10]}" }
state.rules.each do |rule|
matched, new_pos, new_tokens = rule.match(text, pos, self)
if matched
Log.trace { "MATCHED: #{rule.xml}" }
# Move position forward, save the tokens,
# tokenize from the new position
# Log.trace { "MATCHED: #{rule.xml}" }
pos = new_pos
tokens += new_tokens
break # We go back to processing with current state
break
end
Log.trace { "NOT MATCHED: #{rule.xml}" }
# Log.trace { "NOT MATCHED: #{rule.xml}" }
end
# If no rule matches, emit an error token
unless matched
Log.trace { "Error at #{pos}" }
# Log.trace { "Error at #{pos}" }
tokens << {type: "Error", value: "#{text[pos]}"}
pos += 1
end
@ -173,26 +193,7 @@ module Tartrazine
end
def self.lexer(name : String) : Lexer
Lexer.from_xml(File.read("lexers/#{name}.xml"))
end
# This is a hack to workaround that Crystal seems to disallow
# having regexes multiline but not dot_all
class Re2 < Regex
@source = "fa"
@options = Regex::Options::None
@jit = true
def initialize(pattern : String, multiline = false, dotall = false, ignorecase = false)
flags = LibPCRE2::UTF | LibPCRE2::DUPNAMES |
LibPCRE2::UCP | LibPCRE2::ANCHORED
flags |= LibPCRE2::MULTILINE if multiline
flags |= LibPCRE2::DOTALL if dotall
flags |= LibPCRE2::CASELESS if ignorecase
@re = Regex::PCRE2.compile(pattern, flags) do |error_message|
raise Exception.new(error_message)
end
end
Lexer.from_xml(LexerFiles.get("/#{name}.xml").gets_to_end)
end
end