# Web-view header (not part of the configuration): fil/.typos.toml, 156 lines, 3.6 KiB, TOML, 2026-06-01 23:40:55 +02:00
# Configuration for the typos spell checker.
# https://github.com/crate-ci/typos
[default]
# Regular expressions for text the checker should skip. Patterns use TOML
# literal strings where possible so backslashes need no extra escaping.
extend-ignore-re = [
    # Long hexadecimal runs such as commit hashes and checksums.
    '[a-fA-F0-9]{32,}',
    # Base64-encoded payloads, with optional "=" padding.
    '[A-Za-z0-9+/]{40,}={0,2}',
    # URLs, so that path segments are not flagged.
    'https?://[^\s]+',
    # ODF/XSL-FO namespace prefixes (fo:color, fo:font-size, etc.).
    'fo:[a-z-]+',
    'xsl-fo-compatible',
    # Single-quoted Unicode escapes (e.g. '\u{0065}'). Kept as a basic string
    # because the pattern itself contains apostrophes.
    "'\\\\u\\{[0-9a-fA-F]+\\}'",
    # Double-quoted strings containing foreign-language text (test strings and
    # OCR backend language lists).
    '"[^"]*(?:programa|cursos|ist ein|künstliche|excepcional|utiliza|transforme|Exemple|Dies ist|internacional|Hauptstadt)[^"]*"',
    # GPU runner name.
    'runner-gpu-l4',
]

[default.extend-words]
# Words accepted as spelled: every entry maps a word to itself.
# Project-specific terms that are not typos.
docling = "docling"
flate = "flate"
kreuzberg = "kreuzberg"
markitdown = "markitdown"
mkdocs = "mkdocs"
mkdocstrings = "mkdocstrings"
onnx = "onnx"
openwebui = "openwebui"
pymupdf = "pymupdf"
rumdl = "rumdl"
surrealdb = "surrealdb"
tesseract = "tesseract"
tha = "tha"
wasm = "wasm"
webui = "webui"
zensical = "zensical"
# Domain-specific terms.
hocr = "hocr"
odf = "odf"
opf = "opf"
# LaTeX environments.
hom = "hom"
multline = "multline"
# RTF control words.
headerr = "headerr"
pard = "pard"
# DOC format field names.
edn = "edn"
# DOCX measurement units (twips, 50ths of percent, 240ths of line).
ths = "ths"
# OCR language codes (ISO 639).
bre = "bre"
inh = "inh"
yor = "yor"
# English suffix patterns in semantic analysis.
ment = "ment"
# English variants / valid words flagged incorrectly.
unparseable = "unparseable"
# Dutch word in crates/kreuzberg/tests/pdf_glyph_spacing_issue_962.rs module doc
# ("relatie-id" = Dutch for "relation-id", from Microsoft Word broken-image placeholder).
relatie = "relatie"
# Tesseract upstream spellings (API and code).
extention = "extention"
splitted = "splitted"
# PDFium upstream spellings (constant/function names and doc typos).
chlidren = "chlidren"
execpt = "execpt"
faiure = "faiure"
fith = "fith"
formated = "formated"
portait = "portait"
similarily = "similarily"
specifing = "specifing"
threed = "threed"
# PaddleOCR upstream naming.
charater = "charater"
cliper = "cliper"
substract = "substract"
# Test data / examples.
caf = "caf"
ges = "ges"
helo = "helo"
ove = "ove"
# Common short variable names / identifiers in code.
ba = "ba"
fo = "fo"
iy = "iy"
nd = "nd"
pn = "pn"
siz = "siz"
thr = "thr"
# Short words flagged in code/data contexts.
ein = "ein"
ist = "ist"
mis = "mis"
tre = "tre"
# GPU runner name and its "l4" suffix.
# NOTE(review): the hyphenated key may never match if typos splits words on "-" — confirm.
runner-gpu-l4 = "runner-gpu-l4"
l4 = "l4"

[default.extend-identifiers]
# Whole identifiers accepted exactly as written (each maps to itself).
MarkItDown = "MarkItDown"
PDFium = "PDFium"
PyMuPDF = "PyMuPDF"
SurrealDB = "SurrealDB"
WebUI = "WebUI"
traineddata = "traineddata"

[files]
# Glob patterns for paths that are left out of spell checking.
extend-exclude = [
    # Test fixtures and vendor files.
    "test_documents/",
    "fixtures/",
    # Lock files.
    "*.lock",
    "pnpm-lock.yaml",
    # Build artifacts.
    "target/",
    "node_modules/",
    "dist/",
    "site/",
    # Code snippets (validated separately by alef-snippets).
    "docs/snippets/",
    # Generated files (e2e tests, bindgen).
    "e2e/",
    "**/bindgen/",
    # Stopword lists, which hold foreign-language words.
    "**/stopwords/",
    # Test data (hOCR samples, etc.).
    "**/test_data/",
    # Patch files (upstream diffs).
    "**/*.diff",
    "**/*.patch",
    # PDF text repair tests, whose text is intentionally broken.
    "**/text_repair.rs",
    # Changelogs, which contain intentional examples of garbled text.
    "CHANGELOG.md",
    "docs/CHANGELOG.md",
    # Vendored / third-party code.
    "**/vendor/",
    "**/vendored/",
    # Packages, archives, and image assets.
    "*.whl",
    "*.tar.gz",
    "*.png",
    "*.ico",
    "*.svg",
]