# Configuration for the typos spell checker.
# https://github.com/crate-ci/typos

[default]
# Regex patterns for text that typos should not spell-check. Patterns are
# written as TOML literal strings ('...') wherever possible so backslashes
# reach the regex engine without a second layer of escaping.
extend-ignore-re = [
    # Hex strings (e.g. commit hashes, checksums)
    '[a-fA-F0-9]{32,}',
    # Base64 encoded strings
    '[A-Za-z0-9+/]{40,}={0,2}',
    # URLs — avoid flagging path segments
    'https?://[^\s]+',
    # ODF/XSL-FO namespace prefixes (fo:color, fo:font-size, etc.)
    'fo:[a-z-]+',
    'xsl-fo-compatible',
    # Single-quoted Unicode escapes (e.g. '\u{0065}'). Kept as a basic string
    # because the pattern itself contains single quotes, which a single-line
    # literal string cannot hold; after TOML unescaping the regex is
    # '\\u\{[0-9a-fA-F]+\}' (including the surrounding quotes).
    "'\\\\u\\{[0-9a-fA-F]+\\}'",
    # Foreign language text in test strings and OCR backend language lists
    '"[^"]*(?:programa|cursos|ist ein|künstliche|excepcional|utiliza|transforme|Exemple|Dies ist|internacional|Hauptstadt)[^"]*"',
    # GPU runner name
    'runner-gpu-l4',
]

[default.extend-words]
# Accepted spellings. Every key maps to itself, which marks the word as
# correct rather than supplying a replacement.

# Project-specific terms that are not typos
docling = "docling"
flate = "flate"
kreuzberg = "kreuzberg"
markitdown = "markitdown"
mkdocs = "mkdocs"
mkdocstrings = "mkdocstrings"
onnx = "onnx"
openwebui = "openwebui"
pymupdf = "pymupdf"
rumdl = "rumdl"
surrealdb = "surrealdb"
tesseract = "tesseract"
tha = "tha"
wasm = "wasm"
webui = "webui"
zensical = "zensical"

# Domain-specific terms
hocr = "hocr"
odf = "odf"
opf = "opf"

# LaTeX environments
hom = "hom"
multline = "multline"

# RTF control words
headerr = "headerr"
pard = "pard"

# DOC format field names
edn = "edn"

# DOCX measurement units (twips, 50ths of percent, 240ths of line)
ths = "ths"

# Test data / examples
caf = "caf"
ges = "ges"
helo = "helo"
ove = "ove"

# Common short variable names / identifiers in code
ba = "ba"
fo = "fo"
iy = "iy"
nd = "nd"
pn = "pn"
siz = "siz"
thr = "thr"

# Tesseract upstream API spelling
extention = "extention"
# Tesseract upstream code
splitted = "splitted"

# PDFium upstream constant/function naming
chlidren = "chlidren"
fith = "fith"
formated = "formated"
portait = "portait"
specifing = "specifing"
threed = "threed"
# PDFium upstream doc typos
execpt = "execpt"
faiure = "faiure"
similarily = "similarily"

# PaddleOCR upstream naming
charater = "charater"
cliper = "cliper"
substract = "substract"

# English variants / valid words flagged incorrectly
unparseable = "unparseable"

# English suffix patterns in semantic analysis
ment = "ment"

# Dutch word in crates/kreuzberg/tests/pdf_glyph_spacing_issue_962.rs module doc
# ("relatie-id" = Dutch for "relation-id", from Microsoft Word broken-image placeholder)
relatie = "relatie"

# OCR language codes (ISO 639)
bre = "bre"
inh = "inh"
yor = "yor"

# Short words flagged in code/data contexts
ein = "ein"
ist = "ist"
mis = "mis"
tre = "tre"

# GPU runner name and its "l4" component.
# NOTE(review): extend-words keys are looked up per word, so the hyphenated
# key below may never match on its own — confirm against the typos tokenizer.
l4 = "l4"
runner-gpu-l4 = "runner-gpu-l4"

[default.extend-identifiers]
# Whole identifiers accepted exactly as written (each maps to itself)
MarkItDown = "MarkItDown"
PDFium = "PDFium"
PyMuPDF = "PyMuPDF"
SurrealDB = "SurrealDB"
WebUI = "WebUI"
traineddata = "traineddata"

[files]
# Paths and glob patterns added to the set of files typos does not scan.
extend-exclude = [
    # Fixtures for tests, plus vendor files
    "test_documents/",
    "fixtures/",
    # Dependency lock files
    "*.lock",
    "pnpm-lock.yaml",
    # Build output
    "target/",
    "node_modules/",
    "dist/",
    "site/",
    # Documentation code snippets (validated separately by alef-snippets)
    "docs/snippets/",
    # Generated sources (e2e tests, bindgen)
    "e2e/",
    "**/bindgen/",
    # Stopword lists (foreign language words)
    "**/stopwords/",
    # Test data such as hOCR samples
    "**/test_data/",
    # Upstream diffs kept as patch files
    "**/*.diff",
    "**/*.patch",
    # PDF text repair tests (text is intentionally broken)
    "**/text_repair.rs",
    # Changelogs (contain intentional examples of garbled text)
    "CHANGELOG.md",
    "docs/CHANGELOG.md",
    # Vendored / third-party code
    "**/vendor/",
    "**/vendored/",
    # Binary files
    "*.whl",
    "*.tar.gz",
    "*.png",
    "*.ico",
    "*.svg",
]