This commit is contained in:
155
.typos.toml
Normal file
155
.typos.toml
Normal file
@@ -0,0 +1,155 @@
|
||||
# Configuration for the typos spell checker.
|
||||
# https://github.com/crate-ci/typos
|
||||
|
||||
[default]
|
||||
# Regular expressions whose matches are excluded from spell checking.
extend-ignore-re = [
|
||||
# Hex strings of 32+ characters (e.g. commit hashes, checksums)
|
||||
"[a-fA-F0-9]{32,}",
|
||||
# Base64-encoded strings (40+ characters, optional "=" padding)
|
||||
"[A-Za-z0-9+/]{40,}={0,2}",
|
||||
# URLs (http/https, up to the next whitespace) — avoid flagging path segments
|
||||
"https?://[^\\s]+",
|
||||
# ODF/XSL-FO namespace prefixes (fo:color, fo:font-size, etc.)
|
||||
"fo:[a-z-]+",
|
||||
"xsl-fo-compatible",
|
||||
# Single-quoted Unicode escapes (e.g. '\u{0065}')
|
||||
"'\\\\u\\{[0-9a-fA-F]+\\}'",
|
||||
# Double-quoted strings containing one of the listed foreign-language words
|
||||
# (test strings and OCR backend language lists)
|
||||
'"[^"]*(?:programa|cursos|ist ein|künstliche|excepcional|utiliza|transforme|Exemple|Dies ist|internacional|Hauptstadt)[^"]*"',
|
||||
# GPU runner name
|
||||
"runner-gpu-l4",
|
||||
]
|
||||
|
||||
[default.extend-words]
|
||||
# Project-specific terms that are not typos.
# Every entry maps a word to itself so that it is accepted as spelled.
|
||||
kreuzberg = "kreuzberg"
|
||||
zensical = "zensical"
|
||||
tesseract = "tesseract"
|
||||
onnx = "onnx"
|
||||
surrealdb = "surrealdb"
|
||||
docling = "docling"
|
||||
markitdown = "markitdown"
|
||||
pymupdf = "pymupdf"
|
||||
openwebui = "openwebui"
|
||||
webui = "webui"
|
||||
wasm = "wasm"
|
||||
mkdocs = "mkdocs"
|
||||
mkdocstrings = "mkdocstrings"
|
||||
rumdl = "rumdl"
|
||||
flate = "flate"
|
||||
tha = "tha"
|
||||
# Domain-specific terms
|
||||
opf = "opf"
|
||||
hocr = "hocr"
|
||||
odf = "odf"
|
||||
# LaTeX environments
|
||||
multline = "multline"
|
||||
hom = "hom"
|
||||
# RTF control words
|
||||
headerr = "headerr"
|
||||
pard = "pard"
|
||||
# DOC format field names
|
||||
edn = "edn"
|
||||
# DOCX measurement units (twips, 50ths of percent, 240ths of line)
|
||||
ths = "ths"
|
||||
# Test data / examples
|
||||
ove = "ove"
|
||||
ges = "ges"
|
||||
caf = "caf"
|
||||
helo = "helo"
|
||||
# Common short variable names / identifiers in code
|
||||
fo = "fo"
|
||||
pn = "pn"
|
||||
thr = "thr"
|
||||
nd = "nd"
|
||||
ba = "ba"
|
||||
iy = "iy"
|
||||
siz = "siz"
|
||||
# Tesseract upstream API spelling
|
||||
extention = "extention"
|
||||
# PDFium upstream constant/function naming
|
||||
portait = "portait"
|
||||
fith = "fith"
|
||||
threed = "threed"
|
||||
chlidren = "chlidren"
|
||||
formated = "formated"
|
||||
specifing = "specifing"
|
||||
# English variants / valid words flagged incorrectly
|
||||
unparseable = "unparseable"
|
||||
# Dutch word in crates/kreuzberg/tests/pdf_glyph_spacing_issue_962.rs module doc
|
||||
# ("relatie-id" = Dutch for "relation-id", from Microsoft Word broken-image placeholder)
|
||||
relatie = "relatie"
|
||||
# PaddleOCR upstream naming
|
||||
substract = "substract"
|
||||
charater = "charater"
|
||||
# OCR language codes (ISO 639)
|
||||
inh = "inh"
|
||||
bre = "bre"
|
||||
yor = "yor"
|
||||
# English suffix patterns in semantic analysis
|
||||
ment = "ment"
|
||||
# PaddleOCR upstream naming (continued)
|
||||
cliper = "cliper"
|
||||
# PDFium upstream doc typos
|
||||
similarily = "similarily"
|
||||
execpt = "execpt"
|
||||
faiure = "faiure"
|
||||
# Tesseract upstream code
|
||||
splitted = "splitted"
|
||||
# Short words flagged in code/data contexts
|
||||
mis = "mis"
|
||||
tre = "tre"
|
||||
ist = "ist"
|
||||
ein = "ein"
|
||||
# GPU runner name and its "l4" fragment; the full name is also listed in
|
||||
# extend-ignore-re under [default].
|
||||
# NOTE(review): confirm whether the hyphenated key can ever match as a word.
runner-gpu-l4 = "runner-gpu-l4"
|
||||
l4 = "l4"
|
||||
|
||||
[default.extend-identifiers]
|
||||
# Allow these identifiers in code.
# Every entry maps an identifier to itself so that the exact spelling is accepted.
|
||||
PyMuPDF = "PyMuPDF"
|
||||
MarkItDown = "MarkItDown"
|
||||
SurrealDB = "SurrealDB"
|
||||
PDFium = "PDFium"
|
||||
WebUI = "WebUI"
|
||||
traineddata = "traineddata"
|
||||
|
||||
[files]
|
||||
# Glob patterns for files and directories that are excluded from checking.
extend-exclude = [
|
||||
# Test fixtures and vendor files
|
||||
"test_documents/",
|
||||
"fixtures/",
|
||||
# Lock files
|
||||
"*.lock",
|
||||
"pnpm-lock.yaml",
|
||||
# Build artifacts
|
||||
"target/",
|
||||
"node_modules/",
|
||||
"dist/",
|
||||
"site/",
|
||||
# Code snippets (validated separately by alef-snippets)
|
||||
"docs/snippets/",
|
||||
# Generated files (e2e tests, bindgen)
|
||||
"e2e/",
|
||||
"**/bindgen/",
|
||||
# Stopwords files (foreign language words)
|
||||
"**/stopwords/",
|
||||
# Test data (hOCR samples, etc.)
|
||||
"**/test_data/",
|
||||
# Patch files (upstream diffs)
|
||||
"**/*.diff",
|
||||
"**/*.patch",
|
||||
# PDF text repair tests (intentionally broken text)
|
||||
"**/text_repair.rs",
|
||||
# Changelog (contains intentional examples of garbled text)
|
||||
"CHANGELOG.md",
|
||||
"docs/CHANGELOG.md",
|
||||
# Vendored / third-party code
|
||||
"**/vendor/",
|
||||
"**/vendored/",
|
||||
# Binary files and image assets
|
||||
"*.whl",
|
||||
"*.tar.gz",
|
||||
"*.png",
|
||||
"*.ico",
|
||||
"*.svg",
|
||||
]
|
||||
Reference in New Issue
Block a user