# Configuration for the typos spell checker.
# https://github.com/crate-ci/typos

[default]
# Regexes whose matches are skipped entirely by the spell checker.
extend-ignore-re = [
  # Hex strings (e.g. commit hashes, checksums)
  "[a-fA-F0-9]{32,}",
  # Base64 encoded strings
  "[A-Za-z0-9+/]{40,}={0,2}",
  # URLs — avoid flagging path segments
  "https?://[^\\s]+",
  # ODF/XSL-FO namespace prefixes (fo:color, fo:font-size, etc.)
  "fo:[a-z-]+",
  "xsl-fo-compatible",
  # Single-quoted Unicode escapes (e.g. '\u{0065}')
  "'\\\\u\\{[0-9a-fA-F]+\\}'",
  # Foreign language text in test strings and OCR backend language lists
  '"[^"]*(?:programa|cursos|ist ein|künstliche|excepcional|utiliza|transforme|Exemple|Dies ist|internacional|Hauptstadt)[^"]*"',
  # GPU runner name
  "runner-gpu-l4",
]

# Each entry maps a word to itself, which tells typos to accept it as spelled.
[default.extend-words]
# Project-specific terms that are not typos.
kreuzberg = "kreuzberg"
zensical = "zensical"
tesseract = "tesseract"
onnx = "onnx"
surrealdb = "surrealdb"
docling = "docling"
markitdown = "markitdown"
pymupdf = "pymupdf"
openwebui = "openwebui"
webui = "webui"
wasm = "wasm"
mkdocs = "mkdocs"
mkdocstrings = "mkdocstrings"
rumdl = "rumdl"
flate = "flate"
tha = "tha"

# Domain-specific terms
opf = "opf"
hocr = "hocr"
odf = "odf"

# LaTeX environments
multline = "multline"
hom = "hom"

# RTF control words
headerr = "headerr"
pard = "pard"

# DOC format field names
edn = "edn"

# DOCX measurement units (twips, 50ths of percent, 240ths of line)
ths = "ths"

# Test data / examples
ove = "ove"
ges = "ges"
caf = "caf"
helo = "helo"

# Common short variable names / identifiers in code
fo = "fo"
pn = "pn"
thr = "thr"
nd = "nd"
ba = "ba"
iy = "iy"
siz = "siz"

# Tesseract upstream API spelling
extention = "extention"

# PDFium upstream constant/function naming
portait = "portait"
fith = "fith"
threed = "threed"
chlidren = "chlidren"
formated = "formated"
specifing = "specifing"

# English variants / valid words flagged incorrectly
unparseable = "unparseable"

# Dutch word in crates/kreuzberg/tests/pdf_glyph_spacing_issue_962.rs module doc
# ("relatie-id" = Dutch for "relation-id", from Microsoft Word broken-image placeholder)
relatie = "relatie"

# PaddleOCR upstream naming
substract = "substract"
charater = "charater"
cliper = "cliper"

# OCR language codes (ISO 639)
inh = "inh"
bre = "bre"
yor = "yor"

# English suffix patterns in semantic analysis
ment = "ment"

# PDFium upstream doc typos
similarily = "similarily"
execpt = "execpt"
faiure = "faiure"

# Tesseract upstream code
splitted = "splitted"

# Short words flagged in code/data contexts
mis = "mis"
tre = "tre"
ist = "ist"
ein = "ein"

# GPU runner name.
# NOTE(review): "runner-gpu-l4" is already skipped by the extend-ignore-re
# pattern in [default]; a hyphenated key presumably never matches a single
# tokenized word, so this entry may be redundant — confirm before removing.
runner-gpu-l4 = "runner-gpu-l4"
l4 = "l4"

[default.extend-identifiers]
# Allow these identifiers in code
PyMuPDF = "PyMuPDF"
MarkItDown = "MarkItDown"
SurrealDB = "SurrealDB"
PDFium = "PDFium"
WebUI = "WebUI"
traineddata = "traineddata"

[files]
# Paths and globs that are not spell-checked at all.
extend-exclude = [
  # Test fixtures and vendor files
  "test_documents/",
  "fixtures/",
  # Lock files
  "*.lock",
  "pnpm-lock.yaml",
  # Build artifacts
  "target/",
  "node_modules/",
  "dist/",
  "site/",
  # Code snippets (validated separately by alef-snippets)
  "docs/snippets/",
  # Generated files (e2e tests, bindgen)
  "e2e/",
  "**/bindgen/",
  # Stopwords files (foreign language words)
  "**/stopwords/",
  # Test data (hOCR samples, etc.)
  "**/test_data/",
  # Patch files (upstream diffs)
  "**/*.diff",
  "**/*.patch",
  # PDF text repair tests (intentionally broken text)
  "**/text_repair.rs",
  # Changelog (contains intentional examples of garbled text)
  "CHANGELOG.md",
  "docs/CHANGELOG.md",
  # Vendored / third-party code
  "**/vendor/",
  "**/vendored/",
  # Binary files
  "*.whl",
  "*.tar.gz",
  "*.png",
  "*.ico",
  "*.svg",
]