# Configuration for the typos spell checker.
# https://github.com/crate-ci/typos

[default]
# Regular expressions for text that the spell checker should skip entirely.
# Patterns containing backslashes or double quotes are written as TOML literal
# strings ('...') so the regex appears exactly as it is parsed, with no
# additional TOML-level escaping.
extend-ignore-re = [
  # Hex strings of 32+ characters (e.g. commit hashes, checksums)
  "[a-fA-F0-9]{32,}",
  # Base64 encoded strings: 40+ alphabet characters plus optional "=" padding
  "[A-Za-z0-9+/]{40,}={0,2}",
  # URLs — avoid flagging path segments
  'https?://[^\s]+',
  # ODF/XSL-FO namespace prefixes (fo:color, fo:font-size, etc.)
  "fo:[a-z-]+",
  "xsl-fo-compatible",
  # Unicode escapes wrapped in single quotes (e.g. '\u{0065}').
  # Kept as a basic string because the pattern itself contains apostrophes,
  # which a single-quoted literal string cannot hold.
  "'\\\\u\\{[0-9a-fA-F]+\\}'",
  # Foreign language text in test strings and OCR backend language lists
  '"[^"]*(?:programa|cursos|ist ein|künstliche|excepcional|utiliza|transforme|Exemple|Dies ist|internacional|Hauptstadt)[^"]*"',
  # GPU runner name
  "runner-gpu-l4",
]

[default.extend-words]
# Words accepted as spelled: every entry maps the word to itself.
# Entries are grouped by where the word comes from and sorted within a group.
# Project-specific terms that are not typos
docling = "docling"
flate = "flate"
kreuzberg = "kreuzberg"
markitdown = "markitdown"
mkdocs = "mkdocs"
mkdocstrings = "mkdocstrings"
onnx = "onnx"
openwebui = "openwebui"
pymupdf = "pymupdf"
rumdl = "rumdl"
surrealdb = "surrealdb"
tesseract = "tesseract"
tha = "tha"
wasm = "wasm"
webui = "webui"
zensical = "zensical"
# Domain-specific terms
hocr = "hocr"
odf = "odf"
opf = "opf"
# OCR language codes (ISO 639)
bre = "bre"
inh = "inh"
yor = "yor"
# LaTeX environments
hom = "hom"
multline = "multline"
# RTF control words
headerr = "headerr"
pard = "pard"
# DOC format field names
edn = "edn"
# DOCX measurement units (twips, 50ths of percent, 240ths of line)
ths = "ths"
# Spellings inherited from upstream projects
# Tesseract: upstream API spelling
extention = "extention"
# Tesseract: upstream code
splitted = "splitted"
# PDFium: upstream constant/function naming
chlidren = "chlidren"
fith = "fith"
formated = "formated"
portait = "portait"
specifing = "specifing"
threed = "threed"
# PDFium: upstream doc typos
execpt = "execpt"
faiure = "faiure"
similarily = "similarily"
# PaddleOCR: upstream naming
charater = "charater"
cliper = "cliper"
substract = "substract"
# English variants / valid words flagged incorrectly
unparseable = "unparseable"
# English suffix patterns in semantic analysis
ment = "ment"
# Dutch word in crates/kreuzberg/tests/pdf_glyph_spacing_issue_962.rs module doc
# ("relatie-id" = Dutch for "relation-id", from Microsoft Word broken-image placeholder)
relatie = "relatie"
# Test data / examples
caf = "caf"
ges = "ges"
helo = "helo"
ove = "ove"
# Common short variable names / identifiers in code
ba = "ba"
fo = "fo"
iy = "iy"
nd = "nd"
pn = "pn"
siz = "siz"
thr = "thr"
# Short words flagged in code/data contexts
ein = "ein"
ist = "ist"
mis = "mis"
tre = "tre"
# GPU runner name ("runner-gpu-l4") and its "l4" fragment
# NOTE(review): the checker appears to split on hyphens, so the hyphenated key
# may never match a single word — confirm before relying on or removing it.
l4 = "l4"
runner-gpu-l4 = "runner-gpu-l4"

[default.extend-identifiers]
# Whole identifiers accepted exactly as written (each maps to itself).
# Sorted in ASCII order: mixed-case product names first, then lowercase.
MarkItDown = "MarkItDown"
PDFium = "PDFium"
PyMuPDF = "PyMuPDF"
SurrealDB = "SurrealDB"
WebUI = "WebUI"
traineddata = "traineddata"

[files]
# Additional paths the spell checker skips, written as gitignore-style globs.
# Patterns ending in "/" target directories; a leading "**/" matches at any depth.
extend-exclude = [
  # Test fixtures and test documents
  "test_documents/",
  "fixtures/",
  # Lock files
  "*.lock",
  "pnpm-lock.yaml",
  # Build artifacts
  "target/",
  "node_modules/",
  "dist/",
  "site/",
  # Code snippets (validated separately by alef-snippets)
  "docs/snippets/",
  # Generated files (e2e tests, bindgen)
  "e2e/",
  "**/bindgen/",
  # Stopwords files (foreign language words)
  "**/stopwords/",
  # Test data (hOCR samples, etc.)
  "**/test_data/",
  # Patch files (upstream diffs)
  "**/*.diff",
  "**/*.patch",
  # PDF text repair tests (intentionally broken text)
  "**/text_repair.rs",
  # Changelog (contains intentional examples of garbled text)
  "CHANGELOG.md",
  "docs/CHANGELOG.md",
  # Vendored / third-party code
  "**/vendor/",
  "**/vendored/",
  # Binary artifacts and image assets (*.svg is text-based but excluded as well)
  "*.whl",
  "*.tar.gz",
  "*.png",
  "*.ico",
  "*.svg",
]