// crates/kreuzberg-node/index.d.ts

// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
/* eslint-disable */

/**
 * Any value representable in JSON: a primitive, `null`, an array of JSON
 * values, or a string-keyed object whose values are themselves JSON values.
 */
export type JsonValue =
  | string
  | number
  | boolean
  | null
  | JsonValue[]
  | { [key: string]: JsonValue };

/**
 * Extract content from multiple byte arrays concurrently.
 *
 * Items are processed in parallel with bounded concurrency so that resources
 * are not exhausted. The limit is taken from the batch-level configuration
 * (`ExtractionConfig::max_concurrent_extractions` in the Rust core) and
 * otherwise defaults to `(num_cpus * 1.5).ceil()`.
 *
 * Each item may carry its own `FileExtractionConfig` that overrides specific
 * fields of the batch-level `config`. Omit an item's `config` to use the
 * batch-level defaults for that item.
 *
 * @param items - Array of `BatchBytesItem` objects, each containing content
 *   bytes, a MIME type, and optional per-item configuration overrides.
 * @param config - Batch-level extraction configuration.
 * @returns A promise for an array of `ExtractionResult`, in the same order as
 *   the input items.
 */
export declare function batchExtractBytes(items: Array<BatchBytesItem>, config?: ExtractionConfig | undefined | null): Promise<Array<ExtractionResult>>;

/**
 * Synchronous wrapper for `batch_extract_bytes` (exposed here as
 * `batchExtractBytes`).
 *
 * Uses the global Tokio runtime for optimal performance.
 * With the `tokio-runtime` feature, this blocks the current thread using the global
 * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
 * that iterates through items and calls `extract_bytes_sync()`.
 *
 * @param items - Array of `BatchBytesItem` objects to extract.
 * @param config - Optional batch-level extraction configuration.
 * @returns An array of `ExtractionResult`, returned directly rather than as a promise.
 */
export declare function batchExtractBytesSync(items: Array<BatchBytesItem>, config?: ExtractionConfig | undefined | null): Array<ExtractionResult>;

/**
 * Extract content from multiple files concurrently.
 *
 * Files are processed in parallel with bounded concurrency so that resources
 * are not exhausted. The limit is taken from the batch-level configuration
 * (`ExtractionConfig::max_concurrent_extractions` in the Rust core) and
 * otherwise defaults to `(num_cpus * 1.5).ceil()`.
 *
 * Each file may carry its own `FileExtractionConfig` that overrides specific
 * fields of the batch-level `config`. Omit a file's `config` to use the batch
 * defaults. Batch-level settings like `max_concurrent_extractions` and
 * `use_cache` are always taken from the batch-level `config`.
 *
 * @param items - Array of `BatchFileItem` objects, each containing a path and
 *   optional per-file configuration overrides.
 * @param config - Batch-level extraction configuration (provides defaults and
 *   batch settings).
 * @returns A promise for an array of `ExtractionResult`, in the same order as
 *   the input items.
 * @throws Individual file errors are captured in the result metadata. System errors
 *   (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
 */
export declare function batchExtractFiles(items: Array<BatchFileItem>, config?: ExtractionConfig | undefined | null): Promise<Array<ExtractionResult>>;

/**
 * Synchronous wrapper for `batch_extract_files` (exposed here as
 * `batchExtractFiles`).
 *
 * Uses the global Tokio runtime for optimal performance.
 * Only available with `tokio-runtime` (WASM has no filesystem).
 *
 * @param items - Array of `BatchFileItem` objects to extract.
 * @param config - Optional batch-level extraction configuration.
 * @returns An array of `ExtractionResult`, returned directly rather than as a promise.
 */
export declare function batchExtractFilesSync(items: Array<BatchFileItem>, config?: ExtractionConfig | undefined | null): Array<ExtractionResult>;

/**
 * Clear all document extractors from the global registry.
 *
 * Calls `shutdown()` on every registered extractor, then empties the registry.
 *
 * @throws Any error returned by an extractor's `shutdown()` method. The first error
 *   encountered stops processing of remaining extractors.
 */
export declare function clearDocumentExtractors(): void;

/**
 * Clear all embedding backends from the global registry.
 *
 * Calls `shutdown()` on every registered backend, then empties the registry.
 *
 * @throws Any error returned by a backend's `shutdown()` method. The first error
 *   encountered stops processing of remaining backends.
 */
export declare function clearEmbeddingBackends(): void;

/**
 * Clear all OCR backends from the global registry.
 *
 * Removes all OCR backends and calls their `shutdown()` methods.
 *
 * The declared return type is `void`: nothing is returned when all backends
 * were cleared successfully (the Rust core's `Ok(())` case).
 *
 * @throws If any shutdown method failed (the Rust core's `Err(...)` case).
 */
export declare function clearOcrBackends(): void;

/**
 * Remove all registered post-processors.
 *
 * Takes no arguments and returns nothing.
 */
export declare function clearPostProcessors(): void;

/**
 * Clear all renderers from the global registry.
 *
 * Removes every renderer, including the built-in defaults (markdown, html,
 * djot, plain). After calling this no renderers are registered; re-register
 * as needed.
 *
 * @throws If the registry lock is poisoned.
 */
export declare function clearRenderers(): void;

/**
 * Remove all registered validators.
 *
 * Takes no arguments and returns nothing.
 */
export declare function clearValidators(): void;

/**
 * Compare two extraction results and return a structured diff.
 *
 * The comparison is purely structural — no I/O, no side effects. All fields
 * of `ExtractionDiff` are populated according to the provided `DiffOptions`.
 *
 * All three parameters are declared optional and nullable in this signature.
 * NOTE(review): what happens when `a` or `b` is omitted is not visible from
 * this declaration — confirm against the native implementation.
 *
 * @param a - The "before" extraction result.
 * @param b - The "after" extraction result.
 * @param opts - Controls which sections are compared and optional truncation.
 * @returns The structured `ExtractionDiff` between `a` and `b`.
 */
export declare function compare(a?: ExtractionResult | undefined | null, b?: ExtractionResult | undefined | null, opts?: DiffOptions | undefined | null): ExtractionDiff;

/**
 * Detect the MIME type of a file at the given path.
 *
 * Uses the file extension and optionally the file content to determine the MIME type.
 *
 * @param path - Path of the file whose MIME type should be detected.
 * @param checkExists - Set to `true` to verify the file exists before detection.
 * @returns The detected MIME type string.
 */
export declare function detectMimeType(path: string, checkExists: boolean): string;

/**
 * Detect MIME type from raw file bytes.
 *
 * Uses magic byte signatures to detect file type from content.
 * Falls back to `infer` crate for comprehensive detection.
 *
 * For ZIP-based files, inspects contents to distinguish Office Open XML
 * formats (DOCX, XLSX, PPTX) from plain ZIP archives.
 *
 * @param content - Raw file bytes.
 * @returns The detected MIME type string.
 * @throws `KreuzbergError::UnsupportedFormat` if the MIME type cannot be determined.
 */
export declare function detectMimeTypeFromBytes(content: Uint8Array): string;

/**
 * Embed a list of texts using the configured embedding model.
 *
 * @param texts - The strings to embed.
 * @param config - Optional embedding configuration.
 * @returns An array of arrays, where each inner array is the embedding for
 *   the corresponding text.
 */
export declare function embedTexts(texts: Array<string>, config?: EmbeddingConfig | undefined | null): Array<Array<number>>;

/**
 * Generate embeddings asynchronously for a list of text strings.
 *
 * This is the async counterpart to `embedTexts`. It offloads the blocking
 * ONNX inference work to a dedicated blocking thread pool via Tokio's
 * `spawn_blocking`, keeping the async executor free.
 *
 * @param texts - Strings to embed (owned, sent to the blocking thread).
 * @param config - Embedding configuration specifying model, batch size, and normalization.
 * @returns A promise for one embedding vector per input text, in the same order.
 * @throws `KreuzbergError::MissingDependency` if ONNX Runtime is not installed.
 * @throws `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
 *   or the blocking inference task panics.
 */
export declare function embedTextsAsync(texts: Array<string>, config?: EmbeddingConfig | undefined | null): Promise<Array<Array<number>>>;

/**
 * Extract content from a byte array.
 *
 * This is the main entry point for in-memory extraction. It performs the following steps:
 * 1. Validate MIME type
 * 2. Handle legacy format conversion if needed
 * 3. Select appropriate extractor from registry
 * 4. Extract content
 * 5. Run post-processing pipeline
 *
 * @param content - The byte array to extract.
 * @param mimeType - MIME type of the content.
 * @param config - Extraction configuration.
 * @returns A promise for an `ExtractionResult` containing the extracted content and metadata.
 * @throws `KreuzbergError::Validation` if the MIME type is invalid.
 * @throws `KreuzbergError::UnsupportedFormat` if the MIME type is not supported.
 */
export declare function extractBytes(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): Promise<ExtractionResult>;

/**
 * Synchronous wrapper for `extract_bytes` (exposed here as `extractBytes`).
 *
 * Uses the global Tokio runtime for 100x+ performance improvement over creating
 * a new runtime per call.
 *
 * With the `tokio-runtime` feature, this blocks the current thread using the global
 * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
 *
 * @param content - The byte array to extract.
 * @param mimeType - MIME type of the content.
 * @param config - Optional extraction configuration.
 * @returns The `ExtractionResult`, returned directly rather than as a promise.
 */
export declare function extractBytesSync(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): ExtractionResult;

/**
 * Extract content from a file.
 *
 * This is the main entry point for file-based extraction. It performs the following steps:
 * 1. Check cache for existing result (if caching enabled)
 * 2. Detect or validate MIME type
 * 3. Select appropriate extractor from registry
 * 4. Extract content
 * 5. Run post-processing pipeline
 * 6. Store result in cache (if caching enabled)
 *
 * @param path - Path to the file to extract.
 * @param mimeType - Optional MIME type override. If omitted or `null`, it is auto-detected.
 * @param config - Extraction configuration.
 * @returns A promise for an `ExtractionResult` containing the extracted content and metadata.
 * @throws `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
 * @throws `KreuzbergError::UnsupportedFormat` if the MIME type is not supported.
 */
export declare function extractFile(path: string, mimeType?: string | undefined | null, config?: ExtractionConfig | undefined | null): Promise<ExtractionResult>;

/**
 * Synchronous wrapper for `extract_file` (exposed here as `extractFile`).
 *
 * This is a convenience function that blocks the current thread until extraction completes.
 * For async code, use `extractFile` directly.
 *
 * Uses the global Tokio runtime for 100x+ performance improvement over creating
 * a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
 *
 * This function is only available with the `tokio-runtime` feature. For WASM targets,
 * use a truly synchronous extraction approach instead.
 *
 * @param path - Path to the file to extract.
 * @param mimeType - Optional MIME type override.
 * @param config - Optional extraction configuration.
 * @returns The `ExtractionResult`, returned directly rather than as a promise.
 */
export declare function extractFileSync(path: string, mimeType?: string | undefined | null, config?: ExtractionConfig | undefined | null): ExtractionResult;

/**
 * Get an embedding preset by name.
 *
 * The native side returns an owned clone so the value is safe to pass across
 * FFI boundaries.
 *
 * @param name - Name of the preset to look up.
 * @returns The matching `EmbeddingPreset`, or `null` if no preset with the
 *   given name exists.
 */
export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;

/**
 * Get file extensions for a given MIME type.
 *
 * Returns all known file extensions that map to the specified MIME type.
 *
 * @param mimeType - The MIME type to look up.
 * @returns An array of file extensions (without leading dot) for the MIME type.
 */
export declare function getExtensionsForMime(mimeType: string): Array<string>;

/**
 * ONNX Runtime hardware-acceleration settings.
 *
 * Selects the execution provider (CPU, CoreML, CUDA, TensorRT) used for
 * inference during layout detection and embedding generation.
 */
export interface AccelerationConfig {
  /** Which execution provider ONNX inference should run on. */
  readonly provider?: ExecutionProviderType;
  /** GPU device ID; applies to CUDA/TensorRT and is ignored for CPU/CoreML/Auto. */
  readonly deviceId?: number;
}

/**
 * Types of inline text annotations.
 *
 * A tagged union discriminated by the `annotation_type` string. Most variants
 * carry only the tag; `link`, `color`, `font_size` and `custom` carry the
 * additional string fields shown on their variant.
 */
export type AnnotationKind =
  | { annotation_type: 'bold' }
  | { annotation_type: 'italic' }
  | { annotation_type: 'underline' }
  | { annotation_type: 'strikethrough' }
  | { annotation_type: 'code' }
  | { annotation_type: 'subscript' }
  | { annotation_type: 'superscript' }
  // Hyperlink: target `url` plus a `title` string.
  | { annotation_type: 'link'; url: string; title: string }
  | { annotation_type: 'highlight' }
  // Color and font size are both carried as strings in `value`.
  | { annotation_type: 'color'; value: string }
  | { annotation_type: 'font_size'; value: string }
  // Free-form annotation identified by `name` with a string `value`.
  | { annotation_type: 'custom'; name: string; value: string }

/**
 * One file pulled out of an archive.
 *
 * With recursive extraction enabled, every processable file inside an archive
 * (ZIP, TAR, 7Z, GZIP) yields its own complete `ExtractionResult`.
 */
export interface ArchiveEntry {
  /** File path relative to the archive root (e.g. "folder/document.pdf"). */
  readonly path: string;
  /** MIME type detected for the file. */
  readonly mimeType: string;
  /** Complete extraction result for the file. */
  readonly result: ExtractionResult;
}

/**
 * Metadata for an archive (ZIP/TAR/7Z).
 *
 * Gathered from compressed archive files: the file listing and size information.
 */
export interface ArchiveMetadata {
  /** Format of the archive ("ZIP", "TAR", "7Z", etc.). */
  readonly format?: string;
  /** How many files the archive contains in total. */
  readonly fileCount?: number;
  /** Paths of the files inside the archive. */
  readonly fileList?: string[];
  /** Total size in bytes once uncompressed. */
  readonly totalSize?: number;
  /** Size in bytes while compressed (if available). */
  readonly compressedSize?: number;
}

/**
 * One entry of a byte-array batch extraction job.
 *
 * Passed to `batchExtractBytes` and `batchExtractBytesSync`.
 */
export interface BatchBytesItem {
  /** Bytes whose content should be extracted. */
  readonly content: Uint8Array;
  /** MIME type of `content` (e.g., "application/pdf", "text/html"). */
  readonly mimeType: string;
  /** Overrides applied to this item only; leave unset to use the batch-level defaults. */
  readonly config?: FileExtractionConfig;
}

/**
 * One entry of a file batch extraction job.
 *
 * Passed to `batchExtractFiles` and `batchExtractFilesSync`.
 */
export interface BatchFileItem {
  /** Path of the file to extract from. */
  readonly path: string;
  /** Overrides applied to this file only; leave unset to use the batch-level defaults. */
  readonly config?: FileExtractionConfig;
}

/**
 * Bounding box expressed in original image coordinates: (x1, y1) is the
 * top-left corner and (x2, y2) the bottom-right corner.
 */
export interface BBox {
  /** X coordinate of the top-left corner. */
  readonly x1: number;
  /** Y coordinate of the top-left corner. */
  readonly y1: number;
  /** X coordinate of the bottom-right corner. */
  readonly x2: number;
  /** Y coordinate of the bottom-right corner. */
  readonly y2: number;
}

/** Metadata describing a BibTeX bibliography. */
export interface BibtexMetadata {
  /** How many entries the bibliography contains. */
  readonly entryCount?: number;
  /** Citation keys found in the bibliography (presumably one per entry — confirm against the extractor). */
  readonly citationKeys?: string[];
  /** Author names found in the bibliography. */
  readonly authors?: string[];
  /** Range of years, as a `YearRange` (declared outside this block). */
  readonly yearRange?: YearRange;
  /** Number per string key; presumably a count per entry type (article, book, …) — TODO confirm. */
  readonly entryTypes?: Record<string, number>;
}

/**
 * Types of block-level elements in Djot.
 *
 * Each member's string value is the snake_case form of its name.
 */
export declare enum BlockType {
  Paragraph = "paragraph",
  Heading = "heading",
  Blockquote = "blockquote",
  CodeBlock = "code_block",
  ListItem = "list_item",
  OrderedList = "ordered_list",
  BulletList = "bullet_list",
  TaskList = "task_list",
  DefinitionList = "definition_list",
  DefinitionTerm = "definition_term",
  DefinitionDescription = "definition_description",
  Div = "div",
  Section = "section",
  ThematicBreak = "thematic_break",
  RawBlock = "raw_block",
  MathDisplay = "math_display",
}

/**
 * Coordinates of a bounding box used to position an element.
 *
 * `y0` is the bottom edge and `y1` the top edge.
 */
export interface BoundingBox {
  /** X-coordinate of the left edge. */
  readonly x0?: number;
  /** Y-coordinate of the bottom edge. */
  readonly y0?: number;
  /** X-coordinate of the right edge. */
  readonly x1?: number;
  /** Y-coordinate of the top edge. */
  readonly y1?: number;
}

/**
 * Aggregate statistics about a cache.
 *
 * NOTE(review): the producer of this structure is not visible in this file;
 * the field notes below are read off the field names and should be confirmed.
 */
export interface CacheStats {
  /** Total number of files. */
  readonly totalFiles: number;
  /** Total size; the `Mb` suffix suggests megabytes — TODO confirm the exact unit. */
  readonly totalSizeMb: number;
  /** Available space; the `Mb` suffix suggests megabytes — TODO confirm the exact unit. */
  readonly availableSpaceMb: number;
  /** Age of the oldest file, in days. */
  readonly oldestFileAgeDays: number;
  /** Age of the newest file, in days. */
  readonly newestFileAgeDays: number;
}

/**
 * One changed cell inside a table.
 *
 * In the Rust core this type lives outside `crate::diff` so that
 * `RevisionDelta` can reference it without the `diff` Cargo feature;
 * `crate::diff` re-exports it verbatim.
 */
export interface CellChange {
  /** Row index, counted from zero. */
  readonly row: number;
  /** Column index, counted from zero. */
  readonly col: number;
  /** Cell value before the change. */
  readonly from: string;
  /** Cell value after the change. */
  readonly to: string;
}

/**
 * A chunk of text together with its metadata and an optional embedding.
 *
 * Chunks exist when chunking is enabled in `ExtractionConfig`. Each one holds
 * its text, an embedding vector when embedding generation is configured, and
 * metadata locating it within the document.
 */
export interface Chunk {
  /** Text of the chunk. */
  readonly content: string;
  /**
   * Semantic structural classification of the chunk.
   *
   * Set by the heuristic classifier from content patterns and heading
   * context; `ChunkType.Unknown` when no rule matches.
   */
  readonly chunkType: ChunkType;
  /**
   * Embedding vector for the chunk, if any.
   *
   * Present only when an `EmbeddingConfig` is supplied in the chunking
   * configuration. Its length depends on the chosen embedding model.
   */
  readonly embedding?: number[];
  /** Position and property metadata for the chunk. */
  readonly metadata: ChunkMetadata;
}

/**
 * Type of text chunker to use.
 *
 * The `topicThreshold` and `maxCharacters` settings mentioned below are
 * fields of `ChunkingConfig`.
 */
export declare enum ChunkerType {
  /** Generic text splitter; splits on whitespace and punctuation. */
  Text = "text",
  /** Markdown-aware splitter; preserves formatting and structure. */
  Markdown = "markdown",
  /** YAML-aware splitter; creates one chunk per top-level key. */
  Yaml = "yaml",
  /**
   * Topic-aware chunker. With an `EmbeddingConfig`, splits at embedding-based
   * topic shifts tuned by `topicThreshold` (default 0.75, lower = more
   * splits). Without an embedding, falls back to a structural-boundary
   * heuristic (ALL-CAPS headers, numbered sections, blank-line paragraphs)
   * and merges groups into chunks capped at `maxCharacters` (default 1000).
   * `topicThreshold` has no effect in the fallback path. For best results,
   * pair with an embedding model.
   */
  Semantic = "semantic",
}

/**
 * Settings for chunking document text.
 *
 * Covers chunk size, overlap, trimming behavior, and optional embeddings.
 *
 * Every field is optional, so callers need only set the ones they want to
 * change. (The Rust core recommends `..Default::default()` for the same
 * reason: fields may be added in the future.)
 */
export interface ChunkingConfig {
  /**
   * Upper bound on the size of a chunk, in the unit selected by `sizing`.
   *
   * With `sizing` set to `Characters` (the default) this is a character
   * count; with token-based sizing it is a token count.
   *
   * Default: 1000
   */
  readonly maxCharacters?: number;
  /**
   * Amount of overlap between chunks, in the unit selected by `sizing`.
   *
   * Default: 200
   */
  readonly overlap?: number;
  /**
   * Trim whitespace at chunk boundaries.
   *
   * Default: true
   */
  readonly trim?: boolean;
  /**
   * Which chunker to use; any member of `ChunkerType`.
   *
   * Default: Text
   */
  readonly chunkerType?: ChunkerType;
  /** Embedding configuration for chunk embeddings, if wanted. */
  readonly embedding?: EmbeddingConfig;
  /** Name of a preset configuration (overrides individual settings if provided). */
  readonly preset?: string;
  /**
   * Unit used to measure chunk size.
   *
   * Default: `Characters` (Unicode character count).
   * Token-based sizing requires the `chunking-tiktoken` or
   * `chunking-tokenizers` features.
   */
  readonly sizing?: ChunkSizing;
  /**
   * With `chunkerType` set to `Markdown` and this flag `true`, the heading
   * hierarchy path (e.g. `"# Title > ## Section\n\n"`) is prepended to each
   * chunk's content string.
   *
   * Useful for RAG pipelines in which every chunk needs self-contained
   * context about where it sits in the document structure.
   *
   * Default: `false`
   */
  readonly prependHeadingContext?: boolean;
  /**
   * Cosine similarity threshold for detecting semantic topic boundaries.
   *
   * Consulted only when `chunkerType` is `Semantic` and an `EmbeddingConfig`
   * is provided. It rarely needs setting: when omitted it defaults to `0.75`,
   * which works well for most documents. Lower values detect more topic
   * boundaries (more, smaller chunks); higher values detect fewer.
   * Range: `0.0..=1.0`.
   */
  readonly topicThreshold?: number;
}

/** Describes where a chunk sits in the original document. */
export interface ChunkMetadata {
  /** Byte offset in the original text at which the chunk begins (UTF-8 valid boundary). */
  readonly byteStart: number;
  /** Byte offset in the original text at which the chunk ends (UTF-8 valid boundary). */
  readonly byteEnd: number;
  /**
   * Token count of the chunk (if available).
   *
   * Computed by the embedding model's tokenizer if embeddings are enabled.
   */
  readonly tokenCount?: number;
  /** Index of the chunk within the document, counted from zero. */
  readonly chunkIndex: number;
  /** How many chunks the document has in total. */
  readonly totalChunks: number;
  /**
   * First page the chunk spans (1-indexed).
   *
   * Set only when page tracking is enabled in the extraction configuration.
   */
  readonly firstPage?: number;
  /**
   * Last page the chunk spans (1-indexed; equals `firstPage` for single-page chunks).
   *
   * Set only when page tracking is enabled in the extraction configuration.
   */
  readonly lastPage?: number;
  /**
   * Heading context produced by the Markdown chunker.
   *
   * Holds the heading hierarchy the chunk falls under. Set only when
   * `ChunkerType.Markdown` is used.
   */
  readonly headingContext?: HeadingContext;
  /**
   * Indices into `ExtractionResult.images` for images on pages this chunk covers.
   *
   * Zero-based indices into the top-level `images` collection, one for every
   * image whose `page_number` lies within `[firstPage, lastPage]`. Empty when
   * image extraction is disabled or the chunk spans no pages with images.
   */
  readonly imageIndices: number[];
}

/**
 * How chunk size is measured.
 *
 * Defaults to `Characters` (Unicode character count). When using token-based sizing,
 * chunks are sized by token count according to the specified tokenizer.
 *
 * Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
 * available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
 * (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 *
 * This is a tagged union discriminated by the `type` string.
 */
export type ChunkSizing =
  // Size by Unicode character count; carries no extra fields.
  | { type: 'characters' }
  // Size by token count: `model` names the tokenizer; `cacheDir` is presumably
  // where tokenizer files are cached — TODO confirm.
  | { type: 'tokenizer'; model: string; cacheDir: string }

/**
 * Semantic structural classification of a text chunk.
 *
 * Assigned by the heuristic classifier in `chunking::classifier` (Rust core).
 * Defaults to `Unknown` when no rule matches.
 * Designed to be extended in future versions without breaking changes, so
 * consumers should be prepared for members beyond those listed here.
 */
export declare enum ChunkType {
  /** Section heading or document title. */
  Heading = "heading",
  /** Party list: names, addresses, and signatories. */
  PartyList = "party_list",
  /** Definition clause ("X means…", "X shall mean…"). */
  Definitions = "definitions",
  /** Operative clause containing legal/contractual action verbs. */
  OperativeClause = "operative_clause",
  /** Signature block with signatures, names, and dates. */
  SignatureBlock = "signature_block",
  /** Schedule, annex, appendix, or exhibit section. */
  Schedule = "schedule",
  /** Table-like content with aligned columns or repeated patterns. */
  TableLike = "table_like",
  /** Mathematical formula or equation. */
  Formula = "formula",
  /** Code block or preformatted content. */
  CodeBlock = "code_block",
  /** Embedded or referenced image content. */
  Image = "image",
  /** Organizational chart or hierarchy diagram. */
  OrgChart = "org_chart",
  /** Diagram, figure, or visual illustration. */
  Diagram = "diagram",
  /** Unclassified or mixed content; the default when no rule matches. */
  Unknown = "unknown",
}

/** Metadata for citation files (RIS, PubMed, EndNote). */
export interface CitationMetadata {
  /** Number of citations. */
  readonly citationCount?: number;
  /** Format of the citation file (presumably one of the formats named above — confirm). */
  readonly format?: string;
  /** Author names. */
  readonly authors?: string[];
  /** Range of years, as a `YearRange` (declared outside this block). */
  readonly yearRange?: YearRange;
  /** DOIs found in the file. */
  readonly dois?: string[];
  /** Keywords found in the file. */
  readonly keywords?: string[];
}

/**
 * Content rendering mode for code extraction.
 *
 * Controls how extracted code content is represented in the `content` field
 * of `ExtractionResult`.
 */
export declare enum CodeContentMode {
  /** Use TSLP semantic chunks as content (default). */
  Chunks = "chunks",
  /** Use raw source code as content. */
  Raw = "raw",
  /** Emit function/class headings + docstrings (no code bodies). */
  Structure = "structure",
}

/**
 * Content filtering settings shared by all extractors.
 *
 * Decides whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is kept in or stripped from extraction results.
 * It applies to every extractor (PDF, DOCX, RTF, ODT, HTML, etc.), each with
 * a format-specific implementation.
 *
 * When this is unset on `ExtractionConfig`, each extractor keeps its current
 * default behavior unchanged.
 */
export interface ContentFilterConfig {
  /**
   * Keep running headers in the extraction output.
   *
   * - PDF: Disables top-margin furniture stripping and prevents the layout
   *   model from treating `PageHeader`-classified regions as furniture.
   * - DOCX: Includes document headers in text output.
   * - RTF/ODT: Headers already included; this is a no-op when true.
   * - HTML/EPUB: Keeps `<header>` element content.
   *
   * Default: `false` (headers are stripped or excluded).
   */
  readonly includeHeaders?: boolean;
  /**
   * Keep running footers in the extraction output.
   *
   * - PDF: Disables bottom-margin furniture stripping and prevents the layout
   *   model from treating `PageFooter`-classified regions as furniture.
   * - DOCX: Includes document footers in text output.
   * - RTF/ODT: Footers already included; this is a no-op when true.
   * - HTML/EPUB: Keeps `<footer>` element content.
   *
   * Default: `false` (footers are stripped or excluded).
   */
  readonly includeFooters?: boolean;
  /**
   * Turn the heuristic cross-page repeating text detector on or off.
   *
   * When `true` (default), text repeated verbatim across a supermajority of
   * pages is classified as furniture and stripped. Turn it off if brand names
   * or repeated headings are being wrongly removed by the heuristic.
   *
   * Note: when a layout-detection model is active, the model may independently
   * classify page-header / page-footer regions as furniture on a per-page
   * basis. To preserve those regions, set `includeHeaders`, `includeFooters`,
   * or both to `true`, in addition to disabling this flag.
   *
   * Primarily affects PDF extraction.
   *
   * Default: `true`.
   */
  readonly stripRepeatingText?: boolean;
  /**
   * Keep watermark text in the extraction output.
   *
   * - PDF: Keeps watermark artifacts and arXiv identifiers.
   * - Other formats: No effect currently.
   *
   * Default: `false` (watermarks are stripped).
   */
  readonly includeWatermarks?: boolean;
}

/**
 * Content layer classification for document nodes.
 *
 * Replaces separate body/furniture arrays with per-node granularity: each
 * node is tagged with exactly one of the layers below.
 */
export declare enum ContentLayer {
  /** Main document body content. */
  Body = "body",
  /** Page/section header (running header). */
  Header = "header",
  /** Page/section footer (running footer). */
  Footer = "footer",
  /** Footnote content. */
  Footnote = "footnote",
}

/** A JATS contributor together with an optional role. */
export interface ContributorRole {
  /** Name of the contributor. */
  readonly name: string;
  /** Role of the contributor, when one is given. */
  readonly role?: string;
}

/**
 * Dublin Core metadata read from docProps/core.xml.
 *
 * Holds the standard Dublin Core fields plus Office-specific extensions.
 * Every field is an optional string.
 */
export interface CoreProperties {
  /** Title of the document. */
  readonly title?: string;
  /** Subject or topic of the document. */
  readonly subject?: string;
  /** Creator or author of the document. */
  readonly creator?: string;
  /** Keywords or tags. */
  readonly keywords?: string;
  /** Description or abstract of the document. */
  readonly description?: string;
  /** User who made the most recent modification. */
  readonly lastModifiedBy?: string;
  /** Revision number. */
  readonly revision?: string;
  /** When the document was created (ISO 8601). */
  readonly created?: string;
  /** When the document was last modified (ISO 8601). */
  readonly modified?: string;
  /** Category of the document. */
  readonly category?: string;
  /** Content status (Draft, Final, etc.). */
  readonly contentStatus?: string;
  /** Language of the document. */
  readonly language?: string;
  /** Unique identifier. */
  readonly identifier?: string;
  /** Version of the document. */
  readonly version?: string;
  /** When the document was last printed (ISO 8601). */
  readonly lastPrinted?: string;
}

/** Metadata for a CSV/TSV file. */
export interface CsvMetadata {
  /** Number of rows. */
  readonly rowCount?: number;
  /** Number of columns. */
  readonly columnCount?: number;
  /** Field delimiter, as a string. */
  readonly delimiter?: string;
  /** Whether the file has a header (presumably a header row — confirm). */
  readonly hasHeader?: boolean;
  /** Column types as strings (presumably one per column — confirm). */
  readonly columnTypes?: string[];
}

/** Describes a single dBASE field. */
export interface DbfFieldInfo {
  /** Name of the field. */
  readonly name: string;
  /** Type of the field, as a string. */
  readonly fieldType: string;
}

/** Metadata for a dBASE (DBF) file. */
export interface DbfMetadata {
  /** Number of records. */
  readonly recordCount?: number;
  /** Number of fields. */
  readonly fieldCount?: number;
  /** Field descriptions, each a `DbfFieldInfo`. */
  readonly fields?: DbfFieldInfo[];
}

/** Detection result for one page: every detection plus the page's metadata. */
export interface DetectionResult {
  /** Width of the page. */
  readonly pageWidth: number;
  /** Height of the page. */
  readonly pageHeight: number;
  /** All detections on the page. */
  readonly detections: LayoutDetection[];
}

/** Response returned by MIME type detection. */
export interface DetectResponse {
  /** The MIME type that was detected. */
  readonly mimeType: string;
  /** The original filename (if provided). */
  readonly filename?: string;
}

/** One contiguous hunk of a unified diff. */
export interface DiffHunk {
  /** Line number in the old content where the hunk starts (0-indexed). */
  readonly fromLine: number;
  /** How many lines of the old content the hunk covers. */
  readonly fromCount: number;
  /** Line number in the new content where the hunk starts (0-indexed). */
  readonly toLine: number;
  /** How many lines of the new content the hunk covers. */
  readonly toCount: number;
  /** The lines making up the hunk. */
  readonly lines: DiffLine[];
}

/**
 * A single line in a unified-diff hunk.
 *
 * Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate::diff` re-exports this type verbatim.
 *
 * This is a tagged union discriminated by `kind`. Each variant also has a
 * string under the numeric key `0` — presumably the text of the line (it looks
 * like a single-field tuple variant on the Rust side) — TODO confirm.
 */
export type DiffLine =
  | { kind: 'context'; 0: string }
  | { kind: 'added'; 0: string }
  | { kind: 'removed'; 0: string }

/** Settings that control how two `ExtractionResult` values are compared. */
export interface DiffOptions {
  /** Whether metadata changes appear in the diff. Default: `true`. */
  readonly includeMetadata?: boolean;
  /** Whether embedded-children changes appear in the diff. Default: `true`. */
  readonly includeEmbedded?: boolean;
  /**
   * Cut content down to this many characters before diffing.
   *
   * Handy for very large documents where only the first N characters matter.
   * Leave unset for no truncation.
   */
  readonly maxContentChars?: number;
}

/**
 * Full Djot document structure with its semantics preserved.
 *
 * Captures the richness of Djot markup, including:
 * - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - Attributes (classes, IDs, key-value pairs)
 * - Links, images, footnotes
 * - Math expressions (inline and display)
 * - Tables with full structure
 *
 * Available when the `djot` feature is enabled.
 */
export interface DjotContent {
  /** Plain-text rendering, kept for backwards compatibility. */
  readonly plainText: string;
  /** Block-level content in structured form. */
  readonly blocks: FormattedBlock[];
  /** Metadata taken from the YAML frontmatter. */
  readonly metadata: Metadata;
  /** Tables extracted as structured data. */
  readonly tables: Table[];
  /** Images extracted along with their metadata. */
  readonly images: DjotImage[];
  /** Links extracted along with their URLs. */
  readonly links: DjotLink[];
  /** Definitions of footnotes. */
  readonly footnotes: Footnote[];
  /**
   * Attributes mapped by element identifier (if present).
   *
   * NOTE(review): declared as a flat array of strings although the
   * description speaks of a mapping — confirm the serialized shape.
   */
  readonly attributes: string[];
}

/** Image element in Djot. */
export interface DjotImage {
  /** Image source URL or path */
  readonly src: string
  /** Alternative text */
  readonly alt: string
  /** Optional title */
  readonly title?: string
  /**
   * Element attributes, when present.
   *
   * NOTE(review): exposed as a single string; its encoding is not visible
   * from this declaration — confirm against the native side.
   */
  readonly attributes?: string
}

/** Link element in Djot. */
export interface DjotLink {
  /** Link URL */
  readonly url: string
  /** Link text content */
  readonly text: string
  /** Optional title */
  readonly title?: string
  /**
   * Element attributes, when present.
   *
   * NOTE(review): exposed as a single string; its encoding is not visible
   * from this declaration — confirm against the native side.
   */
  readonly attributes?: string
}

/**
 * Contract for document extractor plugins (mirrors a Rust trait).
 *
 * Implement this interface to add support for new document formats or to override
 * built-in extraction behavior with custom logic.
 *
 * # Return Type
 *
 * Extractors return `InternalDocument`, a flat intermediate representation.
 * The pipeline converts this into the public `ExtractionResult` via the
 * derivation step.
 *
 * NOTE(review): several methods below are declared with a `string` return
 * type even though their descriptions talk about documents, lists, numbers or
 * booleans. The expected string encoding is not visible from this
 * declaration — confirm against the native bridge before implementing.
 *
 * # Priority System
 *
 * When multiple extractors support the same MIME type, the registry selects
 * the extractor with the highest priority value. Use this to:
 * - Override built-in extractors (priority > 50)
 * - Provide fallback extractors (priority < 50)
 * - Implement specialized extractors for specific use cases
 *
 * Default priority is 50.
 *
 * # Thread Safety
 *
 * Extractors must be thread-safe (`Send + Sync` on the Rust side) to support
 * concurrent extraction.
 */
export interface DocumentExtractor {
  /** Name of this extractor plugin. */
  name(): string
  /**
   * Extract content from a byte array.
   *
   * This is the core extraction method that processes in-memory document data.
   * @param content - Raw document bytes
   *
   * @param mimeType - MIME type of the document (already validated)
   *
   * @param config - Extraction configuration
   *
   * @returns An `InternalDocument` containing the extracted elements, metadata, and tables.
   * The pipeline will convert this into the public `ExtractionResult`.
   * NOTE(review): the declared type is `Promise<string>`; how the document is
   * encoded in that string is not visible here — confirm.
   *
   * @throws `KreuzbergError::Parsing` - Document parsing failed
   * @throws `KreuzbergError::Validation` - Invalid document structure
   * @throws `KreuzbergError::Io` - I/O errors (these always bubble up)
   * @throws `KreuzbergError::MissingDependency` - Required dependency not available
   */
  extractBytes(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): Promise<string>
  /**
   * Extract content from a file.
   *
   * Optional. The default implementation reads the file and calls `extract_bytes`.
   * Override for custom file handling, streaming, or memory optimizations.
   * @param path - Path to the document file
   *
   * @param mimeType - MIME type of the document (already validated)
   *
   * @param config - Extraction configuration
   *
   * @returns An `InternalDocument` containing the extracted elements, metadata, and tables.
   * NOTE(review): the declared type is `Promise<string>`; encoding not visible here — confirm.
   *
   * @throws Same as `extract_bytes`, plus file I/O errors.
   */
  extractFile?(path: string, mimeType: string, config?: ExtractionConfig | undefined | null): Promise<string>
  /**
   * Get the list of MIME types supported by this extractor.
   *
   * Can include exact MIME types and prefix patterns:
   * - Exact: `"application/pdf"`, `"text/plain"`
   * - Prefix: `"image/*"` (matches any image type)
   * @returns The supported MIME type strings.
   * NOTE(review): described as a list but declared as a single `string`; the
   * expected encoding of multiple entries is not visible here — confirm.
   */
  supportedMimeTypes(): string
  /**
   * Get the priority of this extractor.
   *
   * Optional. Higher priority extractors are preferred when multiple extractors
   * support the same MIME type.
   *
   * # Priority Guidelines
   *
   * - **0-25**: Fallback/low-quality extractors
   * - **26-49**: Alternative extractors
   * - **50**: Default priority (built-in extractors)
   * - **51-75**: Premium/enhanced extractors
   * - **76-100**: Specialized/high-priority extractors
   * @returns Priority value (default: 50)
   * NOTE(review): described as a numeric value but declared as `string` — confirm
   * the expected representation.
   */
  priority?(): string
  /**
   * Optional: Check if this extractor can handle a specific file.
   *
   * Allows for more sophisticated detection beyond MIME types.
   * Defaults to `true` (rely on MIME type matching).
   * @param path - Path to the file to check
   *
   * @param mimeType - Detected MIME type
   *
   * @returns `true` if the extractor can handle this file, `false` otherwise.
   * NOTE(review): described as a boolean but declared as `string` — confirm
   * the expected representation.
   */
  canHandle?(path: string, mimeType: string): string
}

/**
 * A single node in the document tree.
 *
 * Each node has deterministic `id`, typed `content`, optional `parent`/`children`
 * for tree structure, and metadata like page number, bounding box, and content layer.
 */
export interface DocumentNode {
  /** Deterministic identifier (hash of content + position). */
  readonly id: string
  /** Node content — tagged enum, type-specific data only. */
  readonly content: NodeContent
  /** Parent node index; unset (`None` on the Rust side) for a root-level node. */
  readonly parent?: number
  /** Child node indices in reading order. */
  readonly children: Array<number>
  /** Content layer classification. */
  readonly contentLayer: ContentLayer
  /** Page number where this node starts (1-indexed). */
  readonly page?: number
  /** Page number where this node ends (for multi-page tables/sections). */
  readonly pageEnd?: number
  /** Bounding box in document coordinates. */
  readonly bbox?: BoundingBox
  /**
   * Inline annotations (formatting, links) on this node's text content.
   *
   * Only meaningful for text-carrying nodes; empty for containers.
   */
  readonly annotations: Array<TextAnnotation>
  /**
   * Format-specific key-value attributes.
   *
   * Extensible bag for miscellaneous data without a dedicated typed field: CSS classes,
   * LaTeX environment names, Excel cell formulas, slide layout names, etc.
   * Both keys and values are strings.
   */
  readonly attributes?: Record<string, string>
}

/**
 * A resolved relationship between two nodes in the document tree.
 *
 * NOTE(review): `source` and `target` are presumably indices into
 * `DocumentStructure.nodes` — confirm.
 */
export interface DocumentRelationship {
  /** Source node index (the referencing node). */
  readonly source: number
  /** Target node index (the referenced node). */
  readonly target: number
  /** Semantic kind of the relationship. */
  readonly kind: RelationshipKind
}

/**
 * A single tracked change embedded in a document.
 *
 * Populated by per-format extractors that understand change-tracking metadata
 * (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
 * extractor leaves `ExtractionResult.revisions` unset (`None` on the Rust
 * side) until a format-specific implementation is added.
 */
export interface DocumentRevision {
  /**
   * Format-specific revision identifier.
   *
   * For DOCX this is the `w:id` attribute value on the change element
   * (e.g. `"42"`). When the attribute is absent a synthetic fallback is
   * generated (`"docx-ins-0"`, `"docx-del-3"`, …).
   */
  readonly revisionId: string
  /** Display name of the author who made this change, when available. */
  readonly author?: string
  /**
   * ISO-8601 timestamp of the change, when available.
   *
   * Kept as a plain string (on the Rust side this keeps the type FFI-friendly
   * and available without the optional `chrono` dependency).
   * DOCX populates this from the `w:date` attribute (e.g.
   * `"2024-03-15T10:30:00Z"`).
   */
  readonly timestamp?: string
  /** Semantic kind of this revision. */
  readonly kind: RevisionKind
  /**
   * Best-effort document location for this revision.
   *
   * Resolution is format-dependent and the field may be unset when the
   * location cannot be determined (e.g. changes inside table cells before
   * table-cell anchor support is added).
   */
  readonly anchor?: RevisionAnchor
  /** The content changes that make up this revision. */
  readonly delta: RevisionDelta
}

/**
 * Top-level structured document representation.
 *
 * A flat array of nodes with index-based parent/child references forming a tree.
 * Root-level nodes have no `parent`.
 *
 * The Rust type additionally offers `body_roots()` / `furniture_roots()` (iterate
 * top-level content by layer) and `validate()` (verify that all node indices are
 * in bounds and parent-child relationships are bidirectionally consistent).
 * Those methods are not declared on this interface, which exposes data fields only.
 */
export interface DocumentStructure {
  /** All nodes in document/reading order. */
  readonly nodes?: Array<DocumentNode>
  /**
   * Origin format identifier (e.g. "docx", "pptx", "html", "pdf").
   *
   * Allows renderers to apply format-aware heuristics when converting
   * the document tree to output formats.
   */
  readonly sourceFormat?: string
  /**
   * Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
   *
   * Populated during derivation from the internal document representation.
   * Empty when no relationships are detected.
   */
  readonly relationships?: Array<DocumentRelationship>
  /**
   * Sorted, deduplicated list of node type names present in this document.
   *
   * Each value is the snake_case `node_type` tag of the corresponding
   * `NodeContent` variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
   *
   * Computed from `nodes` by the Rust-side `DocumentStructure::finalize_node_types`.
   * Empty until that method is called (internal construction paths call it
   * at the end of derivation).
   */
  readonly nodeTypes?: Array<string>
}

/**
 * Application properties from docProps/app.xml for DOCX
 *
 * Contains Word-specific document statistics and metadata.
 * Every field is optional.
 */
export interface DocxAppProperties {
  /** Application name (e.g., "Microsoft Office Word") */
  readonly application?: string
  /** Application version */
  readonly appVersion?: string
  /** Template filename */
  readonly template?: string
  /** Total editing time in minutes */
  readonly totalTime?: number
  /** Number of pages */
  readonly pages?: number
  /** Number of words */
  readonly words?: number
  /** Number of characters (excluding spaces) */
  readonly characters?: number
  /** Number of characters (including spaces) */
  readonly charactersWithSpaces?: number
  /** Number of lines */
  readonly lines?: number
  /** Number of paragraphs */
  readonly paragraphs?: number
  /** Company name */
  readonly company?: string
  /** Document security level */
  readonly docSecurity?: number
  /** Scale crop flag */
  readonly scaleCrop?: boolean
  /** Links up to date flag */
  readonly linksUpToDate?: boolean
  /** Shared document flag */
  readonly sharedDoc?: boolean
  /** Hyperlinks changed flag */
  readonly hyperlinksChanged?: boolean
}

/**
 * Word document metadata.
 *
 * Extracted from DOCX files using shared Office Open XML metadata extraction
 * (the Rust-side `office_metadata` module handles core/app/custom properties).
 * Each of the three property groups is optional.
 */
export interface DocxMetadata {
  /**
   * Core properties from docProps/core.xml (Dublin Core metadata)
   *
   * Contains title, creator, subject, keywords, dates, etc.
   * Shared format across DOCX/PPTX/XLSX documents.
   */
  readonly coreProperties?: CoreProperties
  /**
   * Application properties from docProps/app.xml (Word-specific statistics)
   *
   * Contains word count, page count, paragraph count, editing time, etc.
   * DOCX-specific variant of Office application properties.
   */
  readonly appProperties?: DocxAppProperties
  /**
   * Custom properties from docProps/custom.xml (user-defined properties)
   *
   * Contains key-value pairs defined by users or applications.
   * Values can be strings, numbers, booleans, or dates; they are typed as
   * `JsonValue` here.
   */
  readonly customProperties?: Record<string, JsonValue>
}

/**
 * Semantic element extracted from document.
 *
 * Represents a logical unit of content with semantic classification,
 * unique identifier, and metadata for tracking origin and position.
 * All four fields are required.
 */
export interface Element {
  /** Unique element identifier */
  readonly elementId: string
  /** Semantic type of this element */
  readonly elementType: ElementType
  /** Text content of the element */
  readonly text: string
  /** Metadata about the element (page, filename, coordinates, index, extras) */
  readonly metadata: ElementMetadata
}

/**
 * Metadata for a semantic element.
 *
 * Only `additional` is always present; the other fields are optional.
 */
export interface ElementMetadata {
  /** Page number (1-indexed) */
  readonly pageNumber?: number
  /** Source filename or document name */
  readonly filename?: string
  /** Bounding box coordinates if available */
  readonly coordinates?: BoundingBox
  /** Position index in the element sequence */
  readonly elementIndex?: number
  /** Additional custom metadata, as string keys mapped to string values */
  readonly additional: Record<string, string>
}

/**
 * Semantic element type classification.
 *
 * Categorizes text content into semantic units for downstream processing.
 * Supports the element types commonly found in Unstructured documents.
 *
 * Each member's string value is the snake_case form of its name.
 */
export declare enum ElementType {
  /** Document title */
  Title = "title",
  /** Main narrative text body */
  NarrativeText = "narrative_text",
  /** Section heading */
  Heading = "heading",
  /** List item (bullet, numbered, etc.) */
  ListItem = "list_item",
  /** Table element */
  Table = "table",
  /** Image element */
  Image = "image",
  /** Page break marker */
  PageBreak = "page_break",
  /** Code block */
  CodeBlock = "code_block",
  /** Block quote */
  BlockQuote = "block_quote",
  /** Footer text */
  Footer = "footer",
  /** Header text */
  Header = "header",
}

/**
 * Email attachment representation.
 *
 * Contains metadata and optionally the content of an email attachment.
 * Only `isImage` is always present.
 */
export interface EmailAttachment {
  /** Attachment name (from Content-Disposition header) */
  readonly name?: string
  /** Filename of the attachment */
  readonly filename?: string
  /** MIME type of the attachment */
  readonly mimeType?: string
  /** Size in bytes */
  readonly size?: number
  /** Whether this attachment is an image */
  readonly isImage: boolean
  /**
   * Attachment data (if extracted), exposed as raw bytes.
   * (The Rust side stores this as `bytes::Bytes` for cheap cloning of large buffers.)
   */
  readonly data?: Uint8Array
}

/** Configuration for email extraction. */
export interface EmailConfig {
  /**
   * Windows codepage number to use when an MSG file contains no codepage property.
   * Unset by default (`None` on the Rust side), which falls back to windows-1252.
   *
   * If an unrecognized or invalid codepage number is supplied (including 0),
   * the behavior silently falls back to windows-1252 — the same as when the
   * MSG file itself contains an unrecognized codepage. No error or warning is
   * emitted. Users should verify output when supplying unusual values.
   *
   * Common values:
   * - 1250: Central European (Polish, Czech, Hungarian, etc.)
   * - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
   * - 1252: Western European (default)
   * - 1253: Greek
   * - 1254: Turkish
   * - 1255: Hebrew
   * - 1256: Arabic
   * - 932:  Japanese (Shift-JIS)
   * - 936:  Simplified Chinese (GBK)
   */
  readonly msgFallbackCodepage?: number
}

/**
 * Email extraction result.
 *
 * Complete representation of an extracted email message (.eml or .msg)
 * including headers, body content, and attachments.
 *
 * The recipient lists, `content`, `attachments` and `metadata` are always
 * present; the remaining header and body fields are optional.
 */
export interface EmailExtractionResult {
  /** Email subject line */
  readonly subject?: string
  /** Sender email address */
  readonly fromEmail?: string
  /** Primary recipient email addresses */
  readonly toEmails: Array<string>
  /** CC recipient email addresses */
  readonly ccEmails: Array<string>
  /** BCC recipient email addresses */
  readonly bccEmails: Array<string>
  /** Email date/timestamp */
  readonly date?: string
  /** Message-ID header value */
  readonly messageId?: string
  /** Plain text version of the email body */
  readonly plainText?: string
  /** HTML version of the email body */
  readonly htmlContent?: string
  /** Cleaned/processed text content. Aliased as `cleaned_text` for back-compat. */
  readonly content: string
  /** List of email attachments */
  readonly attachments: Array<EmailAttachment>
  /** Additional email headers and metadata, as string keys mapped to string values */
  readonly metadata: Record<string, string>
}

/**
 * Email metadata extracted from .eml and .msg files.
 *
 * Includes sender/recipient information, message ID, and attachment list.
 * Every field is optional.
 */
export interface EmailMetadata {
  /** Sender's email address */
  readonly fromEmail?: string
  /** Sender's display name */
  readonly fromName?: string
  /** Primary recipients */
  readonly toEmails?: Array<string>
  /** CC recipients */
  readonly ccEmails?: Array<string>
  /** BCC recipients */
  readonly bccEmails?: Array<string>
  /** Message-ID header value */
  readonly messageId?: string
  /** List of attachment filenames */
  readonly attachments?: Array<string>
}

/**
 * Changes to embedded archive children between two results.
 *
 * The two compared results are referred to as `a` and `b` below; children
 * are matched by `path`.
 */
export interface EmbeddedChanges {
  /** Children present in `b` but not in `a` (matched by `path`). */
  readonly added: Array<ArchiveEntry>
  /** Children present in `a` but not in `b` (matched by `path`). */
  readonly removed: Array<ArchiveEntry>
  /**
   * Children present in both but with differing content (matched by `path`).
   *
   * Each entry holds the diff of the nested `ExtractionResult`.
   */
  readonly changed: Array<EmbeddedDiff>
}

/**
 * Diff for a single embedded archive entry that appears in both results.
 *
 * Pairs the entry's `path` with the diff computed for that entry.
 */
export interface EmbeddedDiff {
  /** Archive-relative path identifying this entry. */
  readonly path: string
  /** The recursive diff of the entry's extraction result. */
  readonly diff: ExtractionDiff
}

/** Embedded file descriptor extracted from the PDF name tree. */
export interface EmbeddedFile {
  /** The filename as stored in the PDF name tree. */
  readonly name: string
  /** Raw file bytes from the embedded stream (already decompressed by lopdf). */
  readonly data: Uint8Array
  /**
   * Compressed byte count of the original stream (before decompression).
   *
   * Used by callers to compute the decompression ratio and detect zip-bomb-style
   * attacks that embed a tiny compressed stream expanding to gigabytes of data.
   */
  readonly compressedSize: number
  /** MIME type if specified in the filespec; unset otherwise. */
  readonly mimeType?: string
}

/**
 * Contract for in-process embedding backend plugins (mirrors a Rust trait).
 *
 * Async to match the convention used by `OcrBackend`,
 * `DocumentExtractor`, and `PostProcessor`.
 * Host-language bridges (PyO3, napi-rs, Rustler, extendr, magnus, ext-php-rs,
 * C FFI, etc.) wrap their synchronous host callables in `spawn_blocking` or the
 * equivalent to satisfy the async signature.
 *
 * NOTE(review): `dimensions()` and `embed()` are declared with `string`
 * results although they are described as a dimension count and a list of
 * vectors. The expected string encoding is not visible from this
 * declaration — confirm against the native bridge before implementing.
 *
 * # Thread safety
 *
 * Backends must be `Send + Sync + 'static`. They are stored in
 * `Arc<dyn EmbeddingBackend>` and called concurrently from kreuzberg's chunking
 * pipeline. If the backend's underlying model isn't thread-safe, the backend
 * itself must serialize access internally (e.g. via `Mutex<Inner>`).
 *
 * # Contract
 *
 * - `embed(texts)` MUST return exactly `texts.len()` vectors, each of length
 *   `self.dimensions()`. The dispatcher in `embed_texts`
 *   validates this before returning to downstream consumers; a non-conforming
 *   backend surfaces as a `KreuzbergError::Validation`, not a panic.
 * - `embed` may be called from any thread. Its future must be `Send`
 *   (enforced by `async_trait` when `#[async_trait]` is used on non-WASM targets).
 * - `dimensions()` is called exactly once at registration, immediately after
 *   `initialize()` succeeds. The returned value is cached by the registry and
 *   used for all subsequent shape validation. Lazy-loading implementations can
 *   defer model loading into `initialize()` and report the real dimension
 *   afterwards. Later mutations of the backend's reported dimension are not
 *   observed by kreuzberg — implementations that need to change dimension
 *   must unregister and re-register.
 * - `shutdown()` (inherited from `Plugin`) may be invoked
 *   concurrently with an in-flight `embed()` call. Implementations must
 *   tolerate this — e.g. by letting in-flight calls finish using resources
 *   held via the `Arc<dyn EmbeddingBackend>` reference, and only releasing
 *   shared state that isn't needed by `embed`.
 *
 * # Runtime
 *
 * The synchronous `embed_texts` entry uses
 * `tokio::task::block_in_place` to await the trait's async `embed`, which
 * requires a multi-thread tokio runtime. Callers running inside a
 * `current_thread` runtime (e.g. `#[tokio::test]` without `flavor = "multi_thread"`,
 * or `tokio::runtime::Builder::new_current_thread()`) must use
 * `embed_texts_async` instead, which awaits directly without
 * `block_in_place`.
 */
export interface EmbeddingBackend {
  /** Name of this embedding backend plugin. */
  name(): string
  /**
   * Embedding vector dimension. Must be `> 0` and must match the length of
   * every vector returned by `embed`.
   *
   * NOTE(review): described as a count but declared as `string` — confirm the
   * expected representation.
   */
  dimensions(): string
  /**
   * Embed a batch of texts, returning one vector per input in order.
   * @param texts - The texts to embed.
   * @returns One embedding vector per input text, in input order.
   * NOTE(review): the declared type is `Promise<string>`; how the vectors are
   * encoded in that string is not visible here — confirm.
   * @throws Implementations should return `Plugin` for
   * backend-specific failures. The dispatcher layers its own validation
   * (length, per-vector dimension) on top.
   */
  embed(texts: Array<string>): Promise<string>
}

/**
 * Embedding configuration for text chunks.
 *
 * Configures embedding generation using ONNX models via the vendored embedding engine.
 * Requires the `embeddings` feature to be enabled.
 * Every field is optional.
 */
export interface EmbeddingConfig {
  /** The embedding model to use (defaults to "balanced" preset if not specified) */
  readonly model?: EmbeddingModelType
  /** Whether to normalize embedding vectors (recommended for cosine similarity) */
  readonly normalize?: boolean
  /** Batch size for embedding generation */
  readonly batchSize?: number
  /** Show model download progress */
  readonly showDownloadProgress?: boolean
  /**
   * Custom cache directory for model files
   *
   * Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
   * Allows full customization of model download location.
   */
  readonly cacheDir?: string
  /**
   * Hardware acceleration for the embedding ONNX model.
   *
   * When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
   * is used for inference. When unset, the provider is auto-selected per platform.
   */
  readonly acceleration?: AccelerationConfig
  /**
   * Maximum wall-clock duration (in seconds) for a single `embed()` call when
   * using the `'plugin'` variant of `EmbeddingModelType`.
   *
   * Applies only to the in-process plugin path — protects against hung
   * host-language backends (e.g. a Python callback deadlocked on the GIL,
   * a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
   * returns `Plugin` instead of blocking forever.
   *
   * On the Rust side `None` disables the timeout and the default is 60 seconds,
   * which is conservative for common in-process inference; increase for large
   * batches on slow hardware.
   * NOTE(review): whether omitting this field here selects the 60-second
   * default or disables the timeout is not visible from this declaration — confirm.
   */
  readonly maxEmbedDurationSecs?: number
}

/**
 * Embedding model types supported by Kreuzberg.
 *
 * Discriminated by `type`:
 * - `'preset'` — carries a preset `name`.
 * - `'custom'` — carries a `modelId` and the model's `dimensions`.
 * - `'llm'` — carries an `LlmConfig` in `llm`.
 * - `'plugin'` — carries a `name`
 *   (NOTE(review): presumably the name of a registered `EmbeddingBackend` — confirm).
 */
export type EmbeddingModelType =
  | { type: 'preset'; name: string }
  | { type: 'custom'; modelId: string; dimensions: number }
  | { type: 'llm'; llm: LlmConfig }
  | { type: 'plugin'; name: string }

/**
 * Preset configurations for common RAG use cases.
 *
 * Each preset combines chunk size, overlap, and embedding model
 * to provide an optimized configuration for specific scenarios.
 *
 * All string fields are owned `String` on the Rust side for FFI
 * compatibility — instances are safe to clone and pass across language
 * boundaries.
 */
export interface EmbeddingPreset {
  /** Name of the preset. */
  readonly name: string
  /**
   * Chunk size used by this preset.
   * NOTE(review): the unit (characters vs. tokens) is not stated here — confirm.
   */
  readonly chunkSize: number
  /**
   * Overlap between chunks used by this preset.
   * NOTE(review): the unit is not stated here — confirm.
   */
  readonly overlap: number
  /** HuggingFace repository name for the model. */
  readonly modelRepo: string
  /** Pooling strategy: "cls" or "mean". */
  readonly pooling: string
  /** Path to the ONNX model file within the repo. */
  readonly modelFile: string
  /**
   * Dimensions of the preset's embedding model.
   * NOTE(review): presumably the length of each embedding vector — confirm.
   */
  readonly dimensions: number
  /** Description of the preset. */
  readonly description: string
}

/**
 * EPUB metadata (Dublin Core extensions).
 *
 * Every field is optional.
 * NOTE(review): the per-field notes below are inferred from the field names
 * and the Dublin Core element set — confirm against the extractor.
 */
export interface EpubMetadata {
  /** Presumably the Dublin Core `coverage` value. */
  readonly coverage?: string
  /** Presumably the Dublin Core `format` value. */
  readonly dcFormat?: string
  /** Presumably the Dublin Core `relation` value. */
  readonly relation?: string
  /** Presumably the Dublin Core `source` value. */
  readonly source?: string
  /** Presumably the Dublin Core `type` value. */
  readonly dcType?: string
  /** Presumably a reference to the cover image; its exact form is not visible here. */
  readonly coverImage?: string
}

/** Error metadata (for batch operations). */
export interface ErrorMetadata {
  /**
   * Type of the error, as a string.
   * NOTE(review): the set of possible values is not visible here — confirm.
   */
  readonly errorType: string
  /** Error message. */
  readonly message: string
}

/**
 * Excel/spreadsheet format metadata.
 *
 * Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
 * discriminant. Sheet count and sheet names are stored inside this struct;
 * both are optional.
 */
export interface ExcelMetadata {
  /** Number of sheets in the workbook. */
  readonly sheetCount?: number
  /** Names of all sheets in the workbook. */
  readonly sheetNames?: Array<string>
}

/**
 * Single Excel worksheet.
 *
 * Represents one sheet from an Excel workbook with its content
 * converted to Markdown format and dimensional statistics.
 */
export interface ExcelSheet {
  /** Sheet name as it appears in Excel */
  readonly name: string
  /** Sheet content converted to Markdown tables */
  readonly markdown: string
  /** Number of rows */
  readonly rowCount: number
  /** Number of columns */
  readonly colCount: number
  /** Total number of non-empty cells */
  readonly cellCount: number
  /**
   * Pre-extracted table cells (2D array of cell values, all as strings).
   * Populated during markdown generation to avoid re-parsing markdown.
   * Unset for empty sheets.
   */
  readonly tableCells?: Array<Array<string>>
}

/**
 * Excel workbook representation.
 *
 * Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
 * extracted content and metadata.
 */
export interface ExcelWorkbook {
  /** All sheets in the workbook */
  readonly sheets: Array<ExcelSheet>
  /** Workbook-level metadata (author, creation date, etc.), as string keys mapped to string values */
  readonly metadata: Record<string, string>
  /**
   * Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
   *
   * Populated for legacy shared-workbook `.xlsx` files that contain the
   * `xl/revisions/` directory. Each `<header>` element maps to one
   * `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
   * (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
   * `anchor` and `delta` are `None`/empty for v1 (per-cell log parsing is a
   * follow-up). Unset when `xl/revisions/revisionHeaders.xml` is absent.
   */
  readonly revisions?: Array<DocumentRevision>
}

/**
 * ONNX Runtime execution provider type.
 *
 * Determines which hardware backend is used for model inference.
 * `Auto` (default) selects the best available provider per platform.
 *
 * Each member's string value is the lowercase provider identifier.
 */
export declare enum ExecutionProviderType {
  /** Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere. */
  Auto = "auto",
  /** CPU execution provider (always available). */
  Cpu = "cpu",
  /** Apple CoreML (macOS/iOS Neural Engine + GPU). */
  CoreMl = "coreml",
  /** NVIDIA CUDA GPU acceleration. */
  Cuda = "cuda",
  /** NVIDIA TensorRT (optimized CUDA inference). */
  TensorRt = "tensorrt",
}

/**
 * Extracted image from a document.
 *
 * Contains raw image data, metadata, and optional nested OCR results.
 * Raw bytes allow cross-language compatibility - users can convert to
 * PIL.Image (Python), Sharp (Node.js), or other formats as needed.
 *
 * `data`, `format`, `imageIndex` and `isMask` are always present; every other
 * field is optional.
 */
export interface ExtractedImage {
  /**
   * Raw image data (PNG, JPEG, WebP, etc. bytes).
   * (The Rust side stores this as `bytes::Bytes` for cheap cloning of large buffers.)
   */
  readonly data: Uint8Array
  /**
   * Image format (e.g., "jpeg", "png", "webp")
   * (The Rust side uses `Cow<'static, str>` to avoid allocation for static literals.)
   */
  readonly format: string
  /** Zero-indexed position of this image in the document/page */
  readonly imageIndex: number
  /** Page/slide number where image was found (1-indexed) */
  readonly pageNumber?: number
  /** Image width in pixels */
  readonly width?: number
  /** Image height in pixels */
  readonly height?: number
  /** Colorspace information (e.g., "RGB", "CMYK", "Gray") */
  readonly colorspace?: string
  /** Bits per color component (e.g., 8, 16) */
  readonly bitsPerComponent?: number
  /** Whether this image is a mask image */
  readonly isMask: boolean
  /** Optional description of the image */
  readonly description?: string
  /**
   * Nested OCR extraction result (if image was OCRed)
   *
   * When OCR is performed on this image, the result is embedded here
   * rather than in a separate collection, making the relationship explicit.
   */
  readonly ocrResult?: ExtractionResult
  /**
   * Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
   * Only populated for PDF-extracted images when position data is available from the PDF extractor.
   */
  readonly boundingBox?: BoundingBox
  /**
   * Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
   * Used for rendering image references when the binary data is not extracted.
   */
  readonly sourcePath?: string
  /**
   * Heuristic classification of what this image likely depicts.
   * Unset if classification was disabled or inconclusive.
   */
  readonly imageKind?: ImageKind
  /** Confidence score for `imageKind`, in the range 0.0 to 1.0. */
  readonly kindConfidence?: number
  /**
   * Identifier shared across images that form a single logical figure
   * (e.g. all raster tiles of one technical drawing). Unset for singletons.
   */
  readonly clusterId?: number
}

/**
 * A URI extracted from a document.
 *
 * Represents any link, reference, or resource pointer found during extraction.
 * The `kind` field classifies the URI semantically, while `label` carries
 * optional human-readable display text.
 * `url` and `kind` are always present; `label` and `page` are optional.
 */
export interface ExtractedUri {
  /** The URL or path string. */
  readonly url: string
  /** Optional display text / label for the link. */
  readonly label?: string
  /** Optional page number where the URI was found (1-indexed). */
  readonly page?: number
  /** Semantic classification of the URI. */
  readonly kind: UriKind
}

/**
 * Main extraction configuration.
 *
 * This struct contains all configuration options for the extraction process.
 * It can be loaded from TOML, YAML, or JSON files, or created programmatically.
 *
 * Every property is optional. The member comments are written in terms of the
 * Rust core: "None" corresponds to leaving the property unset, and snake_case
 * names such as `force_ocr` refer to the camelCase property of the same name
 * (`forceOcr`).
 */
export interface ExtractionConfig {
  /** Enable caching of extraction results */
  readonly useCache?: boolean
  /** Enable quality post-processing */
  readonly enableQualityProcessing?: boolean
  /** OCR configuration (None = OCR disabled) */
  readonly ocr?: OcrConfig
  /** Force OCR even for searchable PDFs */
  readonly forceOcr?: boolean
  /**
   * Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
   *
   * When set, only the listed pages are OCR'd regardless of text layer quality.
   * Unlisted pages use native text extraction. Ignored when `forceOcr` is `true`.
   * Only applies to PDF documents. Duplicates are automatically deduplicated.
   * An `ocr` config is recommended for backend/language selection; defaults are used if absent.
   */
  readonly forceOcrPages?: Array<number>
  /**
   * Disable OCR entirely, even for images.
   *
   * When `true`, OCR is skipped for all document types. Images return metadata
   * only (dimensions, format, EXIF) without text extraction. PDFs use only
   * native text extraction without OCR fallback.
   *
   * Cannot be `true` simultaneously with `forceOcr`.
   *
   * *Added in v4.7.0.*
   */
  readonly disableOcr?: boolean
  /** Text chunking configuration (None = chunking disabled) */
  readonly chunking?: ChunkingConfig
  /**
   * Content filtering configuration (None = use extractor defaults).
   *
   * Controls whether document "furniture" (headers, footers, watermarks,
   * repeating text) is included in or stripped from extraction results.
   * See [`ContentFilterConfig`] for per-field documentation.
   */
  readonly contentFilter?: ContentFilterConfig
  /** Image extraction configuration (None = no image extraction) */
  readonly images?: ImageExtractionConfig
  /** PDF-specific options (None = use defaults) */
  readonly pdfOptions?: PdfConfig
  /** Token reduction configuration (None = no token reduction) */
  readonly tokenReduction?: TokenReductionOptions
  /** Language detection configuration (None = no language detection) */
  readonly languageDetection?: LanguageDetectionConfig
  /** Page extraction configuration (None = no page tracking) */
  readonly pages?: PageConfig
  /** Keyword extraction configuration (None = no keyword extraction) */
  readonly keywords?: KeywordConfig
  /** Post-processor configuration (None = use defaults) */
  readonly postprocessor?: PostProcessorConfig
  /**
   * HTML to Markdown conversion options (None = use defaults)
   *
   * Configure how HTML documents are converted to Markdown, including heading styles,
   * list formatting, code block styles, and preprocessing options.
   *
   * NOTE(review): this binding types the options as a plain `string`; the
   * expected encoding is not visible here — presumably a serialized options
   * object. Confirm before relying on it.
   */
  readonly htmlOptions?: string
  /**
   * Styled HTML output configuration.
   *
   * When set alongside `outputFormat` = `OutputFormat.Html`, the extraction
   * pipeline uses [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer)
   * which emits stable `kb-*` CSS class hooks on every structural element
   * and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
   *
   * When unset, the existing plain comrak-based HTML renderer is used.
   */
  readonly htmlOutput?: HtmlOutputConfig
  /**
   * Default per-file timeout in seconds for batch extraction.
   *
   * When set, each file in a batch will be canceled after this duration
   * unless overridden by [`FileExtractionConfig::timeout_secs`]
   * (`FileExtractionConfig.timeoutSecs`).
   *
   * Defaults to 60 seconds (`Some(60)`) to prevent pathological files (e.g.
   * deeply nested archives, documents with millions of cells) from running
   * indefinitely and exhausting caller resources. The Rust core disables the
   * timeout with `None`, for trusted input or long-running workloads.
   *
   * NOTE(review): how "no timeout" is expressed from JavaScript is not visible
   * here — omitting the property presumably selects the 60-second default
   * rather than disabling the timeout. Confirm.
   */
  readonly extractionTimeoutSecs?: number
  /**
   * Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5).ceil()).
   *
   * Limits parallelism to prevent resource exhaustion when processing
   * large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
   */
  readonly maxConcurrentExtractions?: number
  /**
   * Result structure format
   *
   * Controls whether results are returned in unified format (default) with all
   * content in the `content` field, or element-based format with semantic
   * elements (for Unstructured-compatible output).
   */
  readonly resultFormat?: ResultFormat
  /**
   * Security limits for archive extraction.
   *
   * Controls maximum archive size, compression ratio, file count, and other
   * security thresholds to prevent decompression bomb attacks. Also caps
   * nesting depth, iteration count, entity / token length, total
   * content size, and table cell count for every extraction path that
   * ingests user-controlled bytes.
   * When unset, default limits are used.
   */
  readonly securityLimits?: SecurityLimits
  /**
   * Maximum uncompressed size in bytes for a single embedded file before
   * recursive extraction is attempted (default: 50 MiB).
   *
   * Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
   * to email attachments processed via recursive extraction. Files that
   * exceed this limit are skipped with a `ProcessingWarning` rather than
   * passed to the extraction pipeline, preventing a single oversized
   * embedded object from consuming unbounded memory or time.
   *
   * In the Rust core, `None` disables the per-embedded-file cap (falls back to
   * `security_limits.max_archive_size` as the only guard).
   *
   * NOTE(review): confirm whether omitting the property from JavaScript
   * disables the cap or selects the 50 MiB default.
   */
  readonly maxEmbeddedFileBytes?: number
  /**
   * Content text format (default: Plain).
   *
   * Controls the format of the extracted content:
   * - `Plain`: Raw extracted text (default)
   * - `Markdown`: Markdown formatted output
   * - `Djot`: Djot markup format (requires djot feature)
   * - `Html`: HTML formatted output
   *
   * When set to a structured format, extraction results will include
   * formatted output. The `formattedContent` field of the result may be
   * populated when format conversion is applied.
   */
  readonly outputFormat?: OutputFormat
  /**
   * Layout detection configuration (None = layout detection disabled).
   *
   * When set, PDF pages and images are analyzed for document structure
   * (headings, code, formulas, tables, figures, etc.) using RT-DETR models
   * via ONNX Runtime. For PDFs, layout hints override paragraph classification
   * in the markdown pipeline. For images, per-region OCR is performed with
   * markdown formatting based on detected layout classes.
   * Requires the `layout-detection` feature to run inference; the field is
   * present whenever the `layout-types` feature is active (which includes
   * `layout-detection` as well as the no-ORT target groups).
   */
  readonly layout?: LayoutDetectionConfig
  /**
   * Run layout detection on the non-OCR PDF markdown path.
   *
   * When `true` and `layout` is set, layout regions inform heading,
   * table, list, and figure detection in the structure pipeline that would
   * otherwise rely on font-clustering heuristics alone. Significantly
   * improves SF1 (structural F1) at the cost of inference latency
   * (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
   * Requires the `layout-detection` feature.
   */
  readonly useLayoutForMarkdown?: boolean
  /**
   * Enable structured document tree output.
   *
   * When true, populates the `document` field on `ExtractionResult` with a
   * hierarchical `DocumentStructure` containing heading-driven section nesting,
   * table grids, content layer classification, and inline annotations.
   *
   * Independent of `resultFormat` — can be combined with Unified or ElementBased.
   */
  readonly includeDocumentStructure?: boolean
  /**
   * Hardware acceleration configuration for ONNX Runtime models.
   *
   * Controls execution provider selection for layout detection and embedding
   * models. When unset, uses platform defaults (CoreML on macOS, CUDA on
   * Linux, CPU on Windows).
   */
  readonly acceleration?: AccelerationConfig
  /**
   * Cache namespace for tenant isolation.
   *
   * When set, cache entries are stored under `{cache_dir}/{namespace}/`.
   * Must be alphanumeric, hyphens, or underscores only (max 64 chars).
   * Different namespaces have isolated cache spaces on the same filesystem.
   */
  readonly cacheNamespace?: string
  /**
   * Per-request cache TTL in seconds.
   *
   * Overrides the global `max_age_days` for this specific extraction.
   * When `0`, caching is completely skipped (no read or write).
   * When unset, the global TTL applies.
   */
  readonly cacheTtlSecs?: number
  /**
   * Email extraction configuration (None = use defaults).
   *
   * Currently supports configuring the fallback codepage for MSG files
   * that do not specify one. See `EmailConfig` for details.
   */
  readonly email?: EmailConfig
  /**
   * Concurrency limits for constrained environments (None = use defaults).
   *
   * Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
   * (when `maxConcurrentExtractions` is unset) the batch concurrency
   * semaphore. See `ConcurrencyConfig` for details.
   *
   * NOTE(review): typed as a plain `string` in this binding even though the
   * comment refers to `ConcurrencyConfig`; the expected encoding is not
   * visible here — presumably a serialized config. Confirm.
   */
  readonly concurrency?: string
  /**
   * Maximum recursion depth for archive extraction (default: 3).
   * Set to 0 to disable recursive extraction (legacy behavior).
   */
  readonly maxArchiveDepth?: number
  /**
   * Tree-sitter language pack configuration (None = tree-sitter disabled).
   *
   * When set, enables code file extraction using tree-sitter parsers.
   * Controls grammar download behavior and code analysis options.
   */
  readonly treeSitter?: TreeSitterConfig
  /**
   * Structured extraction via LLM (None = disabled).
   *
   * When set, the extracted document content is sent to an LLM with the
   * provided JSON schema. The structured response is stored in
   * `ExtractionResult::structured_output` (`ExtractionResult.structuredOutput`).
   */
  readonly structuredExtraction?: StructuredExtractionConfig
  /**
   * Cancellation token for this extraction (None = no external cancellation).
   *
   * Pass a [`CancellationToken`] clone here and call [`CancellationToken::cancel`]
   * from another thread / task to abort the extraction in progress. The extractor
   * checks the token at safe checkpoints (before lock acquisition, between pages,
   * between batch items) and returns [`KreuzbergError::Cancelled`] when set.
   *
   * The field is excluded from serialization because `CancellationToken` is a
   * runtime handle, not a configuration value.
   *
   * NOTE(review): this binding types the token as a plain `string`, which does
   * not obviously match the runtime-handle description above. How a token is
   * obtained and passed from JavaScript is not visible here — confirm.
   */
  readonly cancelToken?: string
}

/** Full set of differences between two `ExtractionResult` values, `a` and `b`. */
export interface ExtractionDiff {
  /**
   * Hunks of a unified diff computed over the `content` field.
   *
   * The array is empty when both sides have identical content.
   */
  readonly contentDiff: DiffHunk[];
  /** Excess right-side tables: present in `b` but not in `a`, matched by index position. */
  readonly tablesAdded: Table[];
  /** Excess left-side tables: present in `a` but not in `b`, matched by index position. */
  readonly tablesRemoved: Table[];
  /** Cell-level changes for pairs of tables that share the same index and dimensions. */
  readonly tablesChanged: TableDiff[];
  /**
   * Metadata difference, encoded as a JSON object with three top-level keys:
   * - `added`: keys present in `b` but not in `a`
   * - `removed`: keys present in `a` but not in `b`
   * - `changed`: keys whose values differ, each entry shaped as
   *   `{ "from": <value-in-a>, "to": <value-in-b> }`
   *
   * This shape is deliberately flatter than RFC 6902 JSON Patch, which avoids
   * a json-patch dependency in the core. For RFC 6902 semantics (with JSON
   * Pointer paths), pass `a.metadata` and `b.metadata` to a JSON Patch
   * implementation of your choice.
   */
  readonly metadataChanged: JsonValue;
  /** Changes to embedded archive children. */
  readonly embeddedChanges: EmbeddedChanges;
}

/**
 * How the extracted text was produced.
 *
 * Reported through `ExtractionResult.extractionMethod` when the extractor can
 * reliably tell the strategies apart.
 */
export declare enum ExtractionMethod {
  /** Native text extraction. */
  Native = "native",
  /** OCR-only extraction. */
  Ocr = "ocr",
  /** Mixed native/OCR output. */
  Mixed = "mixed",
}

/**
 * General extraction result used by the core extraction API.
 *
 * This is the main result type returned by all extraction functions.
 *
 * Every property is optional. The member comments are written in terms of the
 * Rust core: "None" corresponds to the property being absent, and snake_case
 * names such as `result_format` refer to the camelCase property of the same
 * name (`resultFormat`).
 */
export interface ExtractionResult {
  /** Extracted text content of the document. */
  readonly content?: string
  /** MIME type associated with this result — presumably that of the source document; confirm. */
  readonly mimeType?: string
  /** Document metadata; see `Metadata` for the available fields. */
  readonly metadata?: Metadata
  /**
   * Extraction strategy used to produce the returned text.
   *
   * Populated when the extractor can reliably distinguish native text extraction,
   * OCR-only extraction, or mixed native/OCR output.
   */
  readonly extractionMethod?: ExtractionMethod
  /** Tables extracted from the document. */
  readonly tables?: Array<Table>
  /**
   * Languages detected in the content.
   *
   * NOTE(review): presumably populated only when `languageDetection` is
   * configured on `ExtractionConfig`; the format of the language codes is not
   * visible here. Confirm both.
   */
  readonly detectedLanguages?: Array<string>
  /**
   * Text chunks when chunking is enabled.
   *
   * When chunking configuration is provided, the content is split into
   * overlapping chunks for efficient processing. Each chunk contains the text,
   * optional embeddings (if enabled), and metadata about its position.
   */
  readonly chunks?: Array<Chunk>
  /**
   * Extracted images from the document.
   *
   * When image extraction is enabled via `ImageExtractionConfig`, this field
   * contains all images found in the document with their raw data and metadata.
   * Each image may optionally contain a nested `ocrResult` if OCR was performed.
   */
  readonly images?: Array<ExtractedImage>
  /**
   * Per-page content when page extraction is enabled.
   *
   * When page extraction is configured, the document is split into per-page content
   * with tables and images mapped to their respective pages.
   */
  readonly pages?: Array<PageContent>
  /**
   * Semantic elements when element-based result format is enabled.
   *
   * When `resultFormat` is set to ElementBased, this field contains semantic
   * elements with type classification, unique identifiers, and metadata for
   * Unstructured-compatible element-based processing.
   */
  readonly elements?: Array<Element>
  /**
   * Rich Djot content structure (when extracting Djot documents).
   *
   * When extracting Djot documents with structured extraction enabled,
   * this field contains the full semantic structure including:
   * - Block-level elements with nesting
   * - Inline formatting with attributes
   * - Links, images, footnotes
   * - Math expressions
   * - Complete attribute information
   *
   * The `content` field still contains plain text for backward compatibility.
   *
   * Always absent for non-Djot documents.
   */
  readonly djotContent?: DjotContent
  /**
   * OCR elements with full spatial and confidence metadata.
   *
   * When OCR is performed with element extraction enabled, this field contains
   * the structured representation of detected text including:
   * - Bounding geometry (rectangles or quadrilaterals)
   * - Confidence scores (detection and recognition)
   * - Rotation information
   * - Hierarchical relationships (Tesseract only)
   *
   * This field preserves all metadata that would otherwise be lost when
   * converting to plain text or markdown output formats.
   *
   * Only populated when `OcrElementConfig.include_elements` is true.
   */
  readonly ocrElements?: Array<OcrElement>
  /**
   * Structured document tree (when document structure extraction is enabled).
   *
   * When `includeDocumentStructure` is true in `ExtractionConfig`, this field
   * contains the full hierarchical representation of the document including:
   * - Heading-driven section nesting
   * - Table grids with cell-level metadata
   * - Content layer classification (body, header, footer, footnote)
   * - Inline text annotations (formatting, links)
   * - Bounding boxes and page numbers
   *
   * Independent of `resultFormat` — can be combined with Unified or ElementBased.
   */
  readonly document?: DocumentStructure
  /**
   * Extracted keywords when keyword extraction is enabled.
   *
   * When keyword extraction (RAKE or YAKE) is configured, this field contains
   * the extracted keywords with scores, algorithm info, and position data.
   * Previously stored in `metadata.additional["keywords"]`.
   */
  readonly extractedKeywords?: Array<Keyword>
  /**
   * Document quality score from quality analysis.
   *
   * A value between 0.0 and 1.0 indicating the overall text quality.
   * Previously stored in `metadata.additional["quality_score"]`.
   */
  readonly qualityScore?: number
  /**
   * Non-fatal warnings collected during processing pipeline stages.
   *
   * Captures errors from optional pipeline features (embedding, chunking,
   * language detection, output formatting) that don't prevent extraction
   * but may indicate degraded results.
   * Previously stored as individual keys in `metadata.additional`.
   */
  readonly processingWarnings?: Array<ProcessingWarning>
  /**
   * PDF annotations extracted from the document.
   *
   * When annotation extraction is enabled via `PdfConfig::extract_annotations`,
   * this field contains text notes, highlights, links, stamps, and other
   * annotations found in PDF documents.
   */
  readonly annotations?: Array<PdfAnnotation>
  /**
   * Nested extraction results from archive contents.
   *
   * When extracting archives, each processable file inside produces its own
   * full extraction result. Absent for non-archive formats.
   * Use `maxArchiveDepth` in the config to control recursion depth.
   */
  readonly children?: Array<ArchiveEntry>
  /**
   * URIs/links discovered during document extraction.
   *
   * Contains hyperlinks, image references, citations, email addresses, and
   * other URI-like references found in the document. Always extracted when
   * present in the source document.
   */
  readonly uris?: Array<ExtractedUri>
  /**
   * Tracked changes embedded in the source document.
   *
   * Populated by per-format extractors that understand change-tracking
   * metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
   * …). Every extractor leaves this absent until its format-specific
   * implementation is added. Extractors that do populate this field follow
   * the "accepted-changes" convention: inserted text is present in
   * `content`, deleted text is absent — the revision list is the separate
   * audit trail.
   */
  readonly revisions?: Array<DocumentRevision>
  /**
   * Structured extraction output from LLM-based JSON schema extraction.
   *
   * When `structuredExtraction` is configured in `ExtractionConfig`, the
   * extracted document content is sent to a VLM with the provided JSON schema.
   * The response is parsed and stored here as a JSON value matching the schema.
   */
  readonly structuredOutput?: JsonValue
  /**
   * Code intelligence results from tree-sitter analysis.
   *
   * Populated when extracting source code files with the `tree-sitter` feature.
   * Contains metrics, structural analysis, imports/exports, comments,
   * docstrings, symbols, diagnostics, and optionally chunked code segments.
   *
   * Stored as an opaque JSON value so that all language bindings (Go, Java,
   * C#, …) can deserialize it as a raw JSON object rather than a typed struct.
   * The underlying type is `tree_sitter_language_pack::ProcessResult`.
   */
  readonly codeIntelligence?: JsonValue
  /**
   * LLM token usage and cost data for all LLM calls made during this extraction.
   *
   * Contains one entry per LLM call. Multiple entries are produced when
   * VLM OCR, structured extraction, or LLM embeddings run during
   * the same extraction.
   *
   * Absent when no LLM was used.
   */
  readonly llmUsage?: Array<LlmUsage>
  /**
   * Pre-rendered content in the requested output format.
   *
   * Populated during `derive_extraction_result` before tree derivation consumes
   * element data. `apply_output_format` swaps this into `content` at the end
   * of the pipeline, after post-processors have operated on plain text.
   */
  readonly formattedContent?: string
  /**
   * Structured hOCR document for the OCR+layout pipeline.
   *
   * When tesseract produces hOCR output, the parsed `InternalDocument` carries
   * paragraph structure with bounding boxes and confidence scores. The layout
   * classification step enriches these elements before final rendering.
   *
   * NOTE(review): typed as a plain `string` in this binding although the
   * comment describes a parsed document; the encoding is not visible here —
   * presumably a serialized form. Confirm before parsing it.
   */
  readonly ocrInternalDocument?: string
}

/**
 * Metadata specific to FictionBook (FB2) documents.
 *
 * All properties are optional.
 */
export interface FictionBookMetadata {
  /** Genre entries recorded for the book. */
  readonly genres?: string[];
  /** Sequence entries recorded for the book (presumably series names — confirm). */
  readonly sequences?: string[];
  /** Annotation text of the book. */
  readonly annotation?: string;
}

/**
 * Per-file overrides layered on top of the batch-level extraction config.
 *
 * Every property is optional (`Option<T>` in the Rust core); leaving one
 * unset means "use the batch-level default". The type is used with
 * `batch_extract_files` and `batch_extract_bytes` so that a single batch can
 * mix heterogeneous extraction settings.
 *
 * # Excluded Fields
 *
 * These `ExtractionConfig` settings are batch-level only and cannot be
 * overridden per file:
 * - `max_concurrent_extractions` — controls batch parallelism
 * - `use_cache` — global caching policy
 * - `acceleration` — shared ONNX execution provider
 * - `security_limits` — global archive security policy
 */
export interface FileExtractionConfig {
  /** Per-file override for quality post-processing. */
  readonly enableQualityProcessing?: boolean;
  /** Per-file override for the OCR configuration (unset = use the batch default). */
  readonly ocr?: OcrConfig;
  /** Per-file override for forcing OCR. */
  readonly forceOcr?: boolean;
  /** Per-file override for the pages to force OCR on (1-indexed page numbers). */
  readonly forceOcrPages?: number[];
  /** Per-file override for disabling OCR. */
  readonly disableOcr?: boolean;
  /** Per-file override for the chunking configuration. */
  readonly chunking?: ChunkingConfig;
  /** Per-file override for the content filtering configuration. */
  readonly contentFilter?: ContentFilterConfig;
  /** Per-file override for the image extraction configuration. */
  readonly images?: ImageExtractionConfig;
  /** Per-file override for the PDF options. */
  readonly pdfOptions?: PdfConfig;
  /** Per-file override for token reduction. */
  readonly tokenReduction?: TokenReductionOptions;
  /** Per-file override for language detection. */
  readonly languageDetection?: LanguageDetectionConfig;
  /** Per-file override for page extraction. */
  readonly pages?: PageConfig;
  /** Per-file override for keyword extraction. */
  readonly keywords?: KeywordConfig;
  /** Per-file override for the post-processor. */
  readonly postprocessor?: PostProcessorConfig;
  /** Per-file override for the HTML conversion options. */
  readonly htmlOptions?: string;
  /** Per-file override for the result format. */
  readonly resultFormat?: ResultFormat;
  /** Per-file override for the output content format. */
  readonly outputFormat?: OutputFormat;
  /** Per-file override for document structure output. */
  readonly includeDocumentStructure?: boolean;
  /** Per-file override for layout detection. */
  readonly layout?: LayoutDetectionConfig;
  /**
   * Per-file override for the extraction timeout, in seconds.
   *
   * When set, extraction of this file is canceled once the given duration
   * has elapsed. A file that times out yields an error result and does not
   * affect the other files in the batch.
   */
  readonly timeoutSecs?: number;
  /** Per-file override for the tree-sitter configuration. */
  readonly treeSitter?: TreeSitterConfig;
  /**
   * Per-file override for the structured extraction configuration.
   *
   * When set, LLM-based structured extraction with a JSON schema is enabled
   * for this specific file: the extracted content is sent to a VLM/LLM and
   * the response is parsed according to the provided schema.
   */
  readonly structuredExtraction?: StructuredExtractionConfig;
}

/** A footnote of a Djot document. */
export interface Footnote {
  /** Label that identifies the footnote. */
  readonly label: string;
  /** Content blocks that make up the footnote. */
  readonly content: FormattedBlock[];
}

/**
 * Format-specific metadata (discriminated union).
 *
 * Only one format type can exist per extraction result. This provides
 * type-safe, clean metadata without nested optionals.
 *
 * The `format_type` string literal is the discriminant: narrow on it before
 * reading the payload, which every member declares under the numeric key `0`.
 * The `'code'` member carries a `string` payload rather than a metadata object.
 *
 * NOTE(review): confirm that the native module really nests the payload under
 * `0` at runtime rather than placing its fields next to `format_type`.
 */
export type FormatMetadata =
  | { format_type: 'pdf'; 0: PdfMetadata }
  | { format_type: 'docx'; 0: DocxMetadata }
  | { format_type: 'excel'; 0: ExcelMetadata }
  | { format_type: 'email'; 0: EmailMetadata }
  | { format_type: 'pptx'; 0: PptxMetadata }
  | { format_type: 'archive'; 0: ArchiveMetadata }
  | { format_type: 'image'; 0: ImageMetadata }
  | { format_type: 'xml'; 0: XmlMetadata }
  | { format_type: 'text'; 0: TextMetadata }
  | { format_type: 'html'; 0: HtmlMetadata }
  | { format_type: 'ocr'; 0: OcrMetadata }
  | { format_type: 'csv'; 0: CsvMetadata }
  | { format_type: 'bibtex'; 0: BibtexMetadata }
  | { format_type: 'citation'; 0: CitationMetadata }
  | { format_type: 'fiction_book'; 0: FictionBookMetadata }
  | { format_type: 'dbf'; 0: DbfMetadata }
  | { format_type: 'jats'; 0: JatsMetadata }
  | { format_type: 'epub'; 0: EpubMetadata }
  | { format_type: 'pst'; 0: PstMetadata }
  | { format_type: 'code'; 0: string }

/**
 * A block-level element of a Djot document.
 *
 * Models structural elements such as headings, paragraphs, lists, and code
 * blocks. Blocks nest through `children`.
 */
export interface FormattedBlock {
  /** Kind of block element. */
  readonly blockType: BlockType;
  /** For headings, the heading level (1-6); for lists, the nesting level. */
  readonly level?: number;
  /** Inline content contained in the block. */
  readonly inlineContent: InlineElement[];
  /** Attributes of the element (classes, IDs, key-value pairs). */
  readonly attributes?: string;
  /** For code blocks, the language identifier. */
  readonly language?: string;
  /** For code blocks, the raw code content. */
  readonly code?: string;
  /** For containers (blockquotes, list items, divs), the nested blocks. */
  readonly children: FormattedBlock[];
}

/** One cell of a table grid, together with its position and span metadata. */
export interface GridCell {
  /** Text content of the cell. */
  readonly content: string;
  /** Row position, zero-indexed. */
  readonly row: number;
  /** Column position, zero-indexed. */
  readonly col: number;
  /** How many rows the cell spans. */
  readonly rowSpan: number;
  /** How many columns the cell spans. */
  readonly colSpan: number;
  /** `true` when the cell is a header cell. */
  readonly isHeader: boolean;
  /** Bounding box of the cell, when available. */
  readonly bbox?: BoundingBox;
}

/** Metadata describing one header/heading element. */
export interface HeaderMetadata {
  /** Level of the header, from 1 (h1) through 6 (h6). */
  readonly level: number;
  /** Text content of the header, normalized. */
  readonly text: string;
  /** Value of the HTML `id` attribute, if the element has one. */
  readonly id?: string;
  /** Depth of the header element within the document tree. */
  readonly depth: number;
  /** Byte offset of the header in the original HTML document. */
  readonly htmlOffset: number;
}

/**
 * Heading context of a chunk inside a Markdown document.
 *
 * Records the chain of headings leading from the document root down to the
 * section the chunk belongs to.
 */
export interface HeadingContext {
  /**
   * Heading chain from the document root to this chunk's section.
   * The element at index 0 is the outermost heading (h1); the last element
   * is the most specific one.
   */
  readonly headings: HeadingLevel[];
}

/** One heading entry within a heading hierarchy. */
export interface HeadingLevel {
  /** Depth of the heading (1 = h1, 2 = h2, etc.). */
  readonly level: number;
  /** Text content of the heading. */
  readonly text: string;
}

/**
 * A block of text annotated with its hierarchy level.
 *
 * The semantic heading information comes from font size clustering and
 * hierarchical analysis.
 */
export interface HierarchicalBlock {
  /** Text content of the block. */
  readonly text: string;
  /** Font size of the text in the block. */
  readonly fontSize: number;
  /**
   * Hierarchy level assigned to the block (H1-H6 or Body).
   *
   * The values mirror HTML heading tags:
   * - "h1": Top-level heading
   * - "h2": Secondary heading
   * - "h3": Tertiary heading
   * - "h4": Quaternary heading
   * - "h5": Quinary heading
   * - "h6": Senary heading
   * - "body": Body text (no heading level)
   */
  readonly level: string;
  /**
   * Bounding box of the block.
   *
   * Coordinates are given as (left, top, right, bottom) in PDF units.
   */
  readonly bbox?: number[];
}

/**
 * Configuration of hierarchy extraction for PDF text structure analysis.
 *
 * Hierarchy extraction derives document hierarchy levels (H1-H6) from font
 * size clustering and semantic analysis. When it is enabled, hierarchical
 * blocks are included in page content.
 */
export interface HierarchyConfig {
  /** Turns hierarchy extraction on. */
  readonly enabled?: boolean;
  /**
   * How many font size clusters to use for hierarchy levels (1-7).
   *
   * Default: 6, which yields H1-H6 heading levels with body text.
   * Larger values produce more fine-grained hierarchy levels.
   */
  readonly kClusters?: number;
  /** Whether hierarchy blocks carry bounding box information. */
  readonly includeBbox?: boolean;
  /**
   * Coverage threshold used for smart OCR triggering (0.0-1.0).
   *
   * Decides when OCR runs, based on text block coverage: OCR is triggered
   * when text blocks cover less than this fraction of the page.
   * Default: 0.5 (trigger OCR if less than 50% of page has text)
   */
  readonly ocrCoverageThreshold?: number;
}

/**
 * Metadata extracted from an HTML document.
 *
 * Combines document-level metadata, Open Graph data, Twitter Card metadata,
 * and the structural elements that were extracted (headers, links, images,
 * structured data).
 */
export interface HtmlMetadata {
  /** Title of the document, taken from the `<title>` tag. */
  readonly title?: string;
  /** Description of the document, taken from the `<meta name="description">` tag. */
  readonly description?: string;
  /** Keywords of the document, taken from the `<meta name="keywords">` tag and split on commas. */
  readonly keywords?: string[];
  /** Author of the document, taken from the `<meta name="author">` tag. */
  readonly author?: string;
  /** Canonical URL, taken from the `<link rel="canonical">` tag. */
  readonly canonicalUrl?: string;
  /** Base URL from the `<base href="">` tag, used for resolving relative URLs. */
  readonly baseHref?: string;
  /** Language of the document, taken from the `lang` attribute. */
  readonly language?: string;
  /** Text direction of the document, taken from the `dir` attribute. */
  readonly textDirection?: TextDirection;
  /**
   * Open Graph metadata (og:* properties) for social media.
   * Keys are names like "title", "description", "image", "url", etc.
   */
  readonly openGraph?: Record<string, string>;
  /**
   * Twitter Card metadata (twitter:* properties).
   * Keys are names like "card", "site", "creator", "title", "description", "image", etc.
   */
  readonly twitterCard?: Record<string, string>;
  /**
   * Remaining meta tags that no dedicated field covers.
   * Keys are the meta name/property attributes; values are the content.
   */
  readonly metaTags?: Record<string, string>;
  /** Header elements that were extracted, with their hierarchy. */
  readonly headers?: HeaderMetadata[];
  /** Hyperlinks that were extracted, with type classification. */
  readonly links?: LinkMetadata[];
  /** Images that were extracted, with source and dimensions. */
  readonly images?: ImageMetadataType[];
  /** Structured data blocks that were extracted. */
  readonly structuredData?: StructuredData[];
}

/**
 * Options for styled HTML output.
 *
 * Setting this on [`ExtractionConfig::html_output`] together with
 * `output_format = OutputFormat::Html` makes the pipeline build a
 * [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) in place of
 * the plain comrak-based renderer.
 */
export interface HtmlOutputConfig {
  /**
   * Inline CSS string that is injected into the output after the theme
   * stylesheet. When `css_file` is also set, this is concatenated after the
   * file's content.
   */
  readonly css?: string;
  /**
   * Path to a CSS file, loaded once when the renderer is constructed.
   * When `css` is also set, the file's content is concatenated before it.
   */
  readonly cssFile?: string;
  /** Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`]. */
  readonly theme?: HtmlTheme;
  /**
   * Prefix applied to every CSS class name that is emitted.
   *
   * Default: `"kb-"`. Pick a different prefix if the host application
   * already uses classes that start with `kb-`.
   */
  readonly classPrefix?: string;
  /**
   * When `true` (default), the resolved CSS is written into a `<style>`
   * block immediately after the opening `<div class="{prefix}doc">`.
   *
   * Set to `false` to emit only the structural markup and wire up your own
   * stylesheet targeting the `kb-*` class names.
   */
  readonly embedCss?: boolean;
}

/**
 * Built-in HTML theme selection.
 *
 * Chosen through `HtmlOutputConfig.theme`, whose documented default is
 * `Unstyled`.
 */
export declare enum HtmlTheme {
  /**
   * Sensible defaults: system font stack, neutral colours, readable line
   * measure. CSS custom properties (`--kb-*`) are all defined so user CSS
   * can override individual values.
   */
  Default = "default",
  /** GitHub Markdown-inspired palette and spacing. */
  GitHub = "github",
  /** Dark background, light text. */
  Dark = "dark",
  /** Minimal light theme with generous whitespace. */
  Light = "light",
  /**
   * No built-in stylesheet emitted. CSS custom properties are still defined
   * on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
   */
  Unstyled = "unstyled",
}

/**
 * Image extraction configuration.
 *
 * All fields are optional; defaults, where documented, are stated per field.
 */
export interface ImageExtractionConfig {
  /** Extract images from documents */
  readonly extractImages?: boolean
  /** Target DPI for image normalization */
  readonly targetDpi?: number
  /** Maximum dimension for images (width or height) */
  readonly maxImageDimension?: number
  /**
   * Whether to inject image reference placeholders into markdown output.
   * When `true` (default), image references like `![Image 1](embedded:p1_i0)`
   * are appended to the markdown. Set to `false` to extract images as data
   * without polluting the markdown output.
   */
  readonly injectPlaceholders?: boolean
  /** Automatically adjust DPI based on image content */
  readonly autoAdjustDpi?: boolean
  /** Minimum DPI threshold */
  readonly minDpi?: number
  /** Maximum DPI threshold */
  readonly maxDpi?: number
  /**
   * Maximum number of image objects to extract per PDF page.
   *
   * Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
   * can trigger extremely long or indefinite extraction times when every image
   * object on a dense page is decoded individually via the PDF extractor. Setting this
   * limit causes kreuzberg to stop collecting individual images once the count
   * per page reaches the cap and emit a warning instead.
   *
   * When omitted (default) there is no limit — all images are extracted.
   */
  readonly maxImagesPerPage?: number
  /**
   * When `true` (default), extracted images are classified by kind and grouped
   * into clusters where they appear to belong to one figure.
   */
  readonly classify?: boolean
  /**
   * When `true`, full-page renders produced during OCR preprocessing are captured
   * and returned as `ImageKind.PageRaster` entries in `ExtractionResult.images`.
   *
   * **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
   * document-level OCR bypass is active (whole-document backend). When OCR is
   * enabled and this flag is set but the active backend skips per-page rendering,
   * a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
   *
   * Defaults to `false`. Enable when downstream consumers need page thumbnails
   * (e.g. citation previews, visual grounding).
   */
  readonly includePageRasters?: boolean
  /**
   * Run OCR on extracted images and include the recognized text in the document content.
   *
   * When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
   * are processed with the configured OCR backend. Set to `false` to extract images
   * without OCR processing, even when OCR is enabled.
   */
  readonly runOcrOnImages?: boolean
  /**
   * When `true`, image OCR results are rendered as plain text without the
   * `![...](...)` markdown placeholder. Only takes effect when `runOcrOnImages`
   * is also `true`.
   */
  readonly ocrTextOnly?: boolean
  /**
   * When `true` and `ocrTextOnly` is `false`, append the OCR text after
   * the image placeholder in the rendered output.
   */
  readonly appendOcrText?: boolean
}

/**
 * Heuristic classification of what an image likely depicts.
 *
 * Member values are lowercase snake_case strings.
 */
export declare enum ImageKind {
  /** Photographic image (natural scene, photograph) */
  Photograph = "photograph",
  /** Technical or schematic diagram */
  Diagram = "diagram",
  /** Chart, graph, or plot */
  Chart = "chart",
  /** Freehand or technical drawing */
  Drawing = "drawing",
  /** Text-heavy image (scanned text, document) */
  TextBlock = "text_block",
  /** Decorative element or border */
  Decoration = "decoration",
  /** Logo or brand mark */
  Logo = "logo",
  /** Small icon */
  Icon = "icon",
  /** Fragment of a larger tiled image (tile of a technical drawing) */
  TileFragment = "tile_fragment",
  /** Mask or transparency map */
  Mask = "mask",
  /**
   * Full-page render produced during OCR preprocessing; used as a citation thumbnail.
   * Captured only when `ImageExtractionConfig.includePageRasters` is `true`.
   */
  PageRaster = "page_raster",
  /** Could not classify with reasonable confidence */
  Unknown = "unknown",
}

/**
 * Image metadata extracted from image files.
 *
 * Includes dimensions, format, and EXIF data. Every field is optional and
 * may be absent.
 */
export interface ImageMetadata {
  /** Image width in pixels */
  readonly width?: number
  /** Image height in pixels */
  readonly height?: number
  /** Image format (e.g., "PNG", "JPEG", "TIFF") */
  readonly format?: string
  /** EXIF metadata tags, as a string-to-string map */
  readonly exif?: Record<string, string>
}

/**
 * Image element metadata.
 *
 * Describes an image by its source, descriptive attributes and
 * classification (see `ImageType`).
 */
export interface ImageMetadataType {
  /** Image source (URL, data URI, or SVG content) */
  readonly src: string
  /** Alternative text from alt attribute */
  readonly alt?: string
  /** Title attribute */
  readonly title?: string
  /** Image dimensions in `[width, height]` order, if available */
  readonly dimensions?: Array<number>
  /** Image type classification */
  readonly imageType: ImageType
  /**
   * Additional attributes as key-value pairs.
   *
   * NOTE(review): presumably each inner array is a two-element
   * `[key, value]` pair — confirm against the binding.
   */
  readonly attributes: Array<Array<string>>
}

/**
 * Image preprocessing configuration for OCR.
 *
 * These settings control how images are preprocessed before OCR to improve
 * text recognition quality. Different preprocessing strategies work better
 * for different document types. All fields are optional.
 */
export interface ImagePreprocessingConfig {
  /** Target DPI for the image (300 is standard, 600 for small text). */
  readonly targetDpi?: number
  /** Auto-detect and correct image rotation. */
  readonly autoRotate?: boolean
  /** Correct skew (tilted images). */
  readonly deskew?: boolean
  /** Remove noise from the image. */
  readonly denoise?: boolean
  /** Enhance contrast for better text visibility. */
  readonly contrastEnhance?: boolean
  /**
   * Binarization method: "otsu", "sauvola", "adaptive".
   *
   * Typed as a plain `string`, so the compiler does not check the value.
   */
  readonly binarizationMethod?: string
  /** Invert colors (white text on black → black on white). */
  readonly invertColors?: boolean
}

/**
 * Image preprocessing metadata.
 *
 * Tracks the transformations applied to an image during OCR preprocessing,
 * including DPI normalization, resizing, and resampling.
 *
 * Pair-valued fields are exposed as plain number arrays; the element order
 * is given in each field's description.
 */
export interface ImagePreprocessingMetadata {
  /** Original image dimensions in pixels, in `[width, height]` order */
  readonly originalDimensions: Array<number>
  /** Original image DPI, in `[horizontal, vertical]` order */
  readonly originalDpi: Array<number>
  /** Target DPI from configuration */
  readonly targetDpi: number
  /** Scaling factor applied to the image */
  readonly scaleFactor: number
  /** Whether DPI was auto-adjusted based on content */
  readonly autoAdjusted: boolean
  /** Final DPI after processing */
  readonly finalDpi: number
  /** New dimensions after resizing (present only if resized) */
  readonly newDimensions?: Array<number>
  /** Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.) */
  readonly resampleMethod: string
  /** Whether dimensions were clamped to the configured maximum image dimension (`max_image_dimension`) */
  readonly dimensionClamped: boolean
  /** Calculated optimal DPI (present only if automatic DPI adjustment, `auto_adjust_dpi`, is enabled) */
  readonly calculatedDpi?: number
  /** Whether resize was skipped (dimensions already optimal) */
  readonly skippedResize: boolean
  /** Error message if resize failed */
  readonly resizeError?: string
}

/**
 * Image type classification.
 *
 * Member values are lowercase kebab-case strings (e.g. `"data-uri"`).
 */
export declare enum ImageType {
  /** Data URI image */
  DataUri = "data-uri",
  /** Inline SVG */
  InlineSvg = "inline-svg",
  /** External image URL */
  External = "external",
  /** Relative path image */
  Relative = "relative",
}

/**
 * Inline element within a block.
 *
 * Represents text with formatting, links, images, etc. The kind of element
 * is given by `elementType` (see `InlineType`).
 */
export interface InlineElement {
  /** Type of inline element */
  readonly elementType: InlineType
  /** Text content */
  readonly content: string
  /**
   * Element attributes.
   *
   * NOTE(review): exposed as a single string; its format (serialized
   * structure vs. raw attribute text) is not visible here — confirm.
   */
  readonly attributes?: string
  /** Additional metadata (e.g., href for links, src/alt for images) */
  readonly metadata?: Record<string, string>
}

/**
 * Types of inline elements in Djot.
 *
 * Each member's value is the lowercase snake_case form of its name
 * (e.g. `RawInline` is `"raw_inline"`, `FootnoteRef` is `"footnote_ref"`).
 */
export declare enum InlineType {
  Text = "text",
  Strong = "strong",
  Emphasis = "emphasis",
  Highlight = "highlight",
  Subscript = "subscript",
  Superscript = "superscript",
  Insert = "insert",
  Delete = "delete",
  Code = "code",
  Link = "link",
  Image = "image",
  Span = "span",
  Math = "math",
  RawInline = "raw_inline",
  FootnoteRef = "footnote_ref",
  Symbol = "symbol",
}

/**
 * JATS (Journal Article Tag Suite) metadata.
 *
 * All fields are optional.
 */
export interface JatsMetadata {
  /** Copyright text, if any. */
  readonly copyright?: string
  /** License text, if any. NOTE(review): whether this is free text or a URL is not visible here. */
  readonly license?: string
  /** History dates as a string-to-string map. NOTE(review): presumably date type → date value; confirm. */
  readonly historyDates?: Record<string, string>
  /** Contributor role entries. */
  readonly contributorRoles?: Array<ContributorRole>
}

/**
 * Extracted keyword with metadata.
 *
 * `text`, `score` and `algorithm` are always present; `positions` is optional.
 */
export interface Keyword {
  /** The keyword text. */
  readonly text: string
  /** Relevance score (higher is better, algorithm-specific range). */
  readonly score: number
  /** Algorithm that extracted this keyword. */
  readonly algorithm: KeywordAlgorithm
  /** Optional positions where keyword appears in text (character offsets). */
  readonly positions?: Array<number>
}

/**
 * Keyword algorithm selection.
 *
 * Used by `KeywordConfig.algorithm` to choose an extractor and reported on
 * each `Keyword.algorithm`.
 */
export declare enum KeywordAlgorithm {
  /** YAKE (Yet Another Keyword Extractor) - statistical approach */
  Yake = "yake",
  /** RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based */
  Rake = "rake",
}

/**
 * Keyword extraction configuration.
 *
 * All fields are optional; documented defaults apply when a field is omitted.
 */
export interface KeywordConfig {
  /** Algorithm to use for extraction. */
  readonly algorithm?: KeywordAlgorithm
  /** Maximum number of keywords to extract (default: 10). */
  readonly maxKeywords?: number
  /**
   * Minimum score threshold (0.0-1.0, default: 0.0).
   *
   * Keywords with scores below this threshold are filtered out.
   * Note: Score ranges differ between algorithms.
   */
  readonly minScore?: number
  /**
   * N-gram range for keyword extraction, as a `[min, max]` array.
   *
   * `[1, 1]` = unigrams only
   * `[1, 2]` = unigrams and bigrams
   * `[1, 3]` = unigrams, bigrams, and trigrams (default)
   */
  readonly ngramRange?: Array<number>
  /**
   * Language code for stopword filtering (e.g., "en", "de", "fr").
   *
   * If omitted, no stopword filtering is applied.
   */
  readonly language?: string
  /** YAKE-specific tuning parameters. */
  readonly yakeParams?: YakeParams
  /** RAKE-specific tuning parameters. */
  readonly rakeParams?: RakeParams
}

/**
 * Language detection configuration.
 *
 * All fields are optional.
 */
export interface LanguageDetectionConfig {
  /** Enable language detection */
  readonly enabled?: boolean
  /** Minimum confidence threshold (0.0-1.0) */
  readonly minConfidence?: number
  /** Detect multiple languages in the document */
  readonly detectMultiple?: boolean
}

/**
 * The 17 canonical document layout classes.
 *
 * All model backends (RT-DETR, YOLO, etc.) map their native class IDs
 * to this shared set. Models with fewer classes (DocLayNet: 11, PubLayNet: 5)
 * map to the closest equivalent.
 *
 * Wire format is snake_case in all serializers (JSON, TOML, YAML); the
 * member values below are those snake_case strings.
 */
export declare enum LayoutClass {
  Caption = "caption",
  Footnote = "footnote",
  Formula = "formula",
  ListItem = "list_item",
  PageFooter = "page_footer",
  PageHeader = "page_header",
  Picture = "picture",
  SectionHeader = "section_header",
  Table = "table",
  Text = "text",
  Title = "title",
  DocumentIndex = "document_index",
  Code = "code",
  CheckboxSelected = "checkbox_selected",
  CheckboxUnselected = "checkbox_unselected",
  Form = "form",
  KeyValueRegion = "key_value_region",
}

/**
 * A single layout detection result.
 *
 * All three fields are required.
 */
export interface LayoutDetection {
  /** Layout class assigned to this detection. */
  readonly className: LayoutClass
  /** Confidence score. NOTE(review): presumably in the 0.0 to 1.0 range, as documented for `LayoutRegion.confidence` — confirm. */
  readonly confidence: number
  /** Bounding box of the detection. NOTE(review): coordinate space is not visible here. */
  readonly bbox: BBox
}

/**
 * Layout detection configuration.
 *
 * Controls layout detection behavior in the extraction pipeline.
 * When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
 * is enabled for PDF extraction.
 */
export interface LayoutDetectionConfig {
  /** Confidence threshold override (omitted = use model default). */
  readonly confidenceThreshold?: number
  /** Whether to apply postprocessing heuristics (default: true). */
  readonly applyHeuristics?: boolean
  /**
   * Table structure recognition model.
   *
   * Controls which model is used for table cell detection within layout-detected
   * table regions. Defaults to [`TableModel::Tatr`].
   */
  readonly tableModel?: TableModel
  /**
   * Hardware acceleration for ONNX models (layout detection + table structure).
   *
   * When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
   * is used for inference. When omitted, the provider is auto-selected per platform.
   */
  readonly acceleration?: AccelerationConfig
}

/**
 * A detected layout region on a page.
 *
 * When layout detection is enabled, each page may have layout regions
 * identifying different content types (text, pictures, tables, etc.)
 * with confidence scores and spatial positions.
 *
 * Unlike `LayoutDetection`, every field here is optional and `className`
 * is a plain string rather than a `LayoutClass` member.
 */
export interface LayoutRegion {
  /** Layout class name (e.g. "picture", "table", "text", "section_header"). */
  readonly className?: string
  /** Confidence score from the layout detection model (0.0 to 1.0). */
  readonly confidence?: number
  /** Bounding box in document coordinate space. */
  readonly boundingBox?: BoundingBox
  /** Fraction of the page area covered by this region (0.0 to 1.0). */
  readonly areaFraction?: number
}

/**
 * Link element metadata.
 *
 * Only `title` is optional; the array fields are always present.
 */
export interface LinkMetadata {
  /** The href URL value */
  readonly href: string
  /** Link text content (normalized) */
  readonly text: string
  /** Optional title attribute */
  readonly title?: string
  /** Link type classification */
  readonly linkType: LinkType
  /** Rel attribute values */
  readonly rel: Array<string>
  /**
   * Additional attributes as key-value pairs.
   *
   * NOTE(review): presumably each inner array is a two-element
   * `[key, value]` pair — confirm against the binding.
   */
  readonly attributes: Array<Array<string>>
}

/**
 * Link type classification.
 *
 * Member values are lowercase strings.
 */
export declare enum LinkType {
  /** Anchor link (#section) */
  Anchor = "anchor",
  /** Internal link (same domain) */
  Internal = "internal",
  /** External link (different domain) */
  External = "external",
  /** Email link (mailto:) */
  Email = "email",
  /** Phone link (tel:) */
  Phone = "phone",
  /** Other link type */
  Other = "other",
}

/**
 * Type of list detection.
 *
 * Member values are PascalCase strings identical to the member names.
 */
export declare enum ListType {
  /** Bullet points (-, *, •, etc.) */
  Bullet = "Bullet",
  /** Numbered lists (1., 2., etc.) */
  Numbered = "Numbered",
  /** Lettered lists (a., b., A., B., etc.) */
  Lettered = "Lettered",
  /** Indented items */
  Indented = "Indented",
}

/**
 * Configuration for an LLM provider/model via liter-llm.
 *
 * Each feature (VLM OCR, VLM embeddings, structured extraction) carries
 * its own `LlmConfig`, allowing different providers per feature.
 * @example
 * Equivalent TOML configuration-file form:
 * ```toml
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * api_key = "sk-..."  # or use KREUZBERG_LLM_API_KEY env var
 * ```
 */
export interface LlmConfig {
  /**
   * Provider/model string using liter-llm routing format.
   *
   * Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
   * `"groq/llama-3.1-70b-versatile"`.
   */
  readonly model?: string
  /**
   * API key for the provider. When omitted, liter-llm falls back to
   * the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
   */
  readonly apiKey?: string
  /** Custom base URL override for the provider endpoint. */
  readonly baseUrl?: string
  /** Request timeout in seconds (default: 60). */
  readonly timeoutSecs?: number
  /** Maximum retry attempts (default: 3). */
  readonly maxRetries?: number
  /** Sampling temperature for generation tasks. */
  readonly temperature?: number
  /** Maximum tokens to generate. */
  readonly maxTokens?: number
}

/**
 * Token usage and cost data for a single LLM call made during extraction.
 *
 * Populated when VLM OCR, structured extraction, or LLM-based embeddings
 * are used. Multiple entries may be present when multiple LLM calls occur
 * within one extraction (e.g. VLM OCR + structured extraction).
 *
 * Every field is optional and may be absent.
 */
export interface LlmUsage {
  /** The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514"). */
  readonly model?: string
  /**
   * The pipeline stage that triggered this LLM call
   * (e.g. "vlm_ocr", "structured_extraction", "embeddings").
   */
  readonly source?: string
  /** Number of input/prompt tokens consumed. */
  readonly inputTokens?: number
  /** Number of output/completion tokens generated. */
  readonly outputTokens?: number
  /** Total tokens (input + output). */
  readonly totalTokens?: number
  /** Estimated cost in USD based on the provider's published pricing. */
  readonly estimatedCost?: number
  /** Why the model stopped generating (e.g. "stop", "length", "content_filter"). */
  readonly finishReason?: string
}

/**
 * Extraction result metadata.
 *
 * Contains common fields applicable to all formats, format-specific metadata
 * via a discriminated union, and additional custom fields from postprocessors.
 *
 * Every field is optional.
 */
export interface Metadata {
  /** Document title */
  readonly title?: string
  /** Document subject or description */
  readonly subject?: string
  /** Primary author(s) - always an array for consistency, even for a single author */
  readonly authors?: Array<string>
  /** Keywords/tags - always an array for consistency */
  readonly keywords?: Array<string>
  /** Primary language (ISO 639 code) */
  readonly language?: string
  /** Creation timestamp (ISO 8601 format) */
  readonly createdAt?: string
  /** Last modification timestamp (ISO 8601 format) */
  readonly modifiedAt?: string
  /** User who created the document */
  readonly createdBy?: string
  /** User who last modified the document */
  readonly modifiedBy?: string
  /** Page/slide/sheet structure with boundaries */
  readonly pages?: PageStructure
  /**
   * Format-specific metadata (discriminated union)
   *
   * Contains detailed metadata specific to the document format.
   * Serialized as a nested `"format"` object with a `format_type` discriminator field.
   */
  readonly format?: FormatMetadata
  /** Image preprocessing metadata (when OCR preprocessing was applied) */
  readonly imagePreprocessing?: ImagePreprocessingMetadata
  /** JSON schema (for structured data extraction) */
  readonly jsonSchema?: JsonValue
  /** Error metadata (for batch operations) */
  readonly error?: ErrorMetadata
  /**
   * Extraction duration in milliseconds (for benchmarking).
   *
   * This field is populated by batch extraction to provide per-file timing
   * information. It is absent for single-file extraction (which uses external timing).
   */
  readonly extractionDurationMs?: number
  /** Document category (from frontmatter or classification). */
  readonly category?: string
  /** Document tags (from frontmatter). */
  readonly tags?: Array<string>
  /** Document version string (from frontmatter). */
  readonly documentVersion?: string
  /** Abstract or summary text (from frontmatter). */
  readonly abstractText?: string
  /**
   * Output format identifier (e.g., "markdown", "html", "text").
   *
   * Set by the output format pipeline stage when format conversion is applied.
   * Previously stored in `metadata.additional["output_format"]`.
   */
  readonly outputFormat?: string
  /**
   * Whether OCR was used during extraction.
   *
   * Set to `true` whenever the extraction pipeline ran an OCR backend
   * (Tesseract, PaddleOCR, VLM, etc.) and used that output as the primary
   * or fallback text. `false` means native text extraction was used exclusively.
   */
  readonly ocrUsed?: boolean
  /**
   * Additional custom fields from postprocessors.
   *
   * Serialized as a nested `"additional"` object (not flattened at root level).
   * On the Rust side the keys are `Cow<'static, str>` so static string keys
   * avoid allocation; here they are ordinary string keys.
   */
  readonly additional?: Record<string, JsonValue>
}

/**
 * Combined paths to all models needed for OCR (backward compatibility).
 *
 * All four paths are required.
 */
export interface ModelPaths {
  /** Path to the detection model directory. */
  readonly detModel: string
  /** Path to the classification model directory. */
  readonly clsModel: string
  /** Path to the recognition model directory. */
  readonly recModel: string
  /** Path to the character dictionary file. */
  readonly dictFile: string
}

/**
 * Tagged enum for node content. Each variant carries only type-specific data.
 *
 * Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
 * Go/Java/TypeScript bindings.
 *
 * In TypeScript this is a discriminated union: narrow on `node_type` (for
 * example in a `switch`) to access the variant's fields. Note the mixed
 * casing: the discriminator key and its values are snake_case, while
 * multi-word payload fields are camelCase (`imageIndex`, `headingLevel`,
 * `headingText`). Several variants (`quote`, `page_break`,
 * `definition_list`) carry no payload at all.
 */
export type NodeContent =
  | { node_type: 'title'; text: string }
  | { node_type: 'heading'; level: number; text: string }
  | { node_type: 'paragraph'; text: string }
  | { node_type: 'list'; ordered: boolean }
  | { node_type: 'list_item'; text: string }
  | { node_type: 'table'; grid: TableGrid }
  | { node_type: 'image'; description: string; imageIndex: number; src: string }
  | { node_type: 'code'; text: string; language: string }
  | { node_type: 'quote' }
  | { node_type: 'formula'; text: string }
  | { node_type: 'footnote'; text: string }
  | { node_type: 'group'; label: string; headingLevel: number; headingText: string }
  | { node_type: 'page_break' }
  | { node_type: 'slide'; number: number; title: string }
  | { node_type: 'definition_list' }
  | { node_type: 'definition_item'; term: string; definition: string }
  | { node_type: 'citation'; key: string; text: string }
  | { node_type: 'admonition'; kind: string; title: string }
  | { node_type: 'raw_block'; format: string; content: string }
  | { node_type: 'metadata_block'; entries: Array<Array<string>> }

/**
 * Trait for OCR backend plugins.
 *
 * Implement this trait to add custom OCR capabilities. OCR backends can be:
 * - Native Rust implementations (like Tesseract)
 * - FFI bridges to Python libraries (like EasyOCR, PaddleOCR)
 * - Cloud-based OCR services (Google Vision, AWS Textract, etc.)
 *
 * # Thread Safety
 *
 * OCR backends must be thread-safe (`Send + Sync`) to support concurrent processing.
 *
 * # Declared return types
 *
 * NOTE(review): the method descriptions below are carried over from the Rust
 * trait and speak of booleans, lists and `ExtractionResult` values, while
 * every method here is declared to return `string` (or `Promise<string>`).
 * Presumably the binding expects serialized values — confirm the expected
 * string encoding before implementing this interface.
 */
export interface OcrBackend {
  /** Returns the backend's name. */
  name(): string
  /**
   * Process an image and extract text via OCR.
   *
   * # Reading `backend_options`
   *
   * Backends that support runtime tuning can read `config.backend_options` and
   * deserialize only the keys they care about. Unknown keys are silently ignored,
   * so multiple backends can coexist in a pipeline without key conflicts.
   *
   * Rust trait reference implementation:
   *
   * ```rust
   * async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
   *     // Read backend-specific options; unknown keys are silently ignored.
   *     let fast_mode = config.backend_options
   *         .as_ref()
   *         .and_then(|v| v.get("mode"))
   *         .and_then(|v| v.as_str())
   *         .map(|s| s == "fast")
   *         .unwrap_or(false);
   *
   *     if image_bytes.is_empty() {
   *         return Err(kreuzberg::KreuzbergError::Validation {
   *             message: "Empty image data".to_string(),
   *             source: None,
   *         });
   *     }
   *
   *     let text = if fast_mode {
   *         "Fast OCR result".to_string()
   *     } else {
   *         format!("Extracted text in language: {}", config.language)
   *     };
   *
   *     Ok(ExtractionResult {
   *         content: text,
   *         mime_type: Cow::Borrowed("text/plain"),
   *         ..Default::default()
   *     })
   * }
   * ```
   * @param imageBytes - Raw image data (JPEG, PNG, TIFF, etc.)
   *
   * @param config - OCR configuration (language, PSM mode, etc.)
   *
   * @returns An `ExtractionResult` containing the extracted text and metadata.
   * NOTE(review): declared as `Promise<string>` — presumably a serialized
   * result; confirm.
   *
   * @throws `KreuzbergError::Ocr` - OCR processing failed;
   * `KreuzbergError::Validation` - Invalid image format or configuration;
   * `KreuzbergError::Io` - I/O errors (these always bubble up)
   */
  processImage(imageBytes: Uint8Array, config?: OcrConfig | undefined | null): Promise<string>
  /**
   * Process a file and extract text via OCR.
   *
   * Default implementation reads the file and calls `process_image`
   * (`processImage` in this interface).
   * Override for custom file handling or optimizations.
   * @param path - Path to the image file
   *
   * @param config - OCR configuration
   *
   * @throws Same as `processImage`, plus file I/O errors.
   */
  processImageFile?(path: string, config?: OcrConfig | undefined | null): Promise<string>
  /**
   * Check if this backend supports a given language code.
   * @param lang - ISO 639-2/3 language code (e.g., "eng", "deu", "fra")
   *
   * @returns `true` if the language is supported, `false` otherwise.
   * NOTE(review): declared return type is `string`, not `boolean` — confirm
   * the expected encoding.
   */
  supportsLanguage(lang: string): string
  /**
   * Get the backend type identifier.
   * @returns The backend type enum value. NOTE(review): declared as `string`;
   * presumably one of the `OcrBackendType` values — confirm.
   */
  backendType(): string
  /**
   * Optional: Get a list of all supported languages.
   *
   * Defaults to empty list. Override to provide comprehensive language support info.
   *
   * NOTE(review): declared return type is a single `string`, not an array —
   * confirm the expected encoding of the list.
   */
  supportedLanguages?(): string
  /**
   * Optional: Check if the backend supports table detection.
   *
   * Defaults to `false`. Override if your backend can detect and extract tables.
   *
   * NOTE(review): declared return type is `string`, not `boolean` — confirm.
   */
  supportsTableDetection?(): string
  /**
   * Check if the backend supports direct document-level processing (e.g. for PDFs).
   *
   * Defaults to `false`. Override if the backend has optimized document processing.
   *
   * NOTE(review): declared return type is `string`, not `boolean` — confirm.
   */
  supportsDocumentProcessing?(): string
  /**
   * Process a document file directly via OCR.
   *
   * Only called if `supports_document_processing` (`supportsDocumentProcessing`
   * in this interface) returns `true`.
   * @param path - Path to the document file (e.g. .pdf)
   *
   * @param config - OCR configuration
   */
  processDocument?(path: string, config?: OcrConfig | undefined | null): Promise<string>
}

/**
 * OCR backend types.
 *
 * Member values are strings identical to the member names (e.g. `"Tesseract"`),
 * not the lowercase identifiers listed for `OcrConfig.backend`.
 */
export declare enum OcrBackendType {
  /** Tesseract OCR (native Rust binding) */
  Tesseract = "Tesseract",
  /** EasyOCR (Python-based, via FFI) */
  EasyOCR = "EasyOCR",
  /** PaddleOCR (Python-based, via FFI) */
  PaddleOCR = "PaddleOCR",
  /** Custom/third-party OCR backend */
  Custom = "Custom",
}

/**
 * Bounding geometry for an OCR element.
 *
 * Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
 * (from PaddleOCR and rotated text detection).
 *
 * Discriminated on `type`: narrow on `'rectangle'` or `'quadrilateral'`
 * before reading the variant's fields.
 *
 * NOTE(review): the quadrilateral's `points` is declared as a single
 * `string` rather than an array of coordinates — presumably a serialized
 * form of the four points; confirm the encoding.
 */
export type OcrBoundingGeometry =
  | { type: 'rectangle'; left: number; top: number; width: number; height: number }
  | { type: 'quadrilateral'; points: string }

/**
 * Confidence scores for an OCR element.
 *
 * Separates detection confidence (how confident that text exists at this location)
 * from recognition confidence (how confident about the actual text content).
 */
export interface OcrConfidence {
  /**
   * Detection confidence: how confident the OCR engine is that text exists here.
   *
   * PaddleOCR provides this as `box_score`, Tesseract doesn't have a direct equivalent.
   * Range: 0.0 to 1.0 (absent if not available).
   */
  readonly detection?: number
  /**
   * Recognition confidence: how confident about the text content.
   *
   * Range: 0.0 to 1.0.
   */
  readonly recognition?: number
}

/**
 * OCR configuration.
 *
 * All fields are optional; documented defaults apply when a field is omitted.
 */
export interface OcrConfig {
  /**
   * Whether OCR is enabled.
   *
   * Setting `enabled: false` is a shorthand for `disable_ocr: true` on the parent
   * [`ExtractionConfig`](crate::core::config::ExtractionConfig). Images return
   * metadata only; PDFs use native text extraction without OCR fallback.
   *
   * Defaults to `true`. When `false`, all other OCR settings are ignored.
   */
  readonly enabled?: boolean
  /**
   * OCR backend: tesseract, easyocr, paddleocr.
   * `"vlm"` is also referenced by `vlmConfig` below.
   */
  readonly backend?: string
  /** Language code (e.g., "eng", "deu") */
  readonly language?: string
  /** Tesseract-specific configuration (optional) */
  readonly tesseractConfig?: TesseractConfig
  /** Output format for OCR results (optional, for format conversion) */
  readonly outputFormat?: OutputFormat
  /** PaddleOCR-specific configuration (optional, JSON passthrough) */
  readonly paddleOcrConfig?: JsonValue
  /**
   * Arbitrary per-call options passed through to the backend unchanged.
   *
   * Custom OCR backends and built-in backends that support runtime tuning
   * can read this value and deserialize the keys they care about. Keys
   * unknown to the backend are silently ignored.
   *
   * This is the recommended extension point for per-call parameters that
   * are not covered by the typed fields above (e.g. mode switching,
   * preprocessing flags, inference batch size).
   *
   * **Scope:** when `pipeline` is omitted, this value is propagated to the
   * primary stage of the auto-constructed pipeline. When `pipeline` is
   * explicitly set, this field has **no effect** — the caller must set
   * `OcrPipelineStage.backend_options` directly on the relevant stage(s)
   * instead.
   *
   * Example:
   * ```json
   * { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
   * ```
   */
  readonly backendOptions?: JsonValue
  /** OCR element extraction configuration */
  readonly elementConfig?: OcrElementConfig
  /**
   * Quality thresholds for the native-text-to-OCR fallback decision.
   * When omitted, uses compiled defaults (matching previous hardcoded behavior).
   */
  readonly qualityThresholds?: OcrQualityThresholds
  /**
   * Multi-backend OCR pipeline configuration. When set, enables weighted
   * fallback across multiple OCR backends based on output quality.
   * When omitted, uses the single `backend` field (same as today).
   */
  readonly pipeline?: OcrPipelineConfig
  /**
   * Enable automatic page rotation based on orientation detection.
   *
   * When enabled, uses Tesseract's `DetectOrientationScript()` to detect
   * page orientation (0/90/180/270 degrees) before OCR. If the page is
   * rotated with high confidence, the image is corrected before recognition.
   * This is critical for handling rotated scanned documents.
   */
  readonly autoRotate?: boolean
  /**
   * VLM (Vision Language Model) OCR configuration.
   *
   * Required when `backend` is `"vlm"`. Uses liter-llm to send page
   * images to a vision model for text extraction.
   */
  readonly vlmConfig?: LlmConfig
  /**
   * Custom Jinja2 prompt template for VLM OCR.
   *
   * When omitted, uses the default template. Available variables:
   * - `{{ language }}` — The document language code (e.g., "eng", "deu").
   */
  readonly vlmPrompt?: string
  /**
   * Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
   *
   * Not user-configurable via config files — injected at runtime from
   * `ExtractionConfig::acceleration` before each `process_image` call.
   */
  readonly acceleration?: AccelerationConfig
  /**
   * Caller-supplied Tesseract `traineddata` bytes per language code.
   *
   * Primary use case is the WASM build, which has no filesystem and cannot
   * download tessdata at runtime. Native builds typically rely on
   * `TessdataManager` and ignore this field. When present, the WASM
   * Tesseract backend prefers these bytes over its compile-time-bundled
   * English data.
   *
   * Skipped by serde to keep config files small — supply via the typed API
   * at runtime.
   */
  readonly tessdataBytes?: Record<string, Uint8Array>
}

/**
 * A unified OCR element representing detected text with full metadata.
 *
 * This is the primary type for structured OCR output, preserving all information
 * from both Tesseract and PaddleOCR backends.
 *
 * Every field is declared optional, so consumers should handle absent values.
 */
export interface OcrElement {
  /** The recognized text content. */
  readonly text?: string
  /** Bounding geometry (rectangle or quadrilateral). */
  readonly geometry?: OcrBoundingGeometry
  /** Confidence scores for detection and recognition. */
  readonly confidence?: OcrConfidence
  /** Hierarchical level (word, line, block, page). */
  readonly level?: OcrElementLevel
  /** Rotation information (if detected). */
  readonly rotation?: OcrRotation
  /** Page number (1-indexed). */
  readonly pageNumber?: number
  /**
   * Parent element ID for hierarchical relationships.
   *
   * Only used for Tesseract output which has word -> line -> block hierarchy.
   */
  readonly parentId?: string
  /** Backend-specific metadata that doesn't fit the unified schema. */
  readonly backendMetadata?: Record<string, JsonValue>
}

/**
 * Configuration for OCR element extraction.
 *
 * Controls whether structured OCR elements are returned and how they are
 * filtered. Every option is optional.
 */
export interface OcrElementConfig {
  /**
   * Whether to include OCR elements in the extraction result.
   *
   * When true, the `ocr_elements` field in `ExtractionResult` will be populated.
   * NOTE(review): presumably surfaced as `ocrElements` in this binding — confirm.
   */
  readonly includeElements?: boolean
  /**
   * Minimum hierarchical level to include.
   *
   * Elements below this level (e.g., words when `minLevel` is `OcrElementLevel.Line`)
   * will be excluded.
   */
  readonly minLevel?: OcrElementLevel
  /**
   * Minimum recognition confidence threshold (0.0-1.0).
   *
   * Elements with confidence below this threshold will be filtered out.
   */
  readonly minConfidence?: number
  /**
   * Whether to build hierarchical relationships between elements.
   *
   * When true, `OcrElement.parentId` will be populated based on spatial containment.
   * Only meaningful for Tesseract output.
   */
  readonly buildHierarchy?: boolean
}

/**
 * Hierarchical level of an OCR element.
 *
 * Maps to Tesseract's page segmentation hierarchy and provides
 * equivalent semantics for PaddleOCR. Members are declared from the finest
 * level (word) to the coarsest (page); each maps to a lowercase string value.
 */
export declare enum OcrElementLevel {
  /** Individual word */
  Word = "word",
  /** Line of text (default for PaddleOCR) */
  Line = "line",
  /** Paragraph or text block */
  Block = "block",
  /** Page-level element */
  Page = "page",
}

/**
 * OCR extraction result.
 *
 * Result of performing OCR on an image or scanned document,
 * including recognized text and detected tables. `content`, `mimeType`,
 * `metadata` and `tables` are always present; the remaining fields are optional.
 */
export interface OcrExtractionResult {
  /** Recognized text content */
  readonly content: string
  /** Original MIME type of the processed image */
  readonly mimeType: string
  /** OCR processing metadata (confidence scores, language, etc.) as a string-keyed JSON map */
  readonly metadata: Record<string, JsonValue>
  /** Tables detected and extracted via OCR */
  readonly tables: Array<OcrTable>
  /**
   * Structured OCR elements with bounding boxes and confidence scores.
   * Available when TSV output is requested or table detection is enabled.
   */
  readonly ocrElements?: Array<OcrElement>
  /**
   * Structured document produced from hOCR parsing.
   * Carries paragraph structure, bounding boxes, and confidence scores
   * that the flattened `content` string discards.
   *
   * NOTE(review): typed as `string` here — presumably a serialized form of the
   * structured document; confirm the encoding before parsing it.
   */
  readonly internalDocument?: string
}

/**
 * OCR processing metadata.
 *
 * Captures information about OCR processing configuration and results.
 * Every property is optional.
 */
export interface OcrMetadata {
  /** OCR language code(s) used */
  readonly language?: string
  /** Tesseract Page Segmentation Mode (PSM) */
  readonly psm?: number
  /** Output format (e.g., "text", "hocr") */
  readonly outputFormat?: string
  /** Number of tables detected */
  readonly tableCount?: number
  /** NOTE(review): undocumented upstream — presumably the row count of a detected table; confirm. */
  readonly tableRows?: number
  /** NOTE(review): undocumented upstream — presumably the column count of a detected table; confirm. */
  readonly tableCols?: number
}

/**
 * Multi-backend OCR pipeline with quality-based fallback.
 *
 * Backends are tried in priority order (highest first). After each backend
 * produces output, quality is evaluated. If it meets
 * `qualityThresholds.pipelineMinQuality`, the result is accepted. Otherwise
 * the next backend is tried. Both properties are required.
 */
export interface OcrPipelineConfig {
  /** List of backend stages to try. Sorted by `priority` (descending) at runtime. */
  readonly stages: Array<OcrPipelineStage>
  /** Quality thresholds for deciding whether to accept a result or try the next backend. */
  readonly qualityThresholds: OcrQualityThresholds
}

/**
 * A single backend stage in the OCR pipeline.
 *
 * Only `backend` and `priority` are required; every override is optional.
 */
export interface OcrPipelineStage {
  /** Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name. */
  readonly backend: string
  /** Priority weight (higher = tried first). Stages are sorted by priority descending. */
  readonly priority: number
  /** Language override for this stage (when omitted, the parent OcrConfig.language is used). */
  readonly language?: string
  /** Tesseract-specific config override for this stage. */
  readonly tesseractConfig?: TesseractConfig
  /**
   * PaddleOCR-specific config for this stage.
   *
   * Typed as free-form JSON in this declaration.
   * NOTE(review): presumably expected to follow the shape of `PaddleOcrConfig` — confirm.
   */
  readonly paddleOcrConfig?: JsonValue
  /** VLM config override for this pipeline stage. */
  readonly vlmConfig?: LlmConfig
  /**
   * Arbitrary per-call options passed through to the backend unchanged.
   *
   * Backends that support runtime tuning (mode switching, preprocessing
   * flags, inference parameters, etc.) read this value and deserialize
   * the keys they care about. Keys unknown to the backend are silently
   * ignored, so options from different backends can coexist in the same
   * config without conflict.
   *
   * Example (custom backend):
   * ```json
   * { "mode": "fast", "enable_layout": true }
   * ```
   */
  readonly backendOptions?: JsonValue
}

/**
 * Quality thresholds for OCR fallback decisions and pipeline quality gating.
 *
 * All fields default to the values that match the previous hardcoded behavior,
 * so `OcrQualityThresholds::default()` preserves existing semantics exactly.
 * Every property is optional in this declaration.
 */
export interface OcrQualityThresholds {
  /** Minimum total non-whitespace characters to consider text substantive. */
  readonly minTotalNonWhitespace?: number
  /** Minimum non-whitespace characters per page on average. */
  readonly minNonWhitespacePerPage?: number
  /** Minimum character count for a word to be "meaningful". */
  readonly minMeaningfulWordLen?: number
  /** Minimum count of meaningful words before text is accepted. */
  readonly minMeaningfulWords?: number
  /** Minimum alphanumeric ratio (share of non-whitespace chars that are alphanumeric). */
  readonly minAlnumRatio?: number
  /** Minimum number of Unicode replacement characters (U+FFFD) that triggers OCR fallback. */
  readonly minGarbageChars?: number
  /** Maximum fraction of short (1-2 char) words before text is considered fragmented. */
  readonly maxFragmentedWordRatio?: number
  /**
   * Critical fragmentation threshold — triggers OCR regardless of meaningful words.
   * Normal English text has ~20-30% short words. 80%+ is definitive garbage.
   */
  readonly criticalFragmentedWordRatio?: number
  /** Minimum average word length. Below this with enough words indicates garbled extraction. */
  readonly minAvgWordLength?: number
  /** Minimum word count before the average word length check (`minAvgWordLength`) applies. */
  readonly minWordsForAvgLengthCheck?: number
  /** Minimum consecutive word repetition ratio to detect column scrambling. */
  readonly minConsecutiveRepeatRatio?: number
  /** Minimum word count before the consecutive repetition check (`minConsecutiveRepeatRatio`) is applied. */
  readonly minWordsForRepeatCheck?: number
  /** Minimum character count for "substantive markdown" OCR skip gate. */
  readonly substantiveMinChars?: number
  /** Minimum character count for "non-text content" OCR skip gate. */
  readonly nonTextMinChars?: number
  /** Alphanumeric+whitespace ratio threshold for skip decisions. */
  readonly alnumWsRatioThreshold?: number
  /**
   * Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
   * If the result from a backend scores below this, try the next backend.
   */
  readonly pipelineMinQuality?: number
}

/**
 * Rotation information for an OCR element.
 *
 * The angle is always present; the confidence is optional.
 */
export interface OcrRotation {
  /** Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR). */
  readonly angleDegrees: number
  /** Confidence score for the rotation detection, when available. */
  readonly confidence?: number
}

/**
 * Table detected via OCR.
 *
 * Represents a table structure recognized during OCR processing, available
 * both as a cell grid and as rendered Markdown.
 */
export interface OcrTable {
  /** Table cells as a 2D array (rows × columns) of cell text */
  readonly cells: Array<Array<string>>
  /** Markdown representation of the table */
  readonly markdown: string
  /** Page number where the table was found (1-indexed) */
  readonly pageNumber: number
  /** Bounding box of the table in pixel coordinates (from OCR word positions), when available. */
  readonly boundingBox?: OcrTableBoundingBox
}

/**
 * Bounding box for an OCR-detected table in pixel coordinates.
 *
 * Described by its four edges; all four values are required.
 */
export interface OcrTableBoundingBox {
  /** Left x-coordinate (pixels) */
  readonly left: number
  /** Top y-coordinate (pixels) */
  readonly top: number
  /** Right x-coordinate (pixels) */
  readonly right: number
  /** Bottom y-coordinate (pixels) */
  readonly bottom: number
}

/**
 * Document orientation detection result.
 *
 * Pairs the detected orientation with the detector's confidence in it.
 */
export interface OrientationResult {
  /** Detected orientation in degrees (0, 90, 180, or 270). */
  readonly degrees: number
  /** Confidence score (0.0-1.0). */
  readonly confidence: number
}

/**
 * Output format for extraction results.
 *
 * Controls the format of the `content` field in `ExtractionResult`.
 * When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
 * `Plain` returns the raw extracted text.
 * `Json` returns a JSON tree with heading-driven sections.
 * `Structured` returns JSON with full OCR element data including bounding
 * boxes and confidence scores.
 */
export declare enum OutputFormat {
  /** Plain text content only (default) */
  Plain = "plain",
  /** Markdown format */
  Markdown = "markdown",
  /** Djot markup format */
  Djot = "djot",
  /** HTML format */
  Html = "html",
  /** JSON tree format with heading-driven sections. */
  Json = "json",
  /** Structured JSON format with full OCR element metadata. */
  Structured = "structured",
  /**
   * Custom renderer registered via the RendererRegistry.
   * The string is the renderer name (e.g., "docx", "latex").
   *
   * NOTE(review): this member has the fixed value "custom" and carries no
   * payload, so this declaration does not show how a renderer name is
   * supplied — confirm against the binding.
   */
  Custom = "custom",
}

/**
 * Supported languages in PaddleOCR.
 *
 * Maps user-friendly language codes to paddle-ocr-rs language identifiers.
 * Each member's value is the same PascalCase string as its name. Note that
 * `PaddleOcrConfig.language` is declared as a plain string (e.g. "en", "ch"),
 * not as this enum.
 */
export declare enum PaddleLanguage {
  /** English */
  English = "English",
  /** Simplified Chinese */
  Chinese = "Chinese",
  /** Japanese */
  Japanese = "Japanese",
  /** Korean */
  Korean = "Korean",
  /** German */
  German = "German",
  /** French */
  French = "French",
  /** Latin script (covers most European languages) */
  Latin = "Latin",
  /** Cyrillic (Russian and related) */
  Cyrillic = "Cyrillic",
  /** Traditional Chinese */
  TraditionalChinese = "TraditionalChinese",
  /** Thai */
  Thai = "Thai",
  /** Greek */
  Greek = "Greek",
  /** East Slavic (Russian, Ukrainian, Belarusian) */
  EastSlavic = "EastSlavic",
  /** Arabic (Arabic, Persian, Urdu) */
  Arabic = "Arabic",
  /** Devanagari (Hindi, Marathi, Sanskrit, Nepali) */
  Devanagari = "Devanagari",
  /** Tamil */
  Tamil = "Tamil",
  /** Telugu */
  Telugu = "Telugu",
}

/**
 * Configuration for PaddleOCR backend.
 *
 * Configures PaddleOCR text detection and recognition with multi-language support.
 * This declaration is a plain object of optional, read-only properties; no
 * builder methods are declared here. Documented defaults are listed per field.
 */
export interface PaddleOcrConfig {
  /** Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra") */
  readonly language?: string
  /** Optional custom cache directory for model files */
  readonly cacheDir?: string
  /**
   * Enable angle classification for rotated text (default: false).
   * Can misfire on short text regions, rotating crops incorrectly before recognition.
   */
  readonly useAngleCls?: boolean
  /** Enable table structure detection (default: false) */
  readonly enableTableDetection?: boolean
  /**
   * Database threshold for text detection (default: 0.3)
   * Range: 0.0-1.0, higher values require more confident detections
   */
  readonly detDbThresh?: number
  /**
   * Box threshold for text bounding box refinement (default: 0.5)
   * Range: 0.0-1.0
   */
  readonly detDbBoxThresh?: number
  /**
   * Unclip ratio for expanding text bounding boxes (default: 1.6)
   * Controls the expansion of detected text regions
   */
  readonly detDbUnclipRatio?: number
  /**
   * Maximum side length for detection image (default: 960)
   * Larger images may be resized to this limit for faster inference
   */
  readonly detLimitSideLen?: number
  /**
   * Batch size for recognition inference (default: 6)
   * Number of text regions to process simultaneously
   */
  readonly recBatchNum?: number
  /**
   * Padding in pixels added around the image before detection (default: 10).
   * Large values can include surrounding content like table gridlines.
   */
  readonly padding?: number
  /**
   * Minimum recognition confidence score for text lines (default: 0.5).
   * Text regions with recognition confidence below this threshold are discarded.
   * Matches PaddleOCR Python's `drop_score` parameter.
   * Range: 0.0-1.0
   */
  readonly dropScore?: number
  /**
   * Model tier controlling detection/recognition model size and accuracy trade-off.
   * Declared as a free-form string; the documented values are:
   * - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
   * - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
   */
  readonly modelTier?: string
}

/**
 * Byte offset boundary for a page.
 *
 * Tracks where a specific page's content starts and ends in the main content string,
 * enabling mapping from byte positions to page numbers. The range is half-open:
 * `byteStart` is inclusive and `byteEnd` is exclusive. Offsets are guaranteed to be
 * at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
 *
 * NOTE(review): as UTF-8 byte offsets these presumably do not line up with
 * JavaScript string indices for non-ASCII text — confirm before slicing
 * `content` with them directly.
 */
export interface PageBoundary {
  /** Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive) */
  readonly byteStart: number
  /** Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive) */
  readonly byteEnd: number
  /** Page number (1-indexed) */
  readonly pageNumber: number
}

/**
 * Page extraction and tracking configuration.
 *
 * Controls how pages are extracted, tracked, and represented in the extraction results.
 * When no page configuration is supplied (`None`), page tracking is disabled.
 * Every option is optional.
 *
 * Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
 * when page boundaries are available and chunking is configured.
 */
export interface PageConfig {
  /** Extract pages as separate array (ExtractionResult.pages) */
  readonly extractPages?: boolean
  /** Insert page markers in main content string */
  readonly insertPageMarkers?: boolean
  /**
   * Page marker format; use the `{page_num}` placeholder for the page number.
   * Default: "\n\n<!-- PAGE {page_num} -->\n\n"
   */
  readonly markerFormat?: string
}

/**
 * Content for a single page/slide/sheet.
 *
 * When page extraction is enabled, documents are split into per-page content
 * with associated tables and images mapped to each page. `pageNumber`,
 * `content`, `tables` and `imageIndices` are always present; the remaining
 * fields are optional and format-dependent.
 *
 * # Performance
 *
 * Uses Arc-wrapped tables and images for memory efficiency:
 * - `Vec<Arc<Table>>` enables zero-copy sharing of table data
 * - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
 * - Maintains exact JSON compatibility via custom Serialize/Deserialize
 *
 * This reduces memory overhead for documents with shared tables/images
 * by avoiding redundant copies during serialization. In this declaration
 * `tables` is a plain `Array<Table>` and images are referenced by index.
 */
export interface PageContent {
  /** Page number (1-indexed) */
  readonly pageNumber: number
  /** Text content for this page */
  readonly content: string
  /**
   * Tables found on this page (uses Arc for memory efficiency)
   *
   * Serializes as Vec<Table> for JSON compatibility while maintaining
   * Arc semantics in-memory for zero-copy sharing.
   */
  readonly tables: Array<Table>
  /**
   * Indices into `ExtractionResult.images` for images found on this page.
   *
   * Each value is a zero-based index into the top-level `images` collection.
   * Only populated when `extract_images = true` in the extraction config.
   */
  readonly imageIndices: Array<number>
  /**
   * Hierarchy information for the page (when hierarchy extraction is enabled)
   *
   * Contains text hierarchy levels (H1-H6) extracted from the page content.
   */
  readonly hierarchy?: PageHierarchy
  /**
   * Whether this page is blank (no meaningful text content)
   *
   * Determined during extraction based on text content analysis.
   * A page is blank if it has fewer than 3 non-whitespace characters
   * and contains no tables or images.
   */
  readonly isBlank?: boolean
  /**
   * Layout detection regions for this page (when layout detection is enabled).
   *
   * Contains detected layout regions with class, confidence, bounding box,
   * and area fraction. Only populated when layout detection is configured.
   */
  readonly layoutRegions?: Array<LayoutRegion>
  /**
   * Speaker notes for this slide (PPTX only).
   *
   * Contains the text from the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
   * Only populated when the source is a PPTX file and notes are present.
   */
  readonly speakerNotes?: string
  /**
   * Section name this slide belongs to (PPTX only).
   *
   * PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
   * `ppt/presentation.xml`). Only populated when the source is a PPTX file and
   * the slide belongs to a named section.
   */
  readonly sectionName?: string
  /**
   * Sheet name for this page (XLSX/ODS only).
   *
   * Each spreadsheet sheet maps to one `PageContent` entry. This field carries the
   * sheet's display name as it appears in the workbook. Omitted (`None`) for all
   * non-spreadsheet formats and for sheets with an empty name.
   */
  readonly sheetName?: string
}

/**
 * Page hierarchy structure containing heading levels and block information.
 *
 * Used when PDF text hierarchy extraction is enabled. Contains hierarchical
 * blocks with heading levels (H1-H6) for semantic document structure.
 * Both properties are required.
 */
export interface PageHierarchy {
  /** Number of hierarchy blocks on this page */
  readonly blockCount: number
  /** Hierarchical blocks with heading levels; see `HierarchicalBlock` */
  readonly blocks: Array<HierarchicalBlock>
}

/**
 * Metadata for individual page/slide/sheet.
 *
 * Captures per-page information including dimensions, content counts,
 * and visibility state (for presentations). Only `number` is required;
 * every other property may be absent.
 */
export interface PageInfo {
  /** Page number (1-indexed) */
  readonly number: number
  /** Page title (usually for presentations) */
  readonly title?: string
  /** Dimensions in points (PDF) or pixels (images): (width, height) */
  readonly dimensions?: Array<number>
  /** Number of images on this page */
  readonly imageCount?: number
  /** Number of tables on this page */
  readonly tableCount?: number
  /** Whether this page is hidden (e.g., in presentations) */
  readonly hidden?: boolean
  /**
   * Whether this page is blank (no meaningful text, no images, no tables)
   *
   * A page is considered blank if it has fewer than 3 non-whitespace characters
   * and contains no tables or images. This is useful for filtering out empty pages
   * in scanned documents or PDFs with blank separator pages.
   */
  readonly isBlank?: boolean
  /**
   * Whether this page contains non-trivial vector graphics (paths, shapes, curves)
   *
   * Indicates the presence of vector-drawn content such as charts, diagrams,
   * or geometric shapes (e.g., from Adobe InDesign, LaTeX TikZ). These are
   * invisible to `ExtractionResult.images` since they are not embedded as raster
   * XObjects. Set to `true` when path count exceeds a heuristic threshold,
   * signaling that downstream consumers may want to rasterize the page to
   * capture this content.
   *
   * Only populated for PDFs; omitted (`None`) for other document types, which
   * is why the property is optional like the other nullable fields here.
   */
  readonly hasVectorGraphics?: boolean
}

/**
 * Unified page structure for documents.
 *
 * Supports different page types (PDF pages, PPTX slides, Excel sheets)
 * with per-page offset boundaries for chunk-to-page mapping.
 */
export interface PageStructure {
  /** Total number of pages/slides/sheets */
  readonly totalCount: number
  /** Type of paginated unit */
  readonly unitType: PageUnitType
  /**
   * Offset boundaries for each page
   *
   * Maps ranges in the extracted content to page numbers.
   * Used for chunk page range calculation. Each `PageBoundary` entry carries
   * `byteStart`/`byteEnd`, i.e. byte offsets into the content string, plus
   * the page number.
   */
  readonly boundaries?: Array<PageBoundary>
  /** Detailed per-page metadata (optional, only when needed) */
  readonly pages?: Array<PageInfo>
}

/**
 * Type of paginated unit in a document.
 *
 * Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
 * Each member maps to a lowercase string value.
 */
export declare enum PageUnitType {
  /** Standard document pages (PDF, DOCX, images) */
  Page = "page",
  /** Presentation slides (PPTX, ODP) */
  Slide = "slide",
  /** Spreadsheet sheets (XLSX, ODS) */
  Sheet = "sheet",
}

/**
 * A PDF annotation extracted from a document page.
 *
 * The annotation type and page number are always present; the text content
 * and bounding box are optional.
 */
export interface PdfAnnotation {
  /** The type of annotation; see `PdfAnnotationType`. */
  readonly annotationType: PdfAnnotationType
  /** Text content of the annotation (e.g., comment text, link URL). */
  readonly content?: string
  /** Page number where the annotation appears (1-indexed). */
  readonly pageNumber: number
  /** Bounding box of the annotation on the page. */
  readonly boundingBox?: BoundingBox
}

/**
 * Type of PDF annotation.
 *
 * Member values are lowercase strings; note that `StrikeOut` maps to the
 * snake_case value "strike_out".
 */
export declare enum PdfAnnotationType {
  /** Sticky note / text annotation */
  Text = "text",
  /** Highlighted text region */
  Highlight = "highlight",
  /** Hyperlink annotation */
  Link = "link",
  /** Rubber stamp annotation */
  Stamp = "stamp",
  /** Underline text markup */
  Underline = "underline",
  /** Strikeout text markup */
  StrikeOut = "strike_out",
  /** Any other annotation type */
  Other = "other",
}

/**
 * PDF-specific configuration.
 *
 * Every option is optional; documented defaults are listed per field.
 */
export interface PdfConfig {
  /** Extract images from PDF */
  readonly extractImages?: boolean
  /**
   * Extract tables from PDF.
   *
   * When `true` (default), runs pdf_oxide's native grid detector and, if it
   * finds nothing, falls back to the heuristic text-layer reconstruction in
   * `pdf::oxide::table::extract_tables_heuristic`. Set to `false` to skip
   * both passes — `tables` will then be empty in the result.
   */
  readonly extractTables?: boolean
  /** List of passwords to try when opening encrypted PDFs */
  readonly passwords?: Array<string>
  /** Extract PDF metadata */
  readonly extractMetadata?: boolean
  /** Hierarchy extraction configuration (when omitted, hierarchy extraction is disabled) */
  readonly hierarchy?: HierarchyConfig
  /**
   * Extract PDF annotations (text notes, highlights, links, stamps).
   * Default: false
   */
  readonly extractAnnotations?: boolean
  /**
   * Top margin fraction (0.0–1.0) of page height to exclude headers/running heads.
   * Default: 0.06 (6%)
   */
  readonly topMarginFraction?: number
  /**
   * Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
   * Default: 0.05 (5%)
   */
  readonly bottomMarginFraction?: number
  /**
   * Allow single-column pseudo tables in extraction results.
   *
   * By default, tables with fewer than 2 columns (layout-guided) or 3 columns
   * (heuristic) are rejected. When `true`, the minimum column count is relaxed
   * to 1, allowing single-column structured data (glossaries, itemized lists)
   * to be emitted as tables. Other quality filters (density, sparsity, prose
   * detection) still apply.
   */
  readonly allowSingleColumnTables?: boolean
  /**
   * Perform OCR on inline images extracted from PDF pages and attach the
   * recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
   * to be available; if `ExtractionConfig.ocr` is `None` the extractor
   * falls back to `TesseractConfig::default()`. Per-image failures degrade
   * gracefully (the image is returned without OCR text rather than failing
   * the whole extraction). Default: `false`.
   */
  readonly ocrInlineImages?: boolean
}

/**
 * PDF-specific metadata.
 *
 * Contains metadata fields specific to PDF documents that are not in the common
 * `Metadata` structure. Common fields like title, authors, keywords, and dates
 * are at the `Metadata` level. Every property is optional.
 */
export interface PdfMetadata {
  /** PDF version (e.g., "1.7", "2.0") */
  readonly pdfVersion?: string
  /** PDF producer (application that created the PDF) */
  readonly producer?: string
  /** Whether the PDF is encrypted/password-protected */
  readonly isEncrypted?: boolean
  /** First page width in points (1/72 inch) */
  readonly width?: number
  /** First page height in points (1/72 inch) */
  readonly height?: number
  /** Total number of pages in the PDF document */
  readonly pageCount?: number
}

/**
 * Base trait that all plugins must implement.
 *
 * This trait provides common functionality for plugin lifecycle management,
 * identification, and metadata. In this declaration only `name()` is required;
 * `version`, `initialize`, `shutdown`, `description` and `author` are optional
 * methods.
 *
 * # Thread Safety
 *
 * All plugins must be `Send + Sync` to support concurrent usage across threads.
 */
export interface Plugin {
  /**
   * Returns the unique name/identifier for this plugin.
   *
   * The name should be:
   * - Unique across all plugins
   * - Lowercase with hyphens (e.g., "my-custom-plugin")
   * - URL-safe characters only
   */
  name(): string
  /**
   * Returns the semantic version of this plugin.
   *
   * Should follow semver format: `MAJOR.MINOR.PATCH`
   */
  version?(): string
  /**
   * Initialize the plugin.
   *
   * Called once when the plugin is registered. Use this to:
   * - Load configuration
   * - Initialize resources (connections, caches, etc.)
   * - Validate dependencies
   *
   * # Thread Safety
   *
   * This method takes `&self` instead of `&mut self` to work with `Arc<dyn Plugin>`.
   * Plugins needing mutable state during initialization should use interior mutability
   * patterns (Mutex, RwLock, OnceCell, etc.).
   * @throws Should return an error if initialization fails. The plugin will not be
   * registered if this method returns an error.
   */
  initialize?(): void
  /**
   * Shutdown the plugin.
   *
   * Called when the plugin is being unregistered or the application is shutting down.
   * Use this to:
   * - Close connections
   * - Flush caches
   * - Release resources
   *
   * # Thread Safety
   *
   * This method takes `&self` instead of `&mut self` to work with `Arc<dyn Plugin>`.
   * Plugins needing mutable state during shutdown should use interior mutability
   * patterns (Mutex, RwLock, etc.).
   * @throws Errors during shutdown are logged but don't prevent the shutdown process.
   */
  shutdown?(): void
  /**
   * Optional plugin description for debugging and logging.
   *
   * Defaults to empty string if not overridden.
   */
  description?(): string
  /**
   * Optional plugin author information.
   *
   * Defaults to empty string if not overridden.
   */
  author?(): string
}

/**
 * Trait for post-processor plugins.
 *
 * Post-processors transform or enrich extraction results after the initial
 * extraction is complete. They can:
 * - Clean and normalize text
 * - Add metadata (language, keywords, entities)
 * - Split content into chunks
 * - Score quality
 * - Apply custom transformations
 *
 * In this declaration `name()`, `process()` and `processingStage()` are
 * required; `shouldProcess`, `estimatedDurationMs` and `priority` are optional.
 *
 * # Processing Order
 *
 * Post-processors are executed in stage order:
 * 1. **Early** - Language detection, entity extraction
 * 2. **Middle** - Keyword extraction, token reduction
 * 3. **Late** - Custom hooks, final validation
 *
 * Within each stage, processors are executed in registration order.
 *
 * # Error Handling
 *
 * Post-processor errors are non-fatal by default - they're captured in metadata
 * and execution continues. To make errors fatal, return an error from `process()`.
 *
 * # Thread Safety
 *
 * Post-processors must be thread-safe (`Send + Sync`).
 */
export interface PostProcessor {
  /**
   * Returns the name of this post-processor.
   *
   * NOTE(review): presumably follows the same naming rules as `Plugin.name()`
   * (unique, lowercase with hyphens) — confirm.
   */
  name(): string
  /**
   * Process an extraction result.
   *
   * Transform or enrich the extraction result. Can modify:
   * - `content` - The extracted text
   * - `metadata` - Add or update metadata fields
   * - `tables` - Modify or enhance table data
   *
   * Both parameters are declared optional and nullable here, so an
   * implementation must tolerate `undefined` and `null` for either.
   * @param result - Mutable reference to the extraction result to process
   *
   * @param config - Extraction configuration
   *
   * @returns `Ok(())` if processing succeeded, `Err(...)` for fatal failures.
   * In this binding the method returns `Promise<void>`.
   * NOTE(review): presumably a rejected promise maps to the fatal `Err(...)`
   * case — confirm.
   *
   * @throws Return errors for fatal processing failures. Non-fatal errors should be
   * captured in metadata directly on the result.
   *
   * # Performance
   *
   * This signature avoids unnecessary cloning of large extraction results by
   * taking a mutable reference instead of ownership. Processors modify the
   * result in place.
   *
   * # Example - Language Detection
   *
   * ```rust
   * async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
   *     -> Result<()> {
   *     // Detect language (simplified - use real detection library in practice)
   *     let language = "en"; // Placeholder detection
   *
   *     // Add to metadata
   *     result.metadata.additional.insert("detected_language".to_string().into(), serde_json::json!(language));
   *
   *     Ok(())
   * }
   * ```
   *
   * # Example - Text Cleaning
   *
   * ```rust
   * async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
   *     -> Result<()> {
   *     // Remove excessive whitespace
   *     result.content = result
   *         .content
   *         .split_whitespace()
   *         .collect::<Vec<_>>()
   *         .join(" ");
   *
   *     Ok(())
   * }
   * ```
   */
  process(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): Promise<void>
  /**
   * Get the processing stage for this post-processor.
   *
   * Determines when this processor runs in the pipeline.
   * @returns The `ProcessingStage` (Early, Middle, or Late), declared here as a string.
   * NOTE(review): the exact accepted spellings are not visible in this
   * declaration — confirm against the binding.
   */
  processingStage(): string
  /**
   * Optional: Check if this processor should run for a given result.
   *
   * Allows conditional processing based on MIME type, metadata, or content.
   * Defaults to `true` (always run).
   * @param result - The extraction result to check
   *
   * @param config - Extraction configuration
   *
   * @returns `true` if the processor should run, `false` to skip.
   * NOTE(review): the declared return type is `string` although the documented
   * result is a boolean — confirm the expected encoding against the native bridge.
   */
  shouldProcess?(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): string
  /**
   * Optional: Estimate processing time in milliseconds.
   *
   * Used for logging and debugging. Defaults to 0 (unknown).
   * @param result - The extraction result to estimate for
   *
   * @returns Estimated processing time in milliseconds.
   * NOTE(review): the declared return type is `string` although the documented
   * result is a number — confirm the expected encoding against the native bridge.
   */
  estimatedDurationMs?(result?: ExtractionResult | undefined | null): string
  /**
   * Execution priority within the processing stage.
   *
   * Higher values run first within the same `ProcessingStage`. Defaults to 50.
   * Use 0-49 for fallback processors, 50 for normal processors, and 51-255
   * for high-priority processors that should run early in their stage.
   *
   * NOTE(review): the declared return type is `string` although the documented
   * range is numeric (0-255) — confirm the expected encoding against the native bridge.
   */
  priority?(): string
}

/**
 * Post-processor configuration.
 *
 * Every option is optional.
 */
export interface PostProcessorConfig {
  /** Enable post-processors */
  readonly enabled?: boolean
  /** Whitelist of processor names to run (omitted = all enabled) */
  readonly enabledProcessors?: Array<string>
  /** Blacklist of processor names to skip (omitted = none disabled) */
  readonly disabledProcessors?: Array<string>
  /**
   * Pre-computed AHashSet for O(1) enabled processor lookup
   *
   * NOTE(review): looks like a derived lookup cache for `enabledProcessors`,
   * exposed here as a plain array — presumably callers need not set it; confirm.
   */
  readonly enabledSet?: Array<string>
  /**
   * Pre-computed AHashSet for O(1) disabled processor lookup
   *
   * NOTE(review): looks like a derived lookup cache for `disabledProcessors`,
   * exposed here as a plain array — presumably callers need not set it; confirm.
   */
  readonly disabledSet?: Array<string>
}

/**
 * Application properties from docProps/app.xml for PPTX
 *
 * Contains PowerPoint-specific document metadata. Every property is optional.
 */
export interface PptxAppProperties {
  /** Application name (e.g., "Microsoft Office PowerPoint") */
  readonly application?: string
  /** Application version */
  readonly appVersion?: string
  /** Total editing time in minutes */
  readonly totalTime?: number
  /** Company name */
  readonly company?: string
  /** Document security level */
  readonly docSecurity?: number
  /** Scale crop flag */
  readonly scaleCrop?: boolean
  /** Links up to date flag */
  readonly linksUpToDate?: boolean
  /** Shared document flag */
  readonly sharedDoc?: boolean
  /** Hyperlinks changed flag */
  readonly hyperlinksChanged?: boolean
  /** Number of slides */
  readonly slides?: number
  /** Number of notes */
  readonly notes?: number
  /** Number of hidden slides */
  readonly hiddenSlides?: number
  /** Number of multimedia clips */
  readonly multimediaClips?: number
  /** Presentation format (e.g., "Widescreen", "Standard") */
  readonly presentationFormat?: string
  /** Slide titles, one string per entry */
  readonly slideTitles?: Array<string>
}

/**
 * PowerPoint (PPTX) extraction result.
 *
 * Contains extracted slide content, metadata, and embedded images/tables.
 */
export interface PptxExtractionResult {
  /** Extracted text content from all slides */
  readonly content: string
  /** Presentation metadata */
  readonly metadata: PptxMetadata
  /** Total number of slides */
  readonly slideCount: number
  /** Total number of embedded images */
  readonly imageCount: number
  /** Total number of tables */
  readonly tableCount: number
  /** Extracted images from the presentation */
  readonly images: Array<ExtractedImage>
  /** Slide structure with boundaries (when page tracking is enabled) */
  readonly pageStructure?: PageStructure
  /** Per-slide content (when page tracking is enabled) */
  readonly pageContents?: Array<PageContent>
  /** Structured document representation */
  readonly document?: DocumentStructure
  /**
   * Hyperlinks discovered in slides as (url, optional_label) pairs.
   *
   * NOTE(review): the declared type is a flat `Array<string>`, which cannot
   * express (url, label) pairs — confirm how the pairs are actually encoded
   * on the JavaScript side.
   */
  readonly hyperlinks: Array<string>
  /**
   * Office metadata extracted from docProps/core.xml and docProps/app.xml.
   *
   * Contains keys like "title", "author", "created_by", "subject", "keywords",
   * "modified_by", "created_at", "modified_at", etc.
   */
  readonly officeMetadata: Record<string, string>
  /**
   * Slide comments as revisions.
   *
   * Each `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
   * `DocumentRevision` of kind `Comment` with author (resolved from
   * `ppt/commentAuthors.xml`), ISO-8601 timestamp, and a slide anchor
   * carrying the slide index. Omitted when no comment XML parts exist.
   */
  readonly revisions?: Array<DocumentRevision>
}

/**
 * PowerPoint presentation metadata.
 *
 * Extracted from PPTX files containing slide counts and presentation details.
 * Every field is optional.
 */
export interface PptxMetadata {
  /** Total number of slides in the presentation */
  readonly slideCount?: number
  /** Names of slides (if available) */
  readonly slideNames?: Array<string>
  /** Number of embedded images */
  readonly imageCount?: number
  /** Number of tables */
  readonly tableCount?: number
}

/**
 * Processing stages for post-processors.
 *
 * Post-processors are executed in stage order (Early → Middle → Late).
 * Use stages to control the order of post-processing operations.
 * Each member's value is the string spelling of its own name.
 */
export declare enum ProcessingStage {
  /**
   * Early stage - foundational processing.
   *
   * Use for:
   * - Language detection
   * - Character encoding normalization
   * - Entity extraction (NER)
   * - Text quality scoring
   */
  Early = "Early",
  /**
   * Middle stage - content transformation.
   *
   * Use for:
   * - Keyword extraction
   * - Token reduction
   * - Text summarization
   * - Semantic analysis
   */
  Middle = "Middle",
  /**
   * Late stage - final enrichment.
   *
   * Use for:
   * - Custom user hooks
   * - Analytics/logging
   * - Final validation
   * - Output formatting
   */
  Late = "Late",
}

/**
 * A non-fatal warning from a processing pipeline stage.
 *
 * Captures errors from optional features that don't prevent extraction
 * but may indicate degraded results. Both fields are always present.
 */
export interface ProcessingWarning {
  /**
   * The pipeline stage or feature that produced this warning
   * (e.g., "embedding", "chunking", "language_detection", "output_format").
   */
  readonly source: string
  /** Human-readable description of what went wrong. */
  readonly message: string
}

/**
 * Page Segmentation Mode for Tesseract OCR.
 *
 * Members are string-valued; each value equals the member name.
 * NOTE(review): the members look like Tesseract's `--psm` modes in ascending
 * order, but the numeric mapping is not visible in this declaration —
 * confirm against the Tesseract documentation before converting to numbers.
 */
export declare enum PSMMode {
  OsdOnly = "OsdOnly",
  AutoOsd = "AutoOsd",
  AutoOnly = "AutoOnly",
  Auto = "Auto",
  SingleColumn = "SingleColumn",
  SingleBlockVertical = "SingleBlockVertical",
  SingleBlock = "SingleBlock",
  SingleLine = "SingleLine",
  SingleWord = "SingleWord",
  CircleWord = "CircleWord",
  SingleChar = "SingleChar",
}

/** Outlook PST archive metadata. */
export interface PstMetadata {
  /** Message count (optional). NOTE(review): presumably the number of messages in the archive — confirm. */
  readonly messageCount?: number
}

/**
 * RAKE-specific parameters.
 *
 * Both fields are optional; the documented defaults apply when omitted.
 */
export interface RakeParams {
  /** Minimum word length to consider (default: 1). */
  readonly minWordLength?: number
  /** Maximum words in a keyword phrase (default: 3). */
  readonly maxWordsPerPhrase?: number
}

/**
 * Pre-computed table markdown for a table detection region.
 *
 * Produced by the TATR-based table structure recognizer and surfaced as part of
 * layout-aware OCR results. On the Rust side the struct lives under
 * `layout-types` (pure Rust) so that consumers who do not enable
 * `layout-detection` (ORT) can still reference the type in their own code.
 */
export interface RecognizedTable {
  /** Detection bbox that this table corresponds to (for matching). */
  readonly detectionBbox: BBox
  /** Table cells as a 2D array (rows × columns). */
  readonly cells: Array<Array<string>>
  /** Rendered markdown table. */
  readonly markdown: string
}

/**
 * Reduction level selector, used as the type of `TokenReductionConfig.level`.
 *
 * Members are string-valued; each value equals the member name.
 * NOTE(review): the names suggest increasing reduction strength from `Off`
 * to `Maximum` — the exact behaviour of each level is not documented here.
 */
export declare enum ReductionLevel {
  Off = "Off",
  Light = "Light",
  Moderate = "Moderate",
  Aggressive = "Aggressive",
  Maximum = "Maximum",
}

/**
 * Semantic kind of a relationship between document elements.
 *
 * Member values are snake_case strings.
 */
export declare enum RelationshipKind {
  /** Footnote marker -> footnote definition. */
  FootnoteReference = "footnote_reference",
  /** Citation marker -> bibliography entry. */
  CitationReference = "citation_reference",
  /** Internal anchor link (`#id`) -> target heading/element. */
  InternalLink = "internal_link",
  /** Caption paragraph -> figure/table it describes. */
  Caption = "caption",
  /** Label -> labeled element (HTML `<label for>`, LaTeX `\label{}`). */
  Label = "label",
  /** TOC entry -> target section. */
  TocEntry = "toc_entry",
  /** Cross-reference (LaTeX `\ref{}`, DOCX cross-reference field). */
  CrossReference = "cross_reference",
}

/**
 * Contract for document renderers that convert an `InternalDocument` to an output string.
 *
 * Renderers are typically stateless converters that transform the internal
 * document representation into a specific output format (Markdown, HTML,
 * Djot, plain text, etc.). They participate in the standard plugin
 * lifecycle so custom renderers can be registered from any supported binding
 * language.
 *
 * The format name is exposed via `name()`. On the Rust side, the remaining
 * plugin lifecycle methods (`version`, `initialize`, `shutdown`) take no-op
 * defaults; they are not part of this interface.
 *
 * # Thread Safety
 *
 * Rust renderers must be `Send + Sync` (inherited from the `Plugin` trait).
 */
export interface Renderer {
  /** Name of this renderer; per the interface description this is the output format name. */
  name(): string
  /**
   * Render an `InternalDocument` to the output format.
   * @param doc - The internal document to render
   *
   * @returns The rendered output as a string.
   *
   * @throws If rendering fails.
   */
  render(doc: InternalDocument): string
}

/**
 * Result-shape selection for extraction results.
 *
 * Distinct from `OutputFormat` (which controls rendering — Plain, Markdown,
 * HTML, etc.). `ResultFormat` controls the *shape* of the result: a unified content
 * blob vs. an element-based decomposition. Member values are snake_case strings.
 */
export declare enum ResultFormat {
  /** Unified format with all content in `content` field */
  Unified = "unified",
  /** Element-based format with semantic element extraction */
  ElementBased = "element_based",
}

/**
 * Best-effort document location for a revision.
 *
 * A union discriminated by the `type` field: `'paragraph'`, `'table_cell'`,
 * `'page'`, `'slide'` or `'sheet'`. Narrow on `type` before reading the
 * variant-specific fields (`row`/`col`/`tableIndex` for table cells, `name`
 * for sheets).
 */
export type RevisionAnchor =
  | { type: 'paragraph'; index: number }
  | { type: 'table_cell'; row: number; col: number; tableIndex: number }
  | { type: 'page'; index: number }
  | { type: 'slide'; index: number }
  | { type: 'sheet'; index: number; name: string }

/**
 * The content changes that make up a single revision.
 *
 * For insertions and deletions the `content` field carries the added/removed
 * lines as added / removed `DiffLine` entries. For format changes, `content`
 * is empty — the property diff is left as a TODO for a later enrichment pass.
 * Both fields are optional.
 */
export interface RevisionDelta {
  /** Line-level content changes for this revision. */
  readonly content?: Array<DiffLine>
  /** Cell-level table changes for this revision. */
  readonly tableChanges?: Array<CellChange>
}

/**
 * Semantic classification of a tracked change.
 *
 * Member values are snake_case strings.
 */
export declare enum RevisionKind {
  /** Text or content was inserted. */
  Insertion = "insertion",
  /** Text or content was deleted. */
  Deletion = "deletion",
  /** Run-level formatting (font, size, colour, …) was changed. */
  FormatChange = "format_change",
  /** A reviewer comment or annotation. */
  Comment = "comment",
}

/**
 * Configuration for security limits across extractors.
 *
 * All limits are intentionally conservative to prevent DoS attacks
 * while still supporting legitimate documents. Every field is optional;
 * the figures in parentheses are the limits quoted by the upstream docs.
 */
export interface SecurityLimits {
  /** Maximum uncompressed size for archives (500 MB) */
  readonly maxArchiveSize?: number
  /** Maximum compression ratio before flagging as potential bomb (100:1) */
  readonly maxCompressionRatio?: number
  /** Maximum number of files in archive (10,000) */
  readonly maxFilesInArchive?: number
  /** Maximum nesting depth for structures (100) */
  readonly maxNestingDepth?: number
  /**
   * Maximum length of any single XML entity / attribute / token (1 MiB).
   * This is a per-token cap, NOT a total cap — billion-laughs class
   * attacks where a single entity expands to hundreds of MB are caught
   * here, while normal long text content (a paragraph, a CDATA block) is
   * caught by `maxContentSize` instead.
   */
  readonly maxEntityLength?: number
  /** Maximum string growth per document (100 MB) */
  readonly maxContentSize?: number
  /** Maximum iterations per operation */
  readonly maxIterations?: number
  /** Maximum XML depth (100 levels) */
  readonly maxXmlDepth?: number
  /** Maximum cells per table (100,000) */
  readonly maxTableCells?: number
}

/**
 * API server configuration.
 *
 * Holds all configuration options for the Kreuzberg API server,
 * including host/port settings, CORS configuration, and upload limits.
 *
 * # Defaults
 *
 * - `host`: "127.0.0.1" (localhost only)
 * - `port`: 8000
 * - `corsOrigins`: empty array (allows all origins)
 * - `maxRequestBodyBytes`: 104_857_600 (100 MB)
 * - `maxMultipartFieldBytes`: 104_857_600 (100 MB)
 */
export interface ServerConfig {
  /** Server host address (e.g., "127.0.0.1", "0.0.0.0") */
  readonly host?: string
  /** Server port number */
  readonly port?: number
  /**
   * CORS allowed origins. An empty array means allow all origins.
   *
   * If this is an empty array, the server will accept requests from any origin.
   * If populated with specific origins (e.g., `"https://example.com"`), only
   * those origins will be allowed.
   */
  readonly corsOrigins?: Array<string>
  /** Maximum size of request body in bytes (default: 100 MB) */
  readonly maxRequestBodyBytes?: number
  /** Maximum size of multipart fields in bytes (default: 100 MB) */
  readonly maxMultipartFieldBytes?: number
}

/**
 * Structured data (Schema.org, microdata, RDFa) block.
 *
 * `dataType` and `rawJson` are always present; `schemaType` is optional.
 */
export interface StructuredData {
  /** Type of structured data */
  readonly dataType: StructuredDataType
  /** Raw JSON string representation */
  readonly rawJson: string
  /** Schema type if detectable (e.g., "Article", "Event", "Product") */
  readonly schemaType?: string
}

/**
 * Result record with text content, a format label, string metadata and a list of text fields.
 *
 * NOTE(review): this type carries no upstream documentation; the field notes
 * below restate the declared types and hedge everything else — confirm
 * against the Rust source.
 */
export interface StructuredDataResult {
  /** Content as a string. */
  readonly content: string
  /** Format label. NOTE(review): presumably the source data format name — confirm the allowed values. */
  readonly format: string
  /** String-to-string metadata map. */
  readonly metadata: Record<string, string>
  /** List of strings. NOTE(review): presumably the names or values of text-bearing fields — confirm. */
  readonly textFields: Array<string>
}

/**
 * Structured data type classification.
 *
 * Member values are lowercase strings (note the hyphen in `"json-ld"`).
 */
export declare enum StructuredDataType {
  /** JSON-LD structured data */
  JsonLd = "json-ld",
  /** Microdata */
  Microdata = "microdata",
  /** RDFa */
  RDFa = "rdfa",
}

/**
 * Configuration for LLM-based structured data extraction.
 *
 * Sends extracted document content to a VLM with a JSON schema,
 * returning structured data that conforms to the schema.
 * @example
 * Equivalent TOML configuration:
 * ```toml
 * [structured_extraction]
 * schema_name = "invoice_data"
 * strict = true
 *
 * [structured_extraction.schema]
 * type = "object"
 * properties.vendor = { type = "string" }
 * properties.total = { type = "number" }
 * required = ["vendor", "total"]
 *
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * ```
 */
export interface StructuredExtractionConfig {
  /** JSON Schema defining the desired output structure. */
  readonly schema: JsonValue
  /** Schema name passed to the LLM's structured output mode. */
  readonly schemaName: string
  /** Optional schema description for the LLM. */
  readonly schemaDescription?: string
  /** Enable strict mode — output must exactly match the schema. */
  readonly strict: boolean
  /**
   * Custom Jinja2 extraction prompt template. When omitted, a default template is used.
   *
   * Available template variables:
   * - `{{ content }}` — The extracted document text.
   * - `{{ schema }}` — The JSON schema as a formatted string.
   * - `{{ schema_name }}` — The schema name.
   * - `{{ schema_description }}` — The schema description (may be empty).
   */
  readonly prompt?: string
  /** LLM configuration for the extraction. */
  readonly llm: LlmConfig
}

/**
 * A supported document format entry.
 *
 * Pairs a file extension with the corresponding MIME type that Kreuzberg can process.
 */
export interface SupportedFormat {
  /** File extension (without leading dot), e.g., "pdf", "docx" */
  readonly extension: string
  /** MIME type string, e.g., "application/pdf" */
  readonly mimeType: string
}

/**
 * Extracted table structure.
 *
 * Represents a table detected and extracted from a document (PDF, image, etc.).
 * Tables are converted to both structured cell data and Markdown format.
 * Every field is optional in this declaration.
 */
export interface Table {
  /** Table cells as a 2D array (rows × columns) */
  readonly cells?: Array<Array<string>>
  /** Markdown representation of the table */
  readonly markdown?: string
  /** Page number where the table was found (1-indexed) */
  readonly pageNumber?: number
  /**
   * Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
   * Only populated for PDF-extracted tables when position data is available.
   */
  readonly boundingBox?: BoundingBox
}

/**
 * Individual table cell with content and optional styling.
 *
 * Future extension point for rich table support with cell-level metadata.
 * Every field is optional.
 */
export interface TableCell {
  /** Cell content as text */
  readonly content?: string
  /** Row span (number of rows this cell spans) */
  readonly rowSpan?: number
  /** Column span (number of columns this cell spans) */
  readonly colSpan?: number
  /** Whether this is a header cell */
  readonly isHeader?: boolean
}

/**
 * Cell-level changes for a pair of tables that share the same index.
 *
 * `a` and `b` below refer to the two documents being compared.
 */
export interface TableDiff {
  /** Zero-based index of the table in both `a.tables` and `b.tables`. */
  readonly fromIndex: number
  /** Zero-based index in `b.tables` (equal to `fromIndex` for same-dimension tables). */
  readonly toIndex: number
  /** Cell-level changes within the table. */
  readonly cellChanges: Array<CellChange>
}

/**
 * Structured table grid with cell-level metadata.
 *
 * Stores row/column dimensions and a flat list of cells with position info.
 * Every field is optional in this declaration.
 */
export interface TableGrid {
  /** Number of rows in the table. */
  readonly rows?: number
  /** Number of columns in the table. */
  readonly cols?: number
  /** All cells in row-major order. */
  readonly cells?: Array<GridCell>
}

/**
 * Which table structure recognition model to use.
 *
 * Controls the model used for table cell detection within layout-detected
 * table regions. Wire format is snake_case in all serializers (JSON, TOML,
 * YAML), matching the string values of the members below.
 */
export declare enum TableModel {
  /** TATR (Table Transformer) -- default, 30MB, DETR-based row/column detection. */
  Tatr = "tatr",
  /** SLANeXT wired variant -- 365MB, optimized for bordered tables. */
  SlanetWired = "slanet_wired",
  /** SLANeXT wireless variant -- 365MB, optimized for borderless tables. */
  SlanetWireless = "slanet_wireless",
  /** SLANet-plus -- 7.78MB, lightweight general-purpose. */
  SlanetPlus = "slanet_plus",
  /**
   * Classifier-routed SLANeXT: auto-select wired/wireless per table.
   * Uses PP-LCNet classifier (6.78MB) + both SLANeXT variants (730MB total).
   */
  SlanetAuto = "slanet_auto",
  /** Disable table structure model inference entirely; use heuristic path only. */
  Disabled = "disabled",
}

/**
 * Tesseract OCR configuration.
 *
 * Provides fine-grained control over Tesseract OCR engine parameters.
 * Most users can use the defaults, but these settings allow optimization
 * for specific document types (invoices, handwriting, etc.).
 * Every field is optional.
 */
export interface TesseractConfig {
  /** Language code (e.g., "eng", "deu", "fra") */
  readonly language?: string
  /**
   * Page Segmentation Mode (0-13), given as a number.
   *
   * Common values:
   * - 3: Fully automatic page segmentation (native default)
   * - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
   * - 11: Sparse text with no particular order
   */
  readonly psm?: number
  /** Output format ("text" or "markdown") */
  readonly outputFormat?: string
  /**
   * OCR Engine Mode (0-3).
   *
   * - 0: Legacy engine only
   * - 1: Neural nets (LSTM) only (usually best)
   * - 2: Legacy + LSTM
   * - 3: Default (based on what's available)
   */
  readonly oem?: number
  /**
   * Minimum confidence threshold (0.0-100.0).
   *
   * Words with confidence below this threshold may be rejected or flagged.
   */
  readonly minConfidence?: number
  /**
   * Image preprocessing configuration.
   *
   * Controls how images are preprocessed before OCR. Can significantly
   * improve quality for scanned documents or low-quality images.
   */
  readonly preprocessing?: ImagePreprocessingConfig
  /** Enable automatic table detection and reconstruction */
  readonly enableTableDetection?: boolean
  /** Minimum confidence threshold for table detection (0.0-1.0) */
  readonly tableMinConfidence?: number
  /** Column threshold for table detection (pixels) */
  readonly tableColumnThreshold?: number
  /** Row threshold ratio for table detection (0.0-1.0) */
  readonly tableRowThresholdRatio?: number
  /** Enable OCR result caching */
  readonly useCache?: boolean
  /** Use pre-adapted templates for character classification */
  readonly classifyUsePreAdaptedTemplates?: boolean
  /** Enable N-gram language model */
  readonly languageModelNgramOn?: boolean
  /** Don't reject good words during block-level processing */
  readonly tesseditDontBlkrejGoodWds?: boolean
  /** Don't reject good words during row-level processing */
  readonly tesseditDontRowrejGoodWds?: boolean
  /** Enable dictionary correction */
  readonly tesseditEnableDictCorrection?: boolean
  /** Whitelist of allowed characters (empty = all allowed) */
  readonly tesseditCharWhitelist?: string
  /** Blacklist of forbidden characters (empty = none forbidden) */
  readonly tesseditCharBlacklist?: string
  /** Use primary language params model */
  readonly tesseditUsePrimaryParamsModel?: boolean
  /** Variable-width space detection */
  readonly textordSpaceSizeIsVariable?: boolean
  /** Use adaptive thresholding method (a boolean toggle, despite the "method" name) */
  readonly thresholdingMethod?: boolean
}

/**
 * Inline text annotation — byte-range based formatting and links.
 *
 * Annotations reference byte offsets into the node's text content,
 * enabling precise identification of formatted regions. The range is
 * half-open: `start` is inclusive, `end` is exclusive.
 */
export interface TextAnnotation {
  /** Start byte offset in the node's text content (inclusive). */
  readonly start: number
  /** End byte offset in the node's text content (exclusive). */
  readonly end: number
  /** Annotation type. */
  readonly kind: AnnotationKind
}

/**
 * Text direction enumeration for HTML documents.
 *
 * Member values are the strings `"ltr"`, `"rtl"` and `"auto"`.
 */
export declare enum TextDirection {
  /** Left-to-right text direction */
  LeftToRight = "ltr",
  /** Right-to-left text direction */
  RightToLeft = "rtl",
  /** Automatic text direction detection */
  Auto = "auto",
}

/**
 * Plain text and Markdown extraction result.
 *
 * Contains the extracted text along with statistics and,
 * for Markdown files, structural elements like headers and links.
 * The Markdown-only fields are optional.
 */
export interface TextExtractionResult {
  /** Extracted text content */
  readonly content: string
  /** Number of lines */
  readonly lineCount: number
  /** Number of words */
  readonly wordCount: number
  /** Number of characters */
  readonly characterCount: number
  /** Markdown headers (text only, Markdown files only) */
  readonly headers?: Array<string>
  /** Markdown links as (text, URL) tuples, each tuple typed here as a string array (Markdown files only) */
  readonly links?: Array<Array<string>>
  /** Code blocks as (language, code) tuples, each tuple typed here as a string array (Markdown files only) */
  readonly codeBlocks?: Array<Array<string>>
}

/**
 * Text/Markdown metadata.
 *
 * Extracted from plain text and Markdown files. Includes word counts and,
 * for Markdown, structural elements like headers and links.
 * Every field is optional.
 */
export interface TextMetadata {
  /** Number of lines in the document */
  readonly lineCount?: number
  /** Number of words */
  readonly wordCount?: number
  /** Number of characters */
  readonly characterCount?: number
  /** Markdown headers (headings text only, for Markdown files) */
  readonly headers?: Array<string>
  /** Markdown links as (text, url) tuples, each tuple typed here as a string array (for Markdown files) */
  readonly links?: Array<Array<string>>
  /** Code blocks as (language, code) tuples, each tuple typed here as a string array (for Markdown files) */
  readonly codeBlocks?: Array<Array<string>>
}

/**
 * Token reduction settings. Every field is optional.
 *
 * NOTE(review): this type carries no upstream documentation; the notes below
 * restate the declared types and hedge anything inferred from field names —
 * confirm semantics and defaults against the Rust source.
 */
export interface TokenReductionConfig {
  /** Reduction level, one of the `ReductionLevel` members. */
  readonly level?: ReductionLevel
  /** Language hint as a string. NOTE(review): expected format (e.g. language code) is not documented — confirm. */
  readonly languageHint?: string
  /** Boolean toggle. NOTE(review): presumably keeps Markdown syntax intact — confirm. */
  readonly preserveMarkdown?: boolean
  /** Boolean toggle. NOTE(review): presumably keeps code content intact — confirm. */
  readonly preserveCode?: boolean
  /** Numeric threshold. NOTE(review): range and meaning are not documented — confirm. */
  readonly semanticThreshold?: number
  /** Boolean toggle. NOTE(review): presumably enables parallel processing — confirm. */
  readonly enableParallel?: boolean
  /** Boolean toggle. NOTE(review): presumably enables SIMD code paths — confirm. */
  readonly useSimd?: boolean
  /** Map from string key to a list of stopwords. NOTE(review): keys are presumably language identifiers — confirm. */
  readonly customStopwords?: Record<string, Array<string>>
  /** List of pattern strings. NOTE(review): pattern syntax (regex vs. literal) is not documented — confirm. */
  readonly preservePatterns?: Array<string>
  /** Numeric reduction target. NOTE(review): unit (ratio vs. percent) is not documented — confirm. */
  readonly targetReduction?: number
  /** Boolean toggle. NOTE(review): presumably enables semantic clustering — confirm. */
  readonly enableSemanticClustering?: boolean
}

/**
 * Token reduction configuration.
 *
 * Both fields are optional. `mode` is a free-form string here, not an enum.
 */
export interface TokenReductionOptions {
  /** Reduction mode: "off", "light", "moderate", "aggressive", "maximum" */
  readonly mode?: string
  /** Preserve important words (capitalized, technical terms) */
  readonly preserveImportantWords?: boolean
}

/**
 * Configuration for tree-sitter language pack integration.
 *
 * Controls grammar download behavior and code analysis options.
 * Every field is optional.
 *
 * @example
 * Equivalent TOML configuration:
 * ```toml
 * [tree_sitter]
 * languages = ["python", "rust"]
 * groups = ["web"]
 *
 * [tree_sitter.process]
 * structure = true
 * comments = true
 * docstrings = true
 * ```
 */
export interface TreeSitterConfig {
  /**
   * Enable code intelligence processing (default: true).
   *
   * When `false`, tree-sitter analysis is completely skipped even if
   * the config section is present.
   */
  readonly enabled?: boolean
  /**
   * Custom cache directory for downloaded grammars.
   *
   * When omitted, uses the default: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
   */
  readonly cacheDir?: string
  /** Languages to pre-download on init (e.g., `["python", "rust"]`). */
  readonly languages?: Array<string>
  /** Language groups to pre-download (e.g., `["web", "systems", "scripting"]`). */
  readonly groups?: Array<string>
  /** Processing options for code analysis. */
  readonly process?: TreeSitterProcessConfig
}

/**
 * Processing options for tree-sitter code analysis.
 *
 * Controls which analysis features are enabled when extracting code files.
 * Every field is optional; the documented defaults apply when omitted.
 */
export interface TreeSitterProcessConfig {
  /** Extract structural items (functions, classes, structs, etc.). Default: true. */
  readonly structure?: boolean
  /** Extract import statements. Default: true. */
  readonly imports?: boolean
  /** Extract export statements. Default: true. */
  readonly exports?: boolean
  /** Extract comments. Default: false. */
  readonly comments?: boolean
  /** Extract docstrings. Default: false. */
  readonly docstrings?: boolean
  /** Extract symbol definitions. Default: false. */
  readonly symbols?: boolean
  /** Include parse diagnostics. Default: false. */
  readonly diagnostics?: boolean
  /** Maximum chunk size in bytes. Omitting it disables chunking. */
  readonly chunkMaxSize?: number
  /** Content rendering mode for code extraction. */
  readonly contentMode?: CodeContentMode
}

/**
 * Semantic classification of an extracted URI.
 *
 * Member values are lowercase strings.
 */
export declare enum UriKind {
  /** A clickable hyperlink (web URL, file link). */
  Hyperlink = "hyperlink",
  /** An image or media resource reference. */
  Image = "image",
  /** An internal anchor or cross-reference target. */
  Anchor = "anchor",
  /** A citation or bibliographic reference (DOI, academic ref). */
  Citation = "citation",
  /** A general reference (e.g. `\ref{}` in LaTeX, `:ref:` in RST). */
  Reference = "reference",
  /** An email address (`mailto:` link or bare email). */
  Email = "email",
}

/**
 * Contract for validator plugins.
 *
 * Validators check extraction results for quality, completeness, or correctness.
 * Unlike post-processors, validator errors **fail fast** - if a validator returns
 * an error, the extraction fails immediately.
 *
 * # Use Cases
 *
 * - **Quality Gates**: Ensure extracted content meets minimum quality standards
 * - **Compliance**: Verify content meets regulatory requirements
 * - **Content Filtering**: Reject documents containing unwanted content
 * - **Format Validation**: Verify extracted content structure
 * - **Security Checks**: Scan for malicious content
 *
 * # Error Handling
 *
 * Validator errors are **fatal** - they cause the extraction to fail and bubble up
 * to the caller. Use validators for hard requirements that must be met.
 *
 * For non-fatal checks, use post-processors instead.
 *
 * # Thread Safety
 *
 * Rust validators must be thread-safe (`Send + Sync`).
 */
export interface Validator {
  /** Name of this validator. */
  name(): string
  /**
   * Validate an extraction result.
   *
   * Check the extraction result and complete normally if it is valid, or
   * signal an error if validation fails. The code samples below show the
   * Rust trait this interface is generated from (`Ok(())` = valid,
   * `Err(...)` = invalid).
   * @param result - The extraction result to validate
   *
   * @param config - Extraction configuration
   *
   * @returns A promise that settles once validation has finished.
   * NOTE(review): presumably resolving means "valid" and rejecting means
   * "invalid" — confirm how the native bridge maps rejections.
   *
   * @throws `KreuzbergError::Validation` (Rust side) when validation failed,
   * or any other error type appropriate for the failure; the extraction then fails.
   *
   * @example Content length validation (Rust trait implementation)
   * ```rust
   * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
   *     -> Result<()> {
   *     let length = result.content.len();
   *
   *     if length < self.min {
   *         return Err(KreuzbergError::validation(format!(
   *             "Content too short: {} < {} characters",
   *             length, self.min
   *         )));
   *     }
   *
   *     if length > self.max {
   *         return Err(KreuzbergError::validation(format!(
   *             "Content too long: {} > {} characters",
   *             length, self.max
   *         )));
   *     }
   *
   *     Ok(())
   * }
   * ```
   *
   * @example Quality score validation (Rust trait implementation)
   * ```rust
   * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
   *     -> Result<()> {
   *     // Check if quality_score exists in metadata
   *     let score = result.metadata
   *         .additional
   *         .get("quality_score")
   *         .and_then(|v| v.as_f64())
   *         .unwrap_or(0.0);
   *
   *     if score < self.min_score {
   *         return Err(KreuzbergError::validation(format!(
   *             "Quality score too low: {} < {}",
   *             score, self.min_score
   *         )));
   *     }
   *
   *     Ok(())
   * }
   * ```
   *
   * @example Security validation (Rust trait implementation)
   * ```rust
   * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
   *     -> Result<()> {
   *     // Check for blocked patterns
   *     for pattern in &self.blocked_patterns {
   *         if result.content.contains(pattern) {
   *             return Err(KreuzbergError::validation(format!(
   *                 "Content contains blocked pattern: {}",
   *                 pattern
   *             )));
   *         }
   *     }
   *
   *     Ok(())
   * }
   * ```
   */
  validate(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): Promise<void>
  /**
   * Optional: Check if this validator should run for a given result.
   *
   * Allows conditional validation based on MIME type, metadata, or content.
   * Defaults to `true` (always run).
   * @param result - The extraction result to check
   *
   * @param config - Extraction configuration
   *
   * @returns `true` if the validator should run, `false` to skip.
   * NOTE(review): the declared return type is `string` although the
   * documented contract is a boolean — this looks like a generator
   * artifact; confirm what the native bridge expects and fix it in the
   * generator rather than by hand.
   */
  shouldValidate?(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): string
  /**
   * Optional: Get the validation priority.
   *
   * Higher priority validators run first. Useful for ordering validation checks
   * (e.g., run cheap validations before expensive ones).
   *
   * Default priority is 50.
   * @returns Priority value (higher = runs earlier).
   * NOTE(review): the declared return type is `string` although the
   * documented contract is a numeric priority — this looks like a generator
   * artifact; confirm what the native bridge expects and fix it in the
   * generator rather than by hand.
   */
  priority?(): string
}

/**
 * Application properties from docProps/app.xml for XLSX
 *
 * Contains Excel-specific document metadata. Every field is optional.
 */
export interface XlsxAppProperties {
  /** Application name (e.g., "Microsoft Excel") */
  readonly application?: string
  /** Application version */
  readonly appVersion?: string
  /** Document security level */
  readonly docSecurity?: number
  /** Scale crop flag */
  readonly scaleCrop?: boolean
  /** Links up to date flag */
  readonly linksUpToDate?: boolean
  /** Shared document flag */
  readonly sharedDoc?: boolean
  /** Hyperlinks changed flag */
  readonly hyperlinksChanged?: boolean
  /** Company name */
  readonly company?: string
  /** Worksheet names */
  readonly worksheetNames?: Array<string>
}

/**
 * XML extraction result.
 *
 * Contains extracted text content from XML files along with
 * structural statistics about the XML document. All fields are always present.
 */
export interface XmlExtractionResult {
  /** Extracted text content (XML structure filtered out) */
  readonly content: string
  /** Total number of XML elements processed */
  readonly elementCount: number
  /** List of unique element names found (sorted) */
  readonly uniqueElements: Array<string>
}

/**
 * XML metadata extracted during XML parsing.
 *
 * Provides statistics about XML document structure. Both fields are optional.
 */
export interface XmlMetadata {
  /** Total number of XML elements processed */
  readonly elementCount?: number
  /** List of unique element tag names (sorted) */
  readonly uniqueElements?: Array<string>
}

/**
 * YAKE-specific parameters.
 *
 * The single field is optional; the documented default applies when omitted.
 */
export interface YakeParams {
  /**
   * Window size for co-occurrence analysis (default: 2).
   *
   * Controls the context window for computing co-occurrence statistics.
   */
  readonly windowSize?: number
}

/**
 * Year range for bibliographic metadata.
 *
 * `min` and `max` are optional; `years` is always present.
 */
export interface YearRange {
  /** Lower bound (optional). NOTE(review): presumably the earliest year in `years` — confirm. */
  readonly min?: number
  /** Upper bound (optional). NOTE(review): presumably the latest year in `years` — confirm. */
  readonly max?: number
  /** List of years. NOTE(review): ordering and uniqueness are not documented — confirm. */
  readonly years: Array<number>
}

/**
 * List names of all registered document extractors.
 * @returns An array of document extractor names.
 */
export declare function listDocumentExtractors(): Array<string>;

/**
 * List the names of all registered embedding backends.
 *
 * Used by `kreuzberg-cli`, the API/MCP endpoints, and generated language
 * bindings.
 * @returns An array of embedding backend names.
 */
export declare function listEmbeddingBackends(): Array<string>;

/**
 * List the names of all available embedding presets.
 *
 * The underlying Rust function returns owned `String`s so the values are safe
 * to pass across FFI boundaries; JavaScript callers simply receive an array of strings.
 * @returns An array of embedding preset names.
 */
export declare function listEmbeddingPresets(): Array<string>;

/**
 * List all registered OCR backends.
 *
 * Returns the names of all OCR backends currently registered in the global registry.
 * @returns An array of OCR backend names.
 */
export declare function listOcrBackends(): Array<string>;

/**
 * List all registered post-processor names.
 *
 * Returns the names of all post-processors currently registered in the
 * global registry.
 * @returns An array of post-processor names.
 * @throws If the registry lock is poisoned (the underlying Rust `Err(...)` case;
 * NOTE(review): confirm it surfaces as a thrown JavaScript error).
 */
export declare function listPostProcessors(): Array<string>;

/**
 * List names of all registered renderers.
 * @returns An array of renderer names.
 * @throws If the registry lock is poisoned.
 */
export declare function listRenderers(): Array<string>;

/**
 * List names of all registered validators.
 * @returns An array of validator names.
 */
export declare function listValidators(): Array<string>;

/**
 * Register a document extractor implementation.
 *
 * NOTE(review): presumably the extractor is then reported by `listDocumentExtractors()`
 * and can be removed with `unregisterDocumentExtractor(name)` — confirm against the native implementation.
 * @param impl - Object implementing the `DocumentExtractor` interface.
 */
export declare function registerDocumentExtractor(impl: DocumentExtractor): void;

/**
 * Register an embedding backend implementation.
 *
 * NOTE(review): presumably the backend is then reported by `listEmbeddingBackends()`
 * and can be removed with `unregisterEmbeddingBackend(name)` — confirm against the native implementation.
 * @param impl - Object implementing the `EmbeddingBackend` interface.
 */
export declare function registerEmbeddingBackend(impl: EmbeddingBackend): void;

/**
 * Register an OCR backend implementation.
 *
 * NOTE(review): presumably the backend is then reported by `listOcrBackends()`
 * and can be removed with `unregisterOcrBackend(name)` — confirm against the native implementation.
 * @param impl - Object implementing the `OcrBackend` interface.
 */
export declare function registerOcrBackend(impl: OcrBackend): void;

/**
 * Register a post-processor implementation.
 *
 * NOTE(review): presumably the post-processor is then reported by `listPostProcessors()`
 * and can be removed with `unregisterPostProcessor(name)` — confirm against the native implementation.
 * @param impl - Object implementing the `PostProcessor` interface.
 */
export declare function registerPostProcessor(impl: PostProcessor): void;

/**
 * Register a renderer implementation.
 *
 * NOTE(review): presumably the renderer is then reported by `listRenderers()`
 * and can be removed with `unregisterRenderer(name)` — confirm against the native implementation.
 * @param impl - Object implementing the `Renderer` interface.
 */
export declare function registerRenderer(impl: Renderer): void;

/**
 * Register a validator implementation.
 *
 * NOTE(review): presumably the validator is then reported by `listValidators()`
 * and can be removed with `unregisterValidator(name)` — confirm against the native implementation.
 * @param impl - Object implementing the `Validator` interface.
 */
export declare function registerValidator(impl: Validator): void;

/**
 * Render a single PDF page to PNG bytes.
 *
 * Returns raw PNG-encoded bytes for the specified page at the given DPI.
 * Uses pdf_oxide with tiny-skia for pure-Rust rendering.
 * @param pdfBytes - Raw PDF file bytes
 *
 * @param pageIndex - Zero-based page index
 *
 * @param dpi - Resolution in dots per inch (default: 150)
 *
 * @param password - Optional password for encrypted PDFs
 *
 * @returns The PNG-encoded image bytes of the rendered page.
 *
 * @throws `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
 * or rendered, or if `pageIndex` is out of range.
 */
export declare function renderPdfPageToPng(pdfBytes: Uint8Array, pageIndex: number, dpi?: number | undefined | null, password?: string | undefined | null): Uint8Array;

/**
 * Unregister a document extractor by name.
 *
 * NOTE(review): behavior for an unknown name (no-op vs. throw) is not visible
 * from this declaration — confirm against the native implementation.
 * @param name - Name of the document extractor to remove; presumably one of the
 * names returned by `listDocumentExtractors()`.
 */
export declare function unregisterDocumentExtractor(name: string): void;

/**
 * Unregister an embedding backend by name.
 *
 * NOTE(review): behavior for an unknown name (no-op vs. throw) is not visible
 * from this declaration — confirm against the native implementation.
 * @param name - Name of the embedding backend to remove; presumably one of the
 * names returned by `listEmbeddingBackends()`.
 */
export declare function unregisterEmbeddingBackend(name: string): void;

/**
 * Unregister an OCR backend by name.
 *
 * NOTE(review): behavior for an unknown name (no-op vs. throw) is not visible
 * from this declaration — confirm against the native implementation.
 * @param name - Name of the OCR backend to remove; presumably one of the
 * names returned by `listOcrBackends()`.
 */
export declare function unregisterOcrBackend(name: string): void;

/**
 * Unregister a post-processor by name.
 *
 * NOTE(review): behavior for an unknown name (no-op vs. throw) is not visible
 * from this declaration — confirm against the native implementation.
 * @param name - Name of the post-processor to remove; presumably one of the
 * names returned by `listPostProcessors()`.
 */
export declare function unregisterPostProcessor(name: string): void;

/**
 * Unregister a renderer by name.
 *
 * NOTE(review): behavior for an unknown name (no-op vs. throw) is not visible
 * from this declaration — confirm against the native implementation.
 * @param name - Name of the renderer to remove; presumably one of the
 * names returned by `listRenderers()`.
 */
export declare function unregisterRenderer(name: string): void;

/**
 * Unregister a validator by name.
 *
 * NOTE(review): behavior for an unknown name (no-op vs. throw) is not visible
 * from this declaration — confirm against the native implementation.
 * @param name - Name of the validator to remove; presumably one of the
 * names returned by `listValidators()`.
 */
export declare function unregisterValidator(name: string): void;