Files
fil/crates/kreuzberg-node/index.d.ts
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

5489 lines
189 KiB
TypeScript
Generated
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
/* eslint-disable */
/**
* Any JSON-representable value: a primitive (`string`, `number`, `boolean`, `null`),
* an array of `JsonValue`, or an object whose string keys map to `JsonValue`.
*
* The type is recursive, so arbitrarily nested arrays and objects are accepted.
*/
export type JsonValue = string | number | boolean | null | JsonValue[] | { [key: string]: JsonValue };
/**
* Extract content from multiple byte arrays concurrently.
*
* This function processes multiple byte arrays in parallel, automatically managing
* concurrency to prevent resource exhaustion. The concurrency limit can be
* configured via `ExtractionConfig::max_concurrent_extractions` (Rust-side name)
* or defaults to `(num_cpus * 1.5).ceil()`.
*
* Each item can optionally specify a `FileExtractionConfig` that overrides specific
* fields from the batch-level `config`. Leave an item's `config` unset to use
* the batch-level defaults for that item.
*
* @param items - Array of `BatchBytesItem` objects, each containing content bytes, a MIME type, and optional per-item configuration overrides.
* @param config - Batch-level extraction configuration. May be omitted, `undefined`, or `null`.
* @returns A promise resolving to an array of `ExtractionResult` in the same order as the input items.
*/
export declare function batchExtractBytes(items: Array<BatchBytesItem>, config?: ExtractionConfig | undefined | null): Promise<Array<ExtractionResult>>;
/**
* Synchronous counterpart of `batchExtractBytes` (Rust: `batch_extract_bytes`).
*
* Uses the global Tokio runtime for optimal performance.
* With the `tokio-runtime` feature, this blocks the current thread using the global
* Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
* that iterates through items and calls `extract_bytes_sync()`.
*
* @param items - Array of `BatchBytesItem` objects to extract.
* @param config - Batch-level extraction configuration. May be omitted, `undefined`, or `null`.
* @returns An array of `ExtractionResult`, returned directly rather than through a promise.
*/
export declare function batchExtractBytesSync(items: Array<BatchBytesItem>, config?: ExtractionConfig | undefined | null): Array<ExtractionResult>;
/**
* Extract content from multiple files concurrently.
*
* This function processes multiple files in parallel, automatically managing
* concurrency to prevent resource exhaustion. The concurrency limit can be
* configured via `ExtractionConfig::max_concurrent_extractions` (Rust-side name)
* or defaults to `(num_cpus * 1.5).ceil()`.
*
* Each file can optionally specify a `FileExtractionConfig` that overrides specific
* fields from the batch-level `config`. Leave a file's `config` unset to use the batch defaults.
* Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
* taken from the batch-level `config`.
*
* @param items - Array of `BatchFileItem` objects, each containing a path and optional per-file configuration overrides.
* @param config - Batch-level extraction configuration (provides defaults and batch settings). May be omitted, `undefined`, or `null`.
* @returns A promise resolving to an array of `ExtractionResult` in the same order as the input items.
* @throws Individual file errors are captured in the result metadata. System errors
* (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
*/
export declare function batchExtractFiles(items: Array<BatchFileItem>, config?: ExtractionConfig | undefined | null): Promise<Array<ExtractionResult>>;
/**
* Synchronous counterpart of `batchExtractFiles` (Rust: `batch_extract_files`).
*
* Uses the global Tokio runtime for optimal performance.
* Only available with `tokio-runtime` (WASM has no filesystem).
*
* @param items - Array of `BatchFileItem` objects to extract.
* @param config - Batch-level extraction configuration. May be omitted, `undefined`, or `null`.
* @returns An array of `ExtractionResult`, returned directly rather than through a promise.
*/
export declare function batchExtractFilesSync(items: Array<BatchFileItem>, config?: ExtractionConfig | undefined | null): Array<ExtractionResult>;
/**
* Clear all document extractors from the global registry.
*
* Calls `shutdown()` on every registered extractor, then empties the registry.
* Takes no arguments and returns nothing.
*
* @throws Any error returned by an extractor's `shutdown()` method. The first error
* encountered stops processing of remaining extractors.
*/
export declare function clearDocumentExtractors(): void;
/**
* Clear all embedding backends from the global registry.
*
* Calls `shutdown()` on every registered backend, then empties the registry.
* Takes no arguments and returns nothing.
*
* @throws Any error returned by a backend's `shutdown()` method. The first error
* encountered stops processing of remaining backends.
*/
export declare function clearEmbeddingBackends(): void;
/**
* Clear all OCR backends from the global registry.
*
* Removes all OCR backends and calls their `shutdown()` methods.
*
* The underlying Rust function returns `Ok(())` when all backends were cleared
* successfully and `Err(...)` if any shutdown method failed. This declaration
* itself returns `void`.
* NOTE(review): the `Err(...)` case is presumably surfaced as a thrown error, as
* documented for the other `clear*` functions — confirm against the binding.
*/
export declare function clearOcrBackends(): void;
/**
* Remove all registered post-processors.
*
* Takes no arguments and returns nothing.
*/
export declare function clearPostProcessors(): void;
/**
* Clear all renderers from the global registry.
*
* Removes every renderer, including the built-in defaults (markdown, html,
* djot, plain). After calling this no renderers are registered; re-register
* as needed.
*
* @throws If the registry lock is poisoned.
*/
export declare function clearRenderers(): void;
/**
* Remove all registered validators.
*
* Takes no arguments and returns nothing.
*/
export declare function clearValidators(): void;
/**
* Compare two extraction results and return a structured diff.
*
* The comparison is purely structural — no I/O, no side effects. All fields
* of `ExtractionDiff` are populated according to the provided `DiffOptions`.
*
* @param a - The "before" extraction result.
* @param b - The "after" extraction result.
* @param opts - Controls which sections are compared and optional truncation.
* @returns The structured `ExtractionDiff` between `a` and `b`.
*
* NOTE(review): all three parameters are declared optional and nullable; how a
* missing `a` or `b` is handled is not visible from this declaration — confirm.
*/
export declare function compare(a?: ExtractionResult | undefined | null, b?: ExtractionResult | undefined | null, opts?: DiffOptions | undefined | null): ExtractionDiff;
/**
* Detect the MIME type of a file at the given path.
*
* Uses the file extension and optionally the file content to determine the MIME type.
*
* @param path - Path of the file whose MIME type should be detected.
* @param checkExists - Set to `true` to verify the file exists before detection. This parameter is required.
* @returns The detected MIME type string.
*/
export declare function detectMimeType(path: string, checkExists: boolean): string;
/**
* Detect MIME type from raw file bytes.
*
* Uses magic byte signatures to detect file type from content.
* Falls back to the `infer` crate for comprehensive detection.
*
* For ZIP-based files, inspects contents to distinguish Office Open XML
* formats (DOCX, XLSX, PPTX) from plain ZIP archives.
*
* @param content - Raw file bytes.
* @returns The detected MIME type string.
* @throws `KreuzbergError::UnsupportedFormat` if the MIME type cannot be determined.
*/
export declare function detectMimeTypeFromBytes(content: Uint8Array): string;
/**
* Embed a list of texts using the configured embedding model.
*
* @param texts - Strings to embed.
* @param config - Embedding configuration. May be omitted, `undefined`, or `null`.
* @returns A 2D array where each inner array is the embedding for the corresponding text.
*/
export declare function embedTexts(texts: Array<string>, config?: EmbeddingConfig | undefined | null): Array<Array<number>>;
/**
* Generate embeddings asynchronously for a list of text strings.
*
* This is the async counterpart to `embedTexts`. It offloads the blocking
* ONNX inference work to a dedicated blocking thread pool via Tokio's
* `spawn_blocking`, keeping the async executor free.
*
* @param texts - Array of strings to embed (owned, sent to blocking thread).
* @param config - Embedding configuration specifying model, batch size, and normalization. May be omitted, `undefined`, or `null`.
* @returns A promise resolving to one embedding vector per input text, in the same order.
* @throws `KreuzbergError::MissingDependency` if ONNX Runtime is not installed.
* @throws `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
* or the blocking inference task panics.
*/
export declare function embedTextsAsync(texts: Array<string>, config?: EmbeddingConfig | undefined | null): Promise<Array<Array<number>>>;
/**
* Extract content from a byte array.
*
* This is the main entry point for in-memory extraction. It performs the following steps:
* 1. Validate MIME type
* 2. Handle legacy format conversion if needed
* 3. Select appropriate extractor from registry
* 4. Extract content
* 5. Run post-processing pipeline
*
* @param content - The byte array to extract.
* @param mimeType - MIME type of the content.
* @param config - Extraction configuration. May be omitted, `undefined`, or `null`.
* @returns A promise resolving to an `ExtractionResult` containing the extracted content and metadata.
* @throws `KreuzbergError::Validation` if the MIME type is invalid.
* @throws `KreuzbergError::UnsupportedFormat` if the MIME type is not supported.
*/
export declare function extractBytes(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): Promise<ExtractionResult>;
/**
* Synchronous counterpart of `extractBytes` (Rust: `extract_bytes`).
*
* Uses the global Tokio runtime for 100x+ performance improvement over creating
* a new runtime per call.
*
* With the `tokio-runtime` feature, this blocks the current thread using the global
* Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
*
* @param content - The byte array to extract.
* @param mimeType - MIME type of the content.
* @param config - Extraction configuration. May be omitted, `undefined`, or `null`.
* @returns The `ExtractionResult`, returned directly rather than through a promise.
*/
export declare function extractBytesSync(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): ExtractionResult;
/**
* Extract content from a file.
*
* This is the main entry point for file-based extraction. It performs the following steps:
* 1. Check cache for existing result (if caching enabled)
* 2. Detect or validate MIME type
* 3. Select appropriate extractor from registry
* 4. Extract content
* 5. Run post-processing pipeline
* 6. Store result in cache (if caching enabled)
*
* @param path - Path to the file to extract.
* @param mimeType - Optional MIME type override. If omitted, `undefined`, or `null`, it will be auto-detected.
* @param config - Extraction configuration. May be omitted, `undefined`, or `null`.
* @returns A promise resolving to an `ExtractionResult` containing the extracted content and metadata.
* @throws `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
* @throws `KreuzbergError::UnsupportedFormat` if the MIME type is not supported.
*/
export declare function extractFile(path: string, mimeType?: string | undefined | null, config?: ExtractionConfig | undefined | null): Promise<ExtractionResult>;
/**
* Synchronous counterpart of `extractFile` (Rust: `extract_file`).
*
* This is a convenience function that blocks the current thread until extraction completes.
* For async code, use `extractFile` directly.
*
* Uses the global Tokio runtime for 100x+ performance improvement over creating
* a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
*
* This function is only available with the `tokio-runtime` feature. For WASM targets,
* use a truly synchronous extraction approach instead.
*
* @param path - Path to the file to extract.
* @param mimeType - Optional MIME type override. May be omitted, `undefined`, or `null`.
* @param config - Extraction configuration. May be omitted, `undefined`, or `null`.
* @returns The `ExtractionResult`, returned directly rather than through a promise.
*/
export declare function extractFileSync(path: string, mimeType?: string | undefined | null, config?: ExtractionConfig | undefined | null): ExtractionResult;
/**
* Get an embedding preset by name.
*
* The returned value is an owned clone, so it is safe to pass across FFI boundaries.
*
* @param name - Name of the preset to look up.
* @returns The matching `EmbeddingPreset`, or `null` if no preset with the given name exists.
*/
export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;
/**
* Get file extensions for a given MIME type.
*
* Returns all known file extensions that map to the specified MIME type.
*
* @param mimeType - The MIME type to look up.
* @returns An array of file extensions (without leading dot) for the MIME type.
*/
export declare function getExtensionsForMime(mimeType: string): Array<string>;
/**
* Hardware acceleration configuration for ONNX Runtime models.
*
* Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
* for inference in layout detection and embedding generation.
*
* Both fields are optional and read-only.
*/
export interface AccelerationConfig {
/** Execution provider to use for ONNX inference. */
readonly provider?: ExecutionProviderType
/** GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto. */
readonly deviceId?: number
}
/**
* Types of inline text annotations.
*
* A discriminated union tagged by the `annotation_type` property. Most variants
* carry only the tag; the exceptions are:
* - `'link'` — also carries `url` and `title` (both required strings here).
* - `'color'` and `'font_size'` — also carry a string `value`.
* - `'custom'` — also carries a string `name` and a string `value`.
*
* Narrow on `annotation_type` before reading variant-specific properties.
*/
export type AnnotationKind =
| { annotation_type: 'bold' }
| { annotation_type: 'italic' }
| { annotation_type: 'underline' }
| { annotation_type: 'strikethrough' }
| { annotation_type: 'code' }
| { annotation_type: 'subscript' }
| { annotation_type: 'superscript' }
| { annotation_type: 'link'; url: string; title: string }
| { annotation_type: 'highlight' }
| { annotation_type: 'color'; value: string }
| { annotation_type: 'font_size'; value: string }
| { annotation_type: 'custom'; name: string; value: string }
/**
* A single file extracted from an archive.
*
* When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
* enabled, each processable file produces its own full `ExtractionResult`.
*
* All three fields are required and read-only.
*/
export interface ArchiveEntry {
/** Archive-relative file path (e.g. "folder/document.pdf"). */
readonly path: string
/** Detected MIME type of the file. */
readonly mimeType: string
/** Full extraction result for this file. */
readonly result: ExtractionResult
}
/**
* Archive (ZIP/TAR/7Z) metadata.
*
* Extracted from compressed archive files containing file lists and size information.
* Every field is optional and read-only.
*/
export interface ArchiveMetadata {
/** Archive format ("ZIP", "TAR", "7Z", etc.). */
readonly format?: string
/** Total number of files in the archive. */
readonly fileCount?: number
/** List of file paths within the archive. */
readonly fileList?: Array<string>
/** Total uncompressed size in bytes. */
readonly totalSize?: number
/** Compressed size in bytes (if available). */
readonly compressedSize?: number
}
/**
* Batch item for byte array extraction.
*
* Used with `batchExtractBytes` and `batchExtractBytesSync`
* to represent a single item in a batch extraction job.
*/
export interface BatchBytesItem {
/** The content bytes to extract from. */
readonly content: Uint8Array
/** MIME type of the content (e.g., "application/pdf", "text/html"). */
readonly mimeType: string
/** Per-item configuration overrides. Leave unset to use the batch-level defaults. */
readonly config?: FileExtractionConfig
}
/**
* Batch item for file extraction.
*
* Used with `batchExtractFiles` and `batchExtractFilesSync`
* to represent a single file in a batch extraction job.
*/
export interface BatchFileItem {
/** Path to the file to extract from. */
readonly path: string
/** Per-file configuration overrides. Leave unset to use the batch-level defaults. */
readonly config?: FileExtractionConfig
}
/**
* Bounding box in original image coordinates: (x1, y1) is the top-left corner,
* (x2, y2) is the bottom-right corner.
*
* All four fields are required and read-only.
*/
export interface BBox {
/** X-coordinate of the top-left corner. */
readonly x1: number
/** Y-coordinate of the top-left corner. */
readonly y1: number
/** X-coordinate of the bottom-right corner. */
readonly x2: number
/** Y-coordinate of the bottom-right corner. */
readonly y2: number
}
/**
* BibTeX bibliography metadata.
*
* Every field is optional and read-only. Only `entryCount` carries an upstream
* description; the remaining fields are undocumented by the generator.
* NOTE(review): judging by names and types alone, `citationKeys` and `authors`
* are string lists, `yearRange` is a `YearRange`, and `entryTypes` maps a string
* key to a number (presumably entry type to count) — confirm against the Rust source.
*/
export interface BibtexMetadata {
/** Number of entries in the bibliography. */
readonly entryCount?: number
readonly citationKeys?: Array<string>
readonly authors?: Array<string>
readonly yearRange?: YearRange
readonly entryTypes?: Record<string, number>
}
/**
* Types of block-level elements in Djot.
*
* A string enum: each member's value is the snake_case form of its name
* (for example `CodeBlock` is `"code_block"`).
*/
export declare enum BlockType {
Paragraph = "paragraph",
Heading = "heading",
Blockquote = "blockquote",
CodeBlock = "code_block",
ListItem = "list_item",
OrderedList = "ordered_list",
BulletList = "bullet_list",
TaskList = "task_list",
DefinitionList = "definition_list",
DefinitionTerm = "definition_term",
DefinitionDescription = "definition_description",
Div = "div",
Section = "section",
ThematicBreak = "thematic_break",
RawBlock = "raw_block",
MathDisplay = "math_display",
}
/**
* Bounding box coordinates for element positioning.
*
* Every field is optional and read-only. Per the field descriptions, `y0` is the
* bottom edge and `y1` the top edge — the opposite vertical naming from `BBox`,
* where (x1, y1) is the top-left corner.
*/
export interface BoundingBox {
/** Left x-coordinate. */
readonly x0?: number
/** Bottom y-coordinate. */
readonly y0?: number
/** Right x-coordinate. */
readonly x1?: number
/** Top y-coordinate. */
readonly y1?: number
}
/**
* Numeric statistics about a cache: a file count, two sizes, and two file ages.
*
* All fields are required, read-only numbers.
* NOTE(review): the generator provides no description for this type. The `Mb`
* and `Days` suffixes presumably denote megabytes and days — confirm against
* the Rust source.
*/
export interface CacheStats {
readonly totalFiles: number
readonly totalSizeMb: number
readonly availableSpaceMb: number
readonly oldestFileAgeDays: number
readonly newestFileAgeDays: number
}
/**
* A single changed cell within a table.
*
* Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
* reference it unconditionally, without requiring the `diff` Cargo feature.
* `crate::diff` re-exports this type verbatim.
*
* All four fields are required and read-only.
*/
export interface CellChange {
/** Zero-based row index. */
readonly row: number
/** Zero-based column index. */
readonly col: number
/** Value before the change. */
readonly from: string
/** Value after the change. */
readonly to: string
}
/**
* A text chunk with optional embedding and metadata.
*
* Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
* contains the text content, optional embedding vector (if embedding generation
* is configured), and metadata about its position in the document.
*
* `embedding` is the only optional field.
*/
export interface Chunk {
/** The text content of this chunk. */
readonly content: string
/**
* Semantic structural classification of this chunk.
*
* Assigned by the heuristic classifier based on content patterns and
* heading context. Defaults to `ChunkType.Unknown` when no rule matches.
*/
readonly chunkType: ChunkType
/**
* Optional embedding vector for this chunk.
*
* Only populated when `EmbeddingConfig` is provided in chunking configuration.
* The dimensionality depends on the chosen embedding model.
*/
readonly embedding?: Array<number>
/** Metadata about this chunk's position and properties. */
readonly metadata: ChunkMetadata
}
/**
* Type of text chunker to use.
*
* Members:
*
* - `Text` — Generic text splitter, splits on whitespace and punctuation.
* - `Markdown` — Markdown-aware splitter, preserves formatting and structure.
* - `Yaml` — YAML-aware splitter, creates one chunk per top-level key.
* - `Semantic` — Topic-aware chunker. With an `EmbeddingConfig`, splits at
* embedding-based topic shifts tuned by `topicThreshold` (default 0.75,
* lower = more splits). Without an embedding, falls back to a
* structural-boundary heuristic (ALL-CAPS headers, numbered sections,
* blank-line paragraphs) and merges groups into chunks capped at
* `maxCharacters` (default 1000). `topicThreshold` has no effect in the
* fallback path. For best results, pair with an embedding model.
*
* `topicThreshold` and `maxCharacters` are fields of `ChunkingConfig`.
*/
export declare enum ChunkerType {
Text = "text",
Markdown = "markdown",
Yaml = "yaml",
Semantic = "semantic",
}
/**
* Chunking configuration.
*
* Configures text chunking for document content, including chunk size,
* overlap, trimming behavior, and optional embeddings.
*
* In this TypeScript declaration every field is optional, so a partial object
* literal such as `{ maxCharacters: 500 }` is a valid `ChunkingConfig`.
*
* On the Rust side, use `..Default::default()` when constructing to allow for
* future field additions:
* ```rust
* let config = ChunkingConfig {
* max_characters: 500,
* ..Default::default()
* };
* ```
*/
export interface ChunkingConfig {
/**
* Maximum size per chunk (in units determined by `sizing`).
*
* When `sizing` is `Characters` (default), this is the max character count.
* When using token-based sizing, this is the max token count.
*
* Default: 1000
*/
readonly maxCharacters?: number
/**
* Overlap between chunks (in units determined by `sizing`).
*
* Default: 200
*/
readonly overlap?: number
/**
* Whether to trim whitespace from chunk boundaries.
*
* Default: true
*/
readonly trim?: boolean
/**
* Type of chunker to use: one of the `ChunkerType` members
* (`Text`, `Markdown`, `Yaml`, or `Semantic`).
*
* Default: Text
*/
readonly chunkerType?: ChunkerType
/** Optional embedding configuration for chunk embeddings. */
readonly embedding?: EmbeddingConfig
/** Use a preset configuration (overrides individual settings if provided). */
readonly preset?: string
/**
* How to measure chunk size.
*
* Default: `Characters` (Unicode character count).
* Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
*/
readonly sizing?: ChunkSizing
/**
* When `true` and `chunkerType` is `Markdown`, prepend the heading hierarchy
* path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string.
*
* This is useful for RAG pipelines where each chunk needs self-contained
* context about its position in the document structure.
*
* Default: `false`
*/
readonly prependHeadingContext?: boolean
/**
* Optional cosine similarity threshold for semantic topic boundary detection.
*
* Only used when `chunkerType` is `Semantic` and an `EmbeddingConfig` is
* provided. You almost never need to set this. When omitted, defaults to
* `0.75` which works well for most documents. Lower values detect more
* topic boundaries (more, smaller chunks); higher values detect fewer.
* Range: `0.0..=1.0`.
*/
readonly topicThreshold?: number
}
/**
* Metadata about a chunk's position in the original document.
*
* `byteStart`, `byteEnd`, `chunkIndex`, `totalChunks`, and `imageIndices` are
* always present; the remaining fields are optional.
*/
export interface ChunkMetadata {
/** Byte offset where this chunk starts in the original text (UTF-8 valid boundary). */
readonly byteStart: number
/** Byte offset where this chunk ends in the original text (UTF-8 valid boundary). */
readonly byteEnd: number
/**
* Number of tokens in this chunk (if available).
*
* This is calculated by the embedding model's tokenizer if embeddings are enabled.
*/
readonly tokenCount?: number
/** Zero-based index of this chunk in the document. */
readonly chunkIndex: number
/** Total number of chunks in the document. */
readonly totalChunks: number
/**
* First page number this chunk spans (1-indexed).
*
* Only populated when page tracking is enabled in extraction configuration.
*/
readonly firstPage?: number
/**
* Last page number this chunk spans (1-indexed, equal to `firstPage` for single-page chunks).
*
* Only populated when page tracking is enabled in extraction configuration.
*/
readonly lastPage?: number
/**
* Heading context when using Markdown chunker.
*
* Contains the heading hierarchy this chunk falls under.
* Only populated when `ChunkerType.Markdown` is used.
*/
readonly headingContext?: HeadingContext
/**
* Indices into `ExtractionResult.images` for images on pages covered by this chunk.
*
* Contains zero-based indices into the top-level `images` collection for every
* image whose `page_number` falls within `[firstPage, lastPage]`.
* Empty when image extraction is disabled or the chunk spans no pages with images.
*/
readonly imageIndices: Array<number>
}
/**
* How chunk size is measured.
*
* Defaults to `Characters` (Unicode character count). When using token-based sizing,
* chunks are sized by token count according to the specified tokenizer.
*
* Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
* available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
* (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
*
* A discriminated union tagged by the `type` property:
* - `'characters'` — carries no further properties.
* - `'tokenizer'` — carries `model` and `cacheDir`, both declared as required strings.
*/
export type ChunkSizing =
| { type: 'characters' }
| { type: 'tokenizer'; model: string; cacheDir: string }
/**
* Semantic structural classification of a text chunk.
*
* Assigned by the heuristic classifier in `chunking::classifier`.
* Defaults to `Unknown` when no rule matches.
* Designed to be extended in future versions without breaking changes.
*
* A string enum: each member's value is the snake_case form of its name.
*/
export declare enum ChunkType {
/** Section heading or document title. */
Heading = "heading",
/** Party list: names, addresses, and signatories. */
PartyList = "party_list",
/** Definition clause ("X means…", "X shall mean…"). */
Definitions = "definitions",
/** Operative clause containing legal/contractual action verbs. */
OperativeClause = "operative_clause",
/** Signature block with signatures, names, and dates. */
SignatureBlock = "signature_block",
/** Schedule, annex, appendix, or exhibit section. */
Schedule = "schedule",
/** Table-like content with aligned columns or repeated patterns. */
TableLike = "table_like",
/** Mathematical formula or equation. */
Formula = "formula",
/** Code block or preformatted content. */
CodeBlock = "code_block",
/** Embedded or referenced image content. */
Image = "image",
/** Organizational chart or hierarchy diagram. */
OrgChart = "org_chart",
/** Diagram, figure, or visual illustration. */
Diagram = "diagram",
/** Unclassified or mixed content. */
Unknown = "unknown",
}
/**
* Citation file metadata (RIS, PubMed, EndNote).
*
* Every field is optional and read-only. The generator provides no per-field
* descriptions.
* NOTE(review): judging by names and types alone, `citationCount` is a number,
* `format` a string (presumably naming one of the formats above), `authors`,
* `dois`, and `keywords` are string lists, and `yearRange` is a `YearRange` —
* confirm exact semantics against the Rust source.
*/
export interface CitationMetadata {
readonly citationCount?: number
readonly format?: string
readonly authors?: Array<string>
readonly yearRange?: YearRange
readonly dois?: Array<string>
readonly keywords?: Array<string>
}
/**
* Content rendering mode for code extraction.
*
* Controls how extracted code content is represented in the `content` field
* of `ExtractionResult`.
*
* A string enum whose values are the lowercase member names.
*/
export declare enum CodeContentMode {
/** Use TSLP semantic chunks as content (default). */
Chunks = "chunks",
/** Use raw source code as content. */
Raw = "raw",
/** Emit function/class headings + docstrings (no code bodies). */
Structure = "structure",
}
/**
* Cross-extractor content filtering configuration.
*
* Controls whether "furniture" content (headers, footers, page numbers,
* watermarks, repeating text) is included in or stripped from extraction
* results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
* with format-specific implementation.
*
* When unset on `ExtractionConfig`, each extractor uses its current
* default behavior unchanged.
*
* Every field is optional and read-only.
*/
export interface ContentFilterConfig {
/**
* Include running headers in extraction output.
*
* - PDF: Disables top-margin furniture stripping and prevents the layout
* model from treating `PageHeader`-classified regions as furniture.
* - DOCX: Includes document headers in text output.
* - RTF/ODT: Headers already included; this is a no-op when true.
* - HTML/EPUB: Keeps `<header>` element content.
*
* Default: `false` (headers are stripped or excluded).
*/
readonly includeHeaders?: boolean
/**
* Include running footers in extraction output.
*
* - PDF: Disables bottom-margin furniture stripping and prevents the layout
* model from treating `PageFooter`-classified regions as furniture.
* - DOCX: Includes document footers in text output.
* - RTF/ODT: Footers already included; this is a no-op when true.
* - HTML/EPUB: Keeps `<footer>` element content.
*
* Default: `false` (footers are stripped or excluded).
*/
readonly includeFooters?: boolean
/**
* Enable the heuristic cross-page repeating text detector.
*
* When `true` (default), text that repeats verbatim across a supermajority
* of pages is classified as furniture and stripped. Disable this if brand
* names or repeated headings are being incorrectly removed by the heuristic.
*
* Note: when a layout-detection model is active, the model may independently
* classify page-header / page-footer regions as furniture on a per-page basis.
* To preserve those regions, set `includeHeaders` to `true`, `includeFooters`
* to `true`, or both, in addition to disabling this flag.
*
* Primarily affects PDF extraction.
*
* Default: `true`.
*/
readonly stripRepeatingText?: boolean
/**
* Include watermark text in extraction output.
*
* - PDF: Keeps watermark artifacts and arXiv identifiers.
* - Other formats: No effect currently.
*
* Default: `false` (watermarks are stripped).
*/
readonly includeWatermarks?: boolean
}
/**
* Content layer classification for document nodes.
*
* Replaces separate body/furniture arrays with per-node granularity.
*
* A string enum whose values are the lowercase member names.
*/
export declare enum ContentLayer {
/** Main document body content. */
Body = "body",
/** Page/section header (running header). */
Header = "header",
/** Page/section footer (running footer). */
Footer = "footer",
/** Footnote content. */
Footnote = "footnote",
}
/**
* JATS contributor with role.
*
* `name` is required; `role` is optional. Both are read-only strings.
*/
export interface ContributorRole {
/** Contributor name. */
readonly name: string
/** Contributor role, when one is present. */
readonly role?: string
}
/**
* Dublin Core metadata from `docProps/core.xml`.
*
* Contains standard metadata fields defined by the Dublin Core standard
* and Office-specific extensions.
*
* Every field is an optional, read-only string, including the timestamps and
* the revision number.
*/
export interface CoreProperties {
/** Document title. */
readonly title?: string
/** Document subject/topic. */
readonly subject?: string
/** Document creator/author. */
readonly creator?: string
/** Keywords or tags. */
readonly keywords?: string
/** Document description/abstract. */
readonly description?: string
/** User who last modified the document. */
readonly lastModifiedBy?: string
/** Revision number. */
readonly revision?: string
/** Creation timestamp (ISO 8601). */
readonly created?: string
/** Last modification timestamp (ISO 8601). */
readonly modified?: string
/** Document category. */
readonly category?: string
/** Content status (Draft, Final, etc.). */
readonly contentStatus?: string
/** Document language. */
readonly language?: string
/** Unique identifier. */
readonly identifier?: string
/** Document version. */
readonly version?: string
/** Last print timestamp (ISO 8601). */
readonly lastPrinted?: string
}
/**
* CSV/TSV file metadata.
*
* Every field is optional and read-only. The generator provides no per-field
* descriptions.
* NOTE(review): judging by names and types alone, `rowCount` and `columnCount`
* are numbers, `delimiter` is a string, `hasHeader` is a boolean, and
* `columnTypes` is a string list (presumably one entry per column) — confirm
* exact semantics against the Rust source.
*/
export interface CsvMetadata {
readonly rowCount?: number
readonly columnCount?: number
readonly delimiter?: string
readonly hasHeader?: boolean
readonly columnTypes?: Array<string>
}
/**
* dBASE field information.
*
* Both fields are required, read-only strings.
* NOTE(review): the set of possible `fieldType` values is not visible from this
* declaration — confirm against the Rust source.
*/
export interface DbfFieldInfo {
/** Field name. */
readonly name: string
/** Field type, as a string. */
readonly fieldType: string
}
/**
* dBASE (DBF) file metadata.
*
* Every field is optional and read-only. `fields` is a list of `DbfFieldInfo`.
* NOTE(review): `recordCount` and `fieldCount` are presumably the number of
* records and fields in the file — confirm against the Rust source.
*/
export interface DbfMetadata {
readonly recordCount?: number
readonly fieldCount?: number
readonly fields?: Array<DbfFieldInfo>
}
/**
* Page-level detection result containing all detections and page metadata.
*
* All three fields are required and read-only.
* NOTE(review): the unit of `pageWidth` / `pageHeight` is not stated in this
* declaration — confirm against the Rust source.
*/
export interface DetectionResult {
/** Width of the page. */
readonly pageWidth: number
/** Height of the page. */
readonly pageHeight: number
/** All layout detections for the page. */
readonly detections: Array<LayoutDetection>
}
/**
* MIME type detection response.
*
* `mimeType` is always present; `filename` is optional.
*/
export interface DetectResponse {
/** Detected MIME type. */
readonly mimeType: string
/** Original filename (if provided). */
readonly filename?: string
}
/**
* A single contiguous hunk in a unified diff.
*
* All fields are required and read-only. Line numbers are 0-indexed.
*/
export interface DiffHunk {
/** Starting line number in the old content (0-indexed). */
readonly fromLine: number
/** Number of lines from the old content in this hunk. */
readonly fromCount: number
/** Starting line number in the new content (0-indexed). */
readonly toLine: number
/** Number of lines from the new content in this hunk. */
readonly toCount: number
/** Lines that make up this hunk. */
readonly lines: Array<DiffLine>
}
/**
* A single line in a unified-diff hunk.
*
* Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
* reference it unconditionally, without requiring the `diff` Cargo feature.
* `crate::diff` re-exports this type verbatim.
*
* A discriminated union tagged by `kind` (`'context'`, `'added'`, or `'removed'`).
* Every variant carries a string under the numeric property key `0`, so it is
* read with index syntax (`line[0]`).
* NOTE(review): that string is presumably the line's text — confirm against the
* Rust source.
*/
export type DiffLine =
| { kind: 'context'; 0: string }
| { kind: 'added'; 0: string }
| { kind: 'removed'; 0: string }
/**
* Options controlling how two `ExtractionResult` values are compared.
*
* Every field is optional and read-only.
*/
export interface DiffOptions {
/** Include metadata changes in the diff. Default: `true`. */
readonly includeMetadata?: boolean
/** Include embedded-children changes in the diff. Default: `true`. */
readonly includeEmbedded?: boolean
/**
* Truncate content to this many characters before diffing.
*
* Useful for very large documents where only the first N characters matter.
* Leave unset for no truncation.
*/
readonly maxContentChars?: number
}
/**
* Comprehensive Djot document structure with semantic preservation.
*
* This type captures the full richness of Djot markup, including:
* - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
* - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
* - Attributes (classes, IDs, key-value pairs)
* - Links, images, footnotes
* - Math expressions (inline and display)
* - Tables with full structure
*
* Available when the `djot` feature is enabled.
*
* All fields are required and read-only.
*/
export interface DjotContent {
/** Plain text representation for backwards compatibility. */
readonly plainText: string
/** Structured block-level content. */
readonly blocks: Array<FormattedBlock>
/** Metadata from YAML frontmatter. */
readonly metadata: Metadata
/** Extracted tables as structured data. */
readonly tables: Array<Table>
/** Extracted images with metadata. */
readonly images: Array<DjotImage>
/** Extracted links with URLs. */
readonly links: Array<DjotLink>
/** Footnote definitions. */
readonly footnotes: Array<Footnote>
/**
* Attributes mapped by element identifier (if present).
*
* NOTE(review): declared here as a flat array of strings, so the
* identifier-to-attributes mapping is not expressed by this type — confirm how
* each entry is encoded.
*/
readonly attributes: Array<string>
}
/**
* Image element in Djot.
*
* `src` and `alt` are required; `title` and `attributes` are optional.
*/
export interface DjotImage {
/** Image source URL or path. */
readonly src: string
/** Alternative text. */
readonly alt: string
/** Optional title. */
readonly title?: string
/**
* Element attributes, declared as a single string.
* NOTE(review): the encoding of that string is not visible here — confirm.
*/
readonly attributes?: string
}
/**
* Link element in Djot.
*
* `url` and `text` are required; `title` and `attributes` are optional.
*/
export interface DjotLink {
/** Link URL. */
readonly url: string
/** Link text content. */
readonly text: string
/** Optional title. */
readonly title?: string
/**
* Element attributes, declared as a single string.
* NOTE(review): the encoding of that string is not visible here — confirm.
*/
readonly attributes?: string
}
/**
 * Contract for document extractor plugins (a trait on the native Rust side).
 *
 * Implement this interface to add support for new document formats or to override
 * built-in extraction behavior with custom logic.
 *
 * # Return Type
 *
 * Extractors return `InternalDocument`, a flat intermediate representation.
 * The pipeline converts this into the public `ExtractionResult` via the
 * derivation step.
 *
 * NOTE(review): every method below is declared with a `string` (or
 * `Promise<string>`) return type, including those described as returning a
 * document, a list, a number, or a boolean. Presumably the values cross the
 * binding in serialized form — confirm the expected encoding against the
 * generator before implementing.
 *
 * # Priority System
 *
 * When multiple extractors support the same MIME type, the registry selects
 * the extractor with the highest priority value. Use this to:
 * - Override built-in extractors (priority > 50)
 * - Provide fallback extractors (priority < 50)
 * - Implement specialized extractors for specific use cases
 *
 * Default priority is 50.
 *
 * # Thread Safety
 *
 * Extractors must be thread-safe (`Send + Sync` in Rust terms) to support
 * concurrent extraction.
 */
export interface DocumentExtractor {
/**
 * Presumably the extractor's name; how it is used (e.g. as a registry key) is
 * not stated here — confirm.
 */
name(): string
/**
 * Extract content from a byte array.
 *
 * This is the core extraction method that processes in-memory document data.
 * @param content - Raw document bytes
 *
 * @param mimeType - MIME type of the document (already validated)
 *
 * @param config - Extraction configuration
 *
 * @returns An `InternalDocument` containing the extracted elements, metadata, and tables.
 * The pipeline will convert this into the public `ExtractionResult`.
 * NOTE(review): the declared type is `Promise<string>`; presumably the document
 * is passed as a serialized string — confirm.
 *
 * @throws - `KreuzbergError::Parsing` - Document parsing failed
 * - `KreuzbergError::Validation` - Invalid document structure
 * - `KreuzbergError::Io` - I/O errors (these always bubble up)
 * - `KreuzbergError::MissingDependency` - Required dependency not available
 */
extractBytes(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): Promise<string>
/**
 * Extract content from a file.
 *
 * Optional. The default implementation reads the file and calls `extractBytes`
 * (`extract_bytes` on the Rust side).
 * Override for custom file handling, streaming, or memory optimizations.
 * @param path - Path to the document file
 *
 * @param mimeType - MIME type of the document (already validated)
 *
 * @param config - Extraction configuration
 *
 * @returns An `InternalDocument` containing the extracted elements, metadata, and tables.
 * NOTE(review): declared as `Promise<string>` — see the note on `extractBytes`.
 *
 * @throws Same as `extractBytes`, plus file I/O errors.
 */
extractFile?(path: string, mimeType: string, config?: ExtractionConfig | undefined | null): Promise<string>
/**
 * Get the list of MIME types supported by this extractor.
 *
 * Can include exact MIME types and prefix patterns:
 * - Exact: `"application/pdf"`, `"text/plain"`
 * - Prefix: `"image/*"` (matches any image type)
 * @returns The supported MIME type strings.
 * NOTE(review): described as a list, but declared as a single `string`; the
 * encoding of the list is not stated here — confirm.
 */
supportedMimeTypes(): string
/**
 * Get the priority of this extractor.
 *
 * Optional. Higher priority extractors are preferred when multiple extractors
 * support the same MIME type.
 *
 * # Priority Guidelines
 *
 * - **0-25**: Fallback/low-quality extractors
 * - **26-49**: Alternative extractors
 * - **50**: Default priority (built-in extractors)
 * - **51-75**: Premium/enhanced extractors
 * - **76-100**: Specialized/high-priority extractors
 * @returns Priority value (default: 50).
 * NOTE(review): a numeric value is described, but the declared return type is
 * `string` — confirm the expected representation.
 */
priority?(): string
/**
 * Optional: Check if this extractor can handle a specific file.
 *
 * Allows for more sophisticated detection beyond MIME types.
 * Defaults to `true` (rely on MIME type matching).
 * @param path - Path to the file to check
 *
 * @param mimeType - Detected MIME type
 *
 * @returns `true` if the extractor can handle this file, `false` otherwise.
 * NOTE(review): a boolean is described, but the declared return type is
 * `string` — confirm the expected representation.
 */
canHandle?(path: string, mimeType: string): string
}
/**
 * One node of the document tree.
 *
 * A node carries a deterministic `id`, typed `content`, optional
 * `parent`/`children` links that form the tree, and positional metadata such as
 * page number, bounding box and content layer.
 */
export interface DocumentNode {
  /** Deterministic identifier (hash of content + position). */
  readonly id: string;
  /** Node content: a tagged union holding type-specific data only. */
  readonly content: NodeContent;
  /** Index of the parent node; absent for root-level nodes. */
  readonly parent?: number;
  /** Indices of the child nodes, in reading order. */
  readonly children: number[];
  /** Content layer classification. */
  readonly contentLayer: ContentLayer;
  /** Page on which this node starts (1-indexed). */
  readonly page?: number;
  /** Page on which this node ends (for multi-page tables/sections). */
  readonly pageEnd?: number;
  /** Bounding box in document coordinates. */
  readonly bbox?: BoundingBox;
  /**
   * Inline annotations (formatting, links) applying to this node's text content.
   *
   * Only meaningful for text-carrying nodes; empty for containers.
   */
  readonly annotations: TextAnnotation[];
  /**
   * Format-specific key-value attributes.
   *
   * An extensible bag for data that has no dedicated typed field: CSS classes,
   * LaTeX environment names, Excel cell formulas, slide layout names, etc.
   */
  readonly attributes?: Record<string, string>;
}
/** A resolved link between two nodes of the document tree. */
export interface DocumentRelationship {
  /** Index of the source node, i.e. the node doing the referencing. */
  readonly source: number;
  /** Index of the target node, i.e. the node being referenced. */
  readonly target: number;
  /** Semantic kind of the relationship. */
  readonly kind: RelationshipKind;
}
/**
 * One tracked change embedded in a document.
 *
 * Filled in by per-format extractors that understand change-tracking metadata
 * (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, ...). Until a
 * format-specific implementation exists, an extractor leaves
 * `ExtractionResult.revisions` unset.
 */
export interface DocumentRevision {
  /**
   * Format-specific revision identifier.
   *
   * For DOCX this is the value of the `w:id` attribute on the change element
   * (e.g. `"42"`). If that attribute is missing, a synthetic fallback is
   * generated (`"docx-ins-0"`, `"docx-del-3"`, ...).
   */
  readonly revisionId: string;
  /** Display name of the author of the change, when available. */
  readonly author?: string;
  /**
   * ISO-8601 timestamp of the change, when available.
   *
   * Kept as a plain string so the type stays FFI-friendly and does not depend on
   * the optional `chrono` dependency. DOCX fills it from the `w:date` attribute
   * (e.g. `"2024-03-15T10:30:00Z"`).
   */
  readonly timestamp?: string;
  /** Semantic kind of this revision. */
  readonly kind: RevisionKind;
  /**
   * Best-effort location of the revision within the document.
   *
   * Resolution depends on the format; the field is absent when the location
   * cannot be determined (e.g. changes inside table cells before table-cell
   * anchor support is added).
   */
  readonly anchor?: RevisionAnchor;
  /** The content changes that make up this revision. */
  readonly delta: RevisionDelta;
}
/**
 * Top-level structured representation of a document.
 *
 * The tree is stored as a flat array of nodes that refer to each other by index.
 * Root-level nodes have no `parent`. The Rust type additionally offers
 * `body_roots()` and `furniture_roots()` for iterating top-level content by
 * layer; those helpers are not members of this interface.
 *
 * # Validation
 *
 * On the Rust side, `validate()` is called after construction to verify that
 * all node indices are in bounds and that parent-child relationships are
 * bidirectionally consistent.
 */
export interface DocumentStructure {
  /** All nodes, in document/reading order. */
  readonly nodes?: DocumentNode[];
  /**
   * Identifier of the origin format (e.g. "docx", "pptx", "html", "pdf").
   *
   * Lets renderers apply format-aware heuristics when converting the document
   * tree to output formats.
   */
  readonly sourceFormat?: string;
  /**
   * Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
   *
   * Populated during derivation from the internal document representation.
   * Empty when no relationships are detected.
   */
  readonly relationships?: DocumentRelationship[];
  /**
   * Sorted, deduplicated list of the node type names present in this document.
   *
   * Each entry is the snake_case `node_type` tag of the matching `NodeContent`
   * variant (e.g. `"paragraph"`, `"heading"`, `"table"`, ...).
   *
   * Computed from `nodes` by `DocumentStructure::finalize_node_types` on the
   * Rust side. Empty until that method has run (internal construction paths
   * call it at the end of derivation).
   */
  readonly nodeTypes?: string[];
}
/**
 * DOCX application properties, read from `docProps/app.xml`.
 *
 * Holds Word-specific document statistics and metadata. Every field is optional.
 */
export interface DocxAppProperties {
  /** Name of the application (e.g., "Microsoft Office Word"). */
  readonly application?: string;
  /** Version of the application. */
  readonly appVersion?: string;
  /** Filename of the template. */
  readonly template?: string;
  /** Total editing time, in minutes. */
  readonly totalTime?: number;
  /** Page count. */
  readonly pages?: number;
  /** Word count. */
  readonly words?: number;
  /** Character count, excluding spaces. */
  readonly characters?: number;
  /** Character count, including spaces. */
  readonly charactersWithSpaces?: number;
  /** Line count. */
  readonly lines?: number;
  /** Paragraph count. */
  readonly paragraphs?: number;
  /** Company name. */
  readonly company?: string;
  /** Document security level. */
  readonly docSecurity?: number;
  /** Scale crop flag. */
  readonly scaleCrop?: boolean;
  /** Links-up-to-date flag. */
  readonly linksUpToDate?: boolean;
  /** Shared document flag. */
  readonly sharedDoc?: boolean;
  /** Hyperlinks-changed flag. */
  readonly hyperlinksChanged?: boolean;
}
/**
 * Metadata of a Word document.
 *
 * Extracted from DOCX files through the shared Office Open XML metadata
 * extraction; integrates with the `office_metadata` module for
 * core/app/custom properties.
 */
export interface DocxMetadata {
  /**
   * Core properties from `docProps/core.xml` (Dublin Core metadata).
   *
   * Holds title, creator, subject, keywords, dates, etc.
   * The format is shared across DOCX/PPTX/XLSX documents.
   */
  readonly coreProperties?: CoreProperties;
  /**
   * Application properties from `docProps/app.xml` (Word-specific statistics).
   *
   * Holds word count, page count, paragraph count, editing time, etc.
   * This is the DOCX-specific variant of the Office application properties.
   */
  readonly appProperties?: DocxAppProperties;
  /**
   * Custom properties from `docProps/custom.xml` (user-defined properties).
   *
   * Key-value pairs defined by users or applications.
   * Values can be strings, numbers, booleans, or dates.
   */
  readonly customProperties?: Record<string, JsonValue>;
}
/**
 * A semantic element extracted from a document.
 *
 * A logical unit of content carrying a semantic classification, a unique
 * identifier, and metadata that tracks its origin and position.
 */
export interface Element {
  /** Unique identifier of the element. */
  readonly elementId: string;
  /** Semantic type of the element. */
  readonly elementType: ElementType;
  /** Text content of the element. */
  readonly text: string;
  /** Metadata describing the element. */
  readonly metadata: ElementMetadata;
}
/** Metadata attached to a semantic element. */
export interface ElementMetadata {
  /** Page number (1-indexed). */
  readonly pageNumber?: number;
  /** Name of the source file or document. */
  readonly filename?: string;
  /** Bounding box coordinates, if available. */
  readonly coordinates?: BoundingBox;
  /** Position of the element within the element sequence. */
  readonly elementIndex?: number;
  /** Additional custom metadata. */
  readonly additional: Record<string, string>;
}
/**
 * Semantic element type classification.
 *
 * Categorizes text content into semantic units for downstream processing.
 * Supports the element types commonly found in Unstructured documents.
 *
 * Each member's value is the snake_case string shown in its initializer.
 */
export declare enum ElementType {
/** Document title */
Title = "title",
/** Main narrative text body */
NarrativeText = "narrative_text",
/** Section heading */
Heading = "heading",
/** List item (bullet, numbered, etc.) */
ListItem = "list_item",
/** Table element */
Table = "table",
/** Image element */
Image = "image",
/** Page break marker */
PageBreak = "page_break",
/** Code block */
CodeBlock = "code_block",
/** Block quote */
BlockQuote = "block_quote",
/** Footer text */
Footer = "footer",
/** Header text */
Header = "header",
}
/**
 * An attachment of an email message.
 *
 * Carries the attachment's metadata and, optionally, its content.
 */
export interface EmailAttachment {
  /** Attachment name (from the Content-Disposition header). */
  readonly name?: string;
  /** Filename of the attachment. */
  readonly filename?: string;
  /** MIME type of the attachment. */
  readonly mimeType?: string;
  /** Size in bytes. */
  readonly size?: number;
  /** Whether the attachment is an image. */
  readonly isImage: boolean;
  /**
   * Attachment data, if it was extracted.
   * The Rust side holds it as `bytes::Bytes` so large buffers are cheap to clone.
   */
  readonly data?: Uint8Array;
}
/** Settings for email extraction. */
export interface EmailConfig {
  /**
   * Windows codepage number used when an MSG file carries no codepage property.
   * When absent, extraction falls back to windows-1252.
   *
   * An unrecognized or invalid codepage number (including 0) also falls back to
   * windows-1252 silently, exactly as when the MSG file itself contains an
   * unrecognized codepage. No error or warning is emitted, so output should be
   * verified when unusual values are supplied.
   *
   * Common values:
   * - 1250: Central European (Polish, Czech, Hungarian, etc.)
   * - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
   * - 1252: Western European (default)
   * - 1253: Greek
   * - 1254: Turkish
   * - 1255: Hebrew
   * - 1256: Arabic
   * - 932: Japanese (Shift-JIS)
   * - 936: Simplified Chinese (GBK)
   */
  readonly msgFallbackCodepage?: number;
}
/**
 * Result of extracting an email.
 *
 * Complete representation of an extracted email message (.eml or .msg):
 * headers, body content and attachments.
 */
export interface EmailExtractionResult {
  /** Subject line of the email. */
  readonly subject?: string;
  /** Email address of the sender. */
  readonly fromEmail?: string;
  /** Email addresses of the primary recipients. */
  readonly toEmails: string[];
  /** Email addresses of the CC recipients. */
  readonly ccEmails: string[];
  /** Email addresses of the BCC recipients. */
  readonly bccEmails: string[];
  /** Date/timestamp of the email. */
  readonly date?: string;
  /** Value of the Message-ID header. */
  readonly messageId?: string;
  /** Plain-text version of the email body. */
  readonly plainText?: string;
  /** HTML version of the email body. */
  readonly htmlContent?: string;
  /** Cleaned/processed text content. Aliased as `cleaned_text` for back-compat. */
  readonly content: string;
  /** The email's attachments. */
  readonly attachments: EmailAttachment[];
  /** Additional email headers and metadata. */
  readonly metadata: Record<string, string>;
}
/**
 * Email metadata taken from .eml and .msg files.
 *
 * Covers sender/recipient information, the message ID and the attachment list.
 * Every field is optional.
 */
export interface EmailMetadata {
  /** Email address of the sender. */
  readonly fromEmail?: string;
  /** Display name of the sender. */
  readonly fromName?: string;
  /** Primary recipients. */
  readonly toEmails?: string[];
  /** CC recipients. */
  readonly ccEmails?: string[];
  /** BCC recipients. */
  readonly bccEmails?: string[];
  /** Value of the Message-ID header. */
  readonly messageId?: string;
  /** Filenames of the attachments. */
  readonly attachments?: string[];
}
/** Differences in embedded archive children between two results `a` and `b`. */
export interface EmbeddedChanges {
  /** Children found in `b` but not in `a` (matched by `path`). */
  readonly added: ArchiveEntry[];
  /** Children found in `a` but not in `b` (matched by `path`). */
  readonly removed: ArchiveEntry[];
  /**
   * Children found in both results whose content differs (matched by `path`).
   *
   * Each entry holds the diff of the nested `ExtractionResult`.
   */
  readonly changed: EmbeddedDiff[];
}
/** Diff of one embedded archive entry that is present in both results. */
export interface EmbeddedDiff {
  /** Archive-relative path that identifies the entry. */
  readonly path: string;
  /** Recursive diff of the entry's extraction result. */
  readonly diff: ExtractionDiff;
}
/** Descriptor of an embedded file extracted from the PDF name tree. */
export interface EmbeddedFile {
  /** Filename, as stored in the PDF name tree. */
  readonly name: string;
  /** Raw bytes of the file from the embedded stream (already decompressed by lopdf). */
  readonly data: Uint8Array;
  /**
   * Byte count of the original stream in compressed form (before decompression).
   *
   * Callers use it to compute the decompression ratio and to detect
   * zip-bomb-style attacks, where a tiny compressed stream expands to
   * gigabytes of data.
   */
  readonly compressedSize: number;
  /** MIME type, if the filespec specifies one; absent otherwise. */
  readonly mimeType?: string;
}
/**
 * Contract for in-process embedding backend plugins (a trait on the native Rust side).
 *
 * Async to match the convention used by `OcrBackend`,
 * `DocumentExtractor`, and `PostProcessor`.
 * Host-language bridges (PyO3, napi-rs, Rustler, extendr, magnus, ext-php-rs,
 * C FFI, etc.) wrap their synchronous host callables in `spawn_blocking` or the
 * equivalent to satisfy the async signature.
 *
 * # Thread safety
 *
 * Backends must be `Send + Sync + 'static`. They are stored in
 * `Arc<dyn EmbeddingBackend>` and called concurrently from kreuzberg's chunking
 * pipeline. If the backend's underlying model isn't thread-safe, the backend
 * itself must serialize access internally (e.g. via `Mutex<Inner>`).
 *
 * # Contract
 *
 * - `embed(texts)` MUST return exactly `texts.len()` vectors, each of length
 * `self.dimensions()`. The dispatcher in `embed_texts`
 * validates this before returning to downstream consumers; a non-conforming
 * backend surfaces as a `KreuzbergError::Validation`, not a panic.
 * - `embed` may be called from any thread. Its future must be `Send`
 * (enforced by `async_trait` when `#[async_trait]` is used on non-WASM targets).
 * - `dimensions()` is called exactly once at registration, immediately after
 * `initialize()` succeeds. The returned value is cached by the registry and
 * used for all subsequent shape validation. Lazy-loading implementations can
 * defer model loading into `initialize()` and report the real dimension
 * afterwards. Later mutations of the backend's reported dimension are not
 * observed by kreuzberg — implementations that need to change dimension
 * must unregister and re-register.
 * - `shutdown()` (inherited from `Plugin`) may be invoked
 * concurrently with an in-flight `embed()` call. Implementations must
 * tolerate this — e.g. by letting in-flight calls finish using resources
 * held via the `Arc<dyn EmbeddingBackend>` reference, and only releasing
 * shared state that isn't needed by `embed`.
 *
 * NOTE(review): `initialize()` and `shutdown()` are referenced above but are not
 * members of this interface as declared here — confirm how (or whether) the
 * Node binding exposes them.
 *
 * # Runtime
 *
 * The synchronous `embed_texts` entry uses
 * [`tokio::task::block_in_place`] to await the trait's async `embed`, which
 * requires a multi-thread tokio runtime. Callers running inside a
 * `current_thread` runtime (e.g. `#[tokio::test]` without `flavor = "multi_thread"`,
 * or `tokio::runtime::Builder::new_current_thread()`) must use
 * `embed_texts_async` instead, which awaits directly without
 * `block_in_place`.
 */
export interface EmbeddingBackend {
/**
 * Presumably the backend's name; how it is used (e.g. as a registry key) is
 * not stated here — confirm.
 */
name(): string
/**
 * Embedding vector dimension. Must be `> 0` and must match the length of
 * every vector returned by `embed`.
 *
 * NOTE(review): a numeric dimension is described, but the declared return type
 * is `string` — confirm the expected representation.
 */
dimensions(): string
/**
 * Embed a batch of texts, returning one vector per input in order.
 * @param texts - The batch of texts to embed.
 * @returns One vector per input text, in input order.
 * NOTE(review): the declared type is `Promise<string>`; presumably the vectors
 * are passed in serialized form — confirm the expected encoding.
 * @throws Implementations should return `Plugin` for
 * backend-specific failures. The dispatcher layers its own validation
 * (length, per-vector dimension) on top.
 */
embed(texts: Array<string>): Promise<string>
}
/**
 * Settings for generating embeddings of text chunks.
 *
 * Embeddings are produced with ONNX models through the vendored embedding
 * engine. Requires the native `embeddings` feature to be enabled.
 */
export interface EmbeddingConfig {
  /** Embedding model to use (the "balanced" preset is used if not specified). */
  readonly model?: EmbeddingModelType;
  /** Whether embedding vectors are normalized (recommended for cosine similarity). */
  readonly normalize?: boolean;
  /** Batch size used for embedding generation. */
  readonly batchSize?: number;
  /** Whether model download progress is shown. */
  readonly showDownloadProgress?: boolean;
  /**
   * Custom cache directory for model files.
   *
   * Falls back to `~/.cache/kreuzberg/embeddings/` if not specified.
   * Gives full control over where models are downloaded to.
   */
  readonly cacheDir?: string;
  /**
   * Hardware acceleration for the embedding ONNX model.
   *
   * When set, selects the execution provider (CPU, CUDA, CoreML, TensorRT) used
   * for inference. When absent, the provider is auto-selected per platform.
   */
  readonly acceleration?: AccelerationConfig;
  /**
   * Maximum wall-clock duration, in seconds, of a single `embed()` call when the
   * model is the `plugin` variant (`EmbeddingModelType::Plugin`).
   *
   * Applies to the in-process plugin path only. It guards against hung
   * host-language backends (e.g. a Python callback deadlocked on the GIL, a
   * model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher returns
   * `Plugin` instead of blocking forever.
   *
   * On the Rust side `None` disables the timeout, and the default (60 seconds)
   * is conservative for common in-process inference; increase it for large
   * batches on slow hardware.
   *
   * NOTE(review): whether omitting this field selects the 60-second default or
   * disables the timeout is not stated here — confirm.
   */
  readonly maxEmbedDurationSecs?: number;
}
/**
 * Embedding model types supported by Kreuzberg.
 *
 * A discriminated union; the `type` field selects the variant.
 */
export type EmbeddingModelType =
  // A preset, referred to by `name`.
  | {
      type: 'preset';
      name: string;
    }
  // A custom model given by `modelId` together with its `dimensions`.
  | {
      type: 'custom';
      modelId: string;
      dimensions: number;
    }
  // A model described by an `LlmConfig`.
  | {
      type: 'llm';
      llm: LlmConfig;
    }
  // A plugin, referred to by `name`.
  | {
      type: 'plugin';
      name: string;
    };
/**
 * A ready-made configuration for a common RAG use case.
 *
 * A preset bundles chunk size, overlap and embedding model into a
 * configuration tuned for a specific scenario.
 *
 * On the Rust side all string fields are owned `String`s for FFI compatibility,
 * so instances are safe to clone and pass across language boundaries.
 */
export interface EmbeddingPreset {
  /** Name of the preset. */
  readonly name: string;
  /** Chunk size of the preset. NOTE(review): the unit is not stated here — confirm. */
  readonly chunkSize: number;
  /** Chunk overlap of the preset. NOTE(review): the unit is not stated here — confirm. */
  readonly overlap: number;
  /** HuggingFace repository name of the model. */
  readonly modelRepo: string;
  /** Pooling strategy: "cls" or "mean". */
  readonly pooling: string;
  /** Path of the ONNX model file within the repo. */
  readonly modelFile: string;
  /** Presumably the dimension of the model's embedding vectors — confirm. */
  readonly dimensions: number;
  /** Description of the preset. */
  readonly description: string;
}
/**
 * EPUB metadata (Dublin Core extensions).
 *
 * Every field is optional. The individual fields carry no description in the
 * generated source; NOTE(review): they presumably mirror the Dublin Core
 * elements of the same names — confirm.
 */
export interface EpubMetadata {
  readonly coverage?: string;
  readonly dcFormat?: string;
  readonly relation?: string;
  readonly source?: string;
  readonly dcType?: string;
  readonly coverImage?: string;
}
/** Metadata describing an error (used for batch operations). */
export interface ErrorMetadata {
  /** Type of the error, as a string. */
  readonly errorType: string;
  /** Error message. */
  readonly message: string;
}
/**
 * Format metadata for Excel/spreadsheet documents.
 *
 * Marks the document as a spreadsheet source through the
 * `FormatMetadata::Excel` discriminant. The sheet count and sheet names are
 * stored in this structure.
 */
export interface ExcelMetadata {
  /** How many sheets the workbook contains. */
  readonly sheetCount?: number;
  /** Names of all sheets of the workbook. */
  readonly sheetNames?: string[];
}
/**
 * One worksheet of an Excel workbook.
 *
 * Holds the sheet's content converted to Markdown together with its
 * dimensional statistics.
 */
export interface ExcelSheet {
  /** Name of the sheet, as it appears in Excel. */
  readonly name: string;
  /** Content of the sheet, converted to Markdown tables. */
  readonly markdown: string;
  /** Row count. */
  readonly rowCount: number;
  /** Column count. */
  readonly colCount: number;
  /** Total count of non-empty cells. */
  readonly cellCount: number;
  /**
   * Pre-extracted table cells (a 2D array of cell values).
   * Filled in during Markdown generation so the Markdown need not be re-parsed.
   * Absent for empty sheets.
   */
  readonly tableCells?: string[][];
}
/**
 * An Excel workbook.
 *
 * Holds every sheet of an Excel file (.xlsx, .xls, etc.) together with the
 * extracted content and metadata.
 */
export interface ExcelWorkbook {
  /** All sheets of the workbook. */
  readonly sheets: ExcelSheet[];
  /** Workbook-level metadata (author, creation date, etc.). */
  readonly metadata: Record<string, string>;
  /**
   * Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
   *
   * Filled in for legacy shared-workbook `.xlsx` files that contain the
   * `xl/revisions/` directory. Each `<header>` element becomes one
   * `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
   * (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
   * `anchor` and `delta` are `None`/empty for v1 (per-cell log parsing is a
   * follow-up). Absent when `xl/revisions/revisionHeaders.xml` does not exist.
   */
  readonly revisions?: DocumentRevision[];
}
/**
 * ONNX Runtime execution provider type.
 *
 * Determines which hardware backend is used for model inference.
 * `Auto` (default) selects the best available provider per platform.
 *
 * Each member's value is the lowercase string shown in its initializer.
 */
export declare enum ExecutionProviderType {
/** Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere. */
Auto = "auto",
/** CPU execution provider (always available). */
Cpu = "cpu",
/** Apple CoreML (macOS/iOS Neural Engine + GPU). */
CoreMl = "coreml",
/** NVIDIA CUDA GPU acceleration. */
Cuda = "cuda",
/** NVIDIA TensorRT (optimized CUDA inference). */
TensorRt = "tensorrt",
}
/**
 * An image extracted from a document.
 *
 * Holds the raw image data, metadata, and optionally a nested OCR result.
 * Because the bytes are raw, they stay compatible across languages: users can
 * convert them to PIL.Image (Python), Sharp (Node.js), or other formats as needed.
 */
export interface ExtractedImage {
  /**
   * Raw image data (PNG, JPEG, WebP, etc. bytes).
   * The Rust side holds it as `bytes::Bytes` so large buffers are cheap to clone.
   */
  readonly data: Uint8Array;
  /**
   * Image format (e.g., "jpeg", "png", "webp").
   * The Rust side uses `Cow<'static, str>` to avoid allocating for static literals.
   */
  readonly format: string;
  /** Zero-indexed position of the image within the document/page. */
  readonly imageIndex: number;
  /** Page/slide number on which the image was found (1-indexed). */
  readonly pageNumber?: number;
  /** Width of the image, in pixels. */
  readonly width?: number;
  /** Height of the image, in pixels. */
  readonly height?: number;
  /** Colorspace information (e.g., "RGB", "CMYK", "Gray"). */
  readonly colorspace?: string;
  /** Bits per color component (e.g., 8, 16). */
  readonly bitsPerComponent?: number;
  /** Whether the image is a mask image. */
  readonly isMask: boolean;
  /** Optional description of the image. */
  readonly description?: string;
  /**
   * Nested OCR extraction result (if the image was OCRed).
   *
   * When OCR is run on this image, its result is embedded here instead of in a
   * separate collection, which makes the relationship explicit.
   */
  readonly ocrResult?: ExtractionResult;
  /**
   * Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
   * Only populated for PDF-extracted images when the PDF extractor provides position data.
   */
  readonly boundingBox?: BoundingBox;
  /**
   * Original source path of the image inside the document archive (e.g., "media/image1.png" in DOCX).
   * Used for rendering image references when the binary data is not extracted.
   */
  readonly sourcePath?: string;
  /**
   * Heuristic classification of what the image likely depicts.
   * Absent if classification was disabled or inconclusive.
   */
  readonly imageKind?: ImageKind;
  /** Confidence score for `imageKind` (`image_kind`), in the range 0.0 to 1.0. */
  readonly kindConfidence?: number;
  /**
   * Identifier shared by the images that together form one logical figure
   * (e.g. all raster tiles of one technical drawing). Absent for singletons.
   */
  readonly clusterId?: number;
}
/**
 * A URI found in a document.
 *
 * Stands for any link, reference, or resource pointer encountered during
 * extraction. `kind` classifies the URI semantically; `label` holds optional
 * human-readable display text.
 */
export interface ExtractedUri {
  /** The URL or path string. */
  readonly url: string;
  /** Display text / label of the link, if any. */
  readonly label?: string;
  /** Page number on which the URI was found (1-indexed), if known. */
  readonly page?: number;
  /** Semantic classification of the URI. */
  readonly kind: UriKind;
}
/**
 * Top-level configuration for an extraction run.
 *
 * Bundles every option that controls the extraction process. A configuration
 * can be loaded from a TOML, YAML, or JSON file, or built programmatically.
 * Every field is optional; the per-field notes say what an unset (`None`)
 * value means.
 */
export interface ExtractionConfig {
  /** Whether extraction results are cached. */
  readonly useCache?: boolean;
  /** Whether quality post-processing is applied. */
  readonly enableQualityProcessing?: boolean;
  /** OCR configuration. Unset (`None`) = OCR disabled. */
  readonly ocr?: OcrConfig;
  /** Force OCR even for PDFs that are already searchable. */
  readonly forceOcr?: boolean;
  /**
   * Restrict forced OCR to specific pages (1-indexed, each value >= 1).
   *
   * Listed pages are OCR'd regardless of text layer quality; unlisted pages
   * use native text extraction. Ignored when `forceOcr` is `true`. Applies to
   * PDF documents only. Duplicate page numbers are deduplicated automatically.
   * Supplying an `ocr` config is recommended for backend/language selection;
   * defaults are used if it is absent.
   */
  readonly forceOcrPages?: number[];
  /**
   * Turn OCR off entirely, including for images.
   *
   * When `true`, OCR is skipped for all document types. Images return
   * metadata only (dimensions, format, EXIF) without text extraction, and
   * PDFs use native text extraction with no OCR fallback.
   *
   * Must not be `true` at the same time as `forceOcr`.
   *
   * *Added in v4.7.0.*
   */
  readonly disableOcr?: boolean;
  /** Text chunking configuration. Unset (`None`) = chunking disabled. */
  readonly chunking?: ChunkingConfig;
  /**
   * Content filtering configuration. Unset (`None`) = extractor defaults.
   *
   * Decides whether document "furniture" (headers, footers, watermarks,
   * repeating text) is kept in or stripped from extraction results.
   * See `ContentFilterConfig` for per-field documentation.
   */
  readonly contentFilter?: ContentFilterConfig;
  /** Image extraction configuration. Unset (`None`) = no image extraction. */
  readonly images?: ImageExtractionConfig;
  /** PDF-specific options. Unset (`None`) = defaults. */
  readonly pdfOptions?: PdfConfig;
  /** Token reduction configuration. Unset (`None`) = no token reduction. */
  readonly tokenReduction?: TokenReductionOptions;
  /** Language detection configuration. Unset (`None`) = no language detection. */
  readonly languageDetection?: LanguageDetectionConfig;
  /** Page extraction configuration. Unset (`None`) = no page tracking. */
  readonly pages?: PageConfig;
  /** Keyword extraction configuration. Unset (`None`) = no keyword extraction. */
  readonly keywords?: KeywordConfig;
  /** Post-processor configuration. Unset (`None`) = defaults. */
  readonly postprocessor?: PostProcessorConfig;
  /**
   * HTML to Markdown conversion options. Unset (`None`) = defaults.
   *
   * Governs how HTML documents are converted to Markdown: heading styles,
   * list formatting, code block styles, and preprocessing options.
   *
   * NOTE(review): typed as `string` in this binding although the description
   * refers to a set of options - presumably a serialized form; confirm the
   * expected encoding against the generator.
   */
  readonly htmlOptions?: string;
  /**
   * Styled HTML output configuration.
   *
   * When set together with an HTML `outputFormat`, the extraction pipeline
   * uses the `StyledHtmlRenderer`, which emits stable `kb-*` CSS class hooks
   * on every structural element and optionally embeds theme CSS or
   * user-supplied CSS in a `<style>` block.
   *
   * When unset (`None`), the existing plain comrak-based HTML renderer is used.
   */
  readonly htmlOutput?: HtmlOutputConfig;
  /**
   * Default per-file timeout, in seconds, for batch extraction.
   *
   * When set, each file in a batch is canceled after this duration unless
   * `FileExtractionConfig.timeoutSecs` overrides it.
   *
   * Defaults to `Some(60)` so that pathological files (e.g. deeply nested
   * archives, documents with millions of cells) cannot run indefinitely and
   * exhaust caller resources. Set to `None` to disable the timeout for
   * trusted input or long-running workloads.
   */
  readonly extractionTimeoutSecs?: number;
  /**
   * Upper bound on concurrent extractions in batch operations.
   *
   * Caps parallelism to prevent resource exhaustion on large batches.
   * Defaults to `(num_cpus * 1.5).ceil()` when not set.
   */
  readonly maxConcurrentExtractions?: number;
  /**
   * Shape of the returned result.
   *
   * Selects between the unified format (default), where all content is in the
   * `content` field, and the element-based format with semantic elements
   * (for Unstructured-compatible output).
   */
  readonly resultFormat?: ResultFormat;
  /**
   * Security limits for archive extraction.
   *
   * Sets maximum archive size, compression ratio, file count, and other
   * thresholds that guard against decompression bomb attacks. Also caps
   * nesting depth, iteration count, entity / token length, total content
   * size, and table cell count for every extraction path that ingests
   * user-controlled bytes.
   * When unset (`None`), default limits are used.
   */
  readonly securityLimits?: SecurityLimits;
  /**
   * Maximum uncompressed size, in bytes, of a single embedded file before
   * recursive extraction is attempted (default: 50 MiB).
   *
   * Applies to embedded objects inside OOXML containers (DOCX, PPTX) and to
   * email attachments processed via recursive extraction. Files over the
   * limit are skipped with a `ProcessingWarning` instead of being passed to
   * the extraction pipeline, so one oversized embedded object cannot consume
   * unbounded memory or time.
   *
   * Set to `None` to disable the per-embedded-file cap (leaving
   * `securityLimits.max_archive_size` as the only guard).
   */
  readonly maxEmbeddedFileBytes?: number;
  /**
   * Text format of the extracted content (default: Plain).
   *
   * - `Plain`: raw extracted text (default)
   * - `Markdown`: Markdown formatted output
   * - `Djot`: Djot markup format (requires the djot feature)
   * - `Html`: HTML formatted output
   *
   * With a structured format selected, extraction results include formatted
   * output; the `formattedContent` field may be populated when format
   * conversion is applied.
   */
  readonly outputFormat?: OutputFormat;
  /**
   * Layout detection configuration. Unset (`None`) = layout detection disabled.
   *
   * When set, PDF pages and images are analyzed for document structure
   * (headings, code, formulas, tables, figures, etc.) using RT-DETR models
   * via ONNX Runtime. For PDFs, layout hints override paragraph
   * classification in the markdown pipeline. For images, per-region OCR is
   * performed with markdown formatting based on detected layout classes.
   * Running inference requires the `layout-detection` feature; the field is
   * present whenever the `layout-types` feature is active (which includes
   * `layout-detection` as well as the no-ORT target groups).
   */
  readonly layout?: LayoutDetectionConfig;
  /**
   * Run layout detection on the non-OCR PDF markdown path.
   *
   * When `true` and `layout` is set, layout regions inform heading, table,
   * list, and figure detection in the structure pipeline, which would
   * otherwise rely on font-clustering heuristics alone. Significantly
   * improves SF1 (structural F1) at the cost of inference latency
   * (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
   * Requires the `layout-detection` feature.
   */
  readonly useLayoutForMarkdown?: boolean;
  /**
   * Enable structured document tree output.
   *
   * When `true`, the `document` field of `ExtractionResult` is populated with
   * a hierarchical `DocumentStructure` containing heading-driven section
   * nesting, table grids, content layer classification, and inline
   * annotations.
   *
   * Independent of `resultFormat` - can be combined with Unified or
   * ElementBased.
   */
  readonly includeDocumentStructure?: boolean;
  /**
   * Hardware acceleration configuration for ONNX Runtime models.
   *
   * Selects the execution provider for layout detection and embedding
   * models. When unset (`None`), platform defaults are used (CoreML on macOS,
   * CUDA on Linux, CPU on Windows).
   */
  readonly acceleration?: AccelerationConfig;
  /**
   * Cache namespace for tenant isolation.
   *
   * When set, cache entries are stored under `{cache_dir}/{namespace}/`.
   * Only alphanumerics, hyphens, and underscores are allowed (max 64 chars).
   * Different namespaces have isolated cache spaces on the same filesystem.
   */
  readonly cacheNamespace?: string;
  /**
   * Per-request cache TTL in seconds.
   *
   * Overrides the global `max_age_days` for this specific extraction.
   * `0` skips caching completely (no read or write).
   * When unset (`None`), the global TTL applies.
   */
  readonly cacheTtlSecs?: number;
  /**
   * Email extraction configuration. Unset (`None`) = defaults.
   *
   * Currently supports configuring the fallback codepage for MSG files that
   * do not specify one. See `EmailConfig` for details.
   */
  readonly email?: EmailConfig;
  /**
   * Concurrency limits for constrained environments. Unset (`None`) = defaults.
   *
   * Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
   * (when `maxConcurrentExtractions` is unset) the batch concurrency
   * semaphore. See `ConcurrencyConfig` for details.
   *
   * NOTE(review): typed as `string` in this binding although the description
   * points at `ConcurrencyConfig` - presumably a serialized form; confirm.
   */
  readonly concurrency?: string;
  /**
   * Maximum recursion depth for archive extraction (default: 3).
   * `0` disables recursive extraction (legacy behavior).
   */
  readonly maxArchiveDepth?: number;
  /**
   * Tree-sitter language pack configuration. Unset (`None`) = tree-sitter
   * disabled.
   *
   * When set, code files are extracted using tree-sitter parsers. Controls
   * grammar download behavior and code analysis options.
   */
  readonly treeSitter?: TreeSitterConfig;
  /**
   * Structured extraction via LLM. Unset (`None`) = disabled.
   *
   * When set, the extracted document content is sent to an LLM with the
   * provided JSON schema. The structured response is stored in
   * `ExtractionResult.structuredOutput`.
   */
  readonly structuredExtraction?: StructuredExtractionConfig;
  /**
   * Cancellation token for this extraction. Unset (`None`) = no external
   * cancellation.
   *
   * Pass a `CancellationToken` clone here and call `CancellationToken::cancel`
   * from another thread / task to abort the extraction in progress. The
   * extractor checks the token at safe checkpoints (before lock acquisition,
   * between pages, between batch items) and returns
   * `KreuzbergError::Cancelled` when it is set.
   *
   * The field is excluded from serialization because `CancellationToken` is
   * a runtime handle, not a configuration value.
   *
   * NOTE(review): typed as `string` in this binding, which does not obviously
   * match a runtime handle - confirm how (or whether) this field is usable
   * from JavaScript.
   */
  readonly cancelToken?: string;
}
/** Full set of differences between two `ExtractionResult` values, `a` and `b`. */
export interface ExtractionDiff {
  /**
   * Unified-diff hunks for the `content` field.
   *
   * The array is empty when the content is identical.
   */
  readonly contentDiff: DiffHunk[];
  /** Tables found in `b` but not in `a` (by index position, excess right-side tables). */
  readonly tablesAdded: Table[];
  /** Tables found in `a` but not in `b` (by index position, excess left-side tables). */
  readonly tablesRemoved: Table[];
  /** Cell-level changes for table pairs sharing the same index and dimensions. */
  readonly tablesChanged: TableDiff[];
  /**
   * Metadata difference, encoded as a JSON object with three top-level keys:
   * - `added`: keys present in `b` but not `a`
   * - `removed`: keys present in `a` but not `b`
   * - `changed`: keys whose values differ; each entry is
   *   `{ "from": <value-in-a>, "to": <value-in-b> }`
   *
   * This is NOT RFC 6902 JSON Patch - a flatter shape was chosen
   * deliberately to avoid pulling in a json-patch crate. If RFC 6902
   * semantics (with JSON Pointer paths) are needed, feed `a.metadata` and
   * `b.metadata` to a json-patch implementation directly.
   */
  readonly metadataChanged: JsonValue;
  /** Changes to embedded archive children. */
  readonly embeddedChanges: EmbeddedChanges;
}
/** Strategy by which the extracted text was produced. */
export declare enum ExtractionMethod {
  /** Native text extraction. */
  Native = 'native',
  /** OCR-only extraction. */
  Ocr = 'ocr',
  /** Mixed native/OCR output. */
  Mixed = 'mixed',
}
/**
 * General-purpose extraction result of the core extraction API.
 *
 * Every extraction function returns this type.
 */
export interface ExtractionResult {
  /**
   * Extracted text content. In the unified result format all content is
   * carried in this field.
   */
  readonly content?: string;
  /** MIME type string for this result (presumably that of the source document - confirm). */
  readonly mimeType?: string;
  /** Document metadata; see `Metadata`. */
  readonly metadata?: Metadata;
  /**
   * Extraction strategy that produced the returned text.
   *
   * Populated when the extractor can reliably tell apart native text
   * extraction, OCR-only extraction, and mixed native/OCR output.
   */
  readonly extractionMethod?: ExtractionMethod;
  /** Tables extracted from the document. */
  readonly tables?: Table[];
  /** Detected languages (presumably populated when language detection is configured - confirm). */
  readonly detectedLanguages?: string[];
  /**
   * Text chunks, present when chunking is enabled.
   *
   * With a chunking configuration, the content is split into overlapping
   * chunks for efficient processing. Each chunk carries its text, optional
   * embeddings (if enabled), and metadata about its position.
   */
  readonly chunks?: Chunk[];
  /**
   * Images extracted from the document.
   *
   * With image extraction enabled via `ImageExtractionConfig`, this holds all
   * images found in the document with their raw data and metadata. Each
   * image may contain a nested `ocrResult` if OCR was performed on it.
   */
  readonly images?: ExtractedImage[];
  /**
   * Per-page content, present when page extraction is enabled.
   *
   * The document is split into per-page content, with tables and images
   * mapped to their respective pages.
   */
  readonly pages?: PageContent[];
  /**
   * Semantic elements, present for the element-based result format.
   *
   * When `resultFormat` is ElementBased, this holds semantic elements with
   * type classification, unique identifiers, and metadata for
   * Unstructured-compatible element-based processing.
   */
  readonly elements?: Element[];
  /**
   * Rich Djot content structure (when extracting Djot documents).
   *
   * For Djot documents extracted with structured extraction enabled, this
   * holds the full semantic structure, including:
   * - Block-level elements with nesting
   * - Inline formatting with attributes
   * - Links, images, footnotes
   * - Math expressions
   * - Complete attribute information
   *
   * The `content` field still contains plain text for backward compatibility.
   *
   * Always unset (`None`) for non-Djot documents.
   */
  readonly djotContent?: DjotContent;
  /**
   * OCR elements with full spatial and confidence metadata.
   *
   * When OCR runs with element extraction enabled, this holds the structured
   * representation of the detected text, including:
   * - Bounding geometry (rectangles or quadrilaterals)
   * - Confidence scores (detection and recognition)
   * - Rotation information
   * - Hierarchical relationships (Tesseract only)
   *
   * It preserves metadata that would otherwise be lost when converting to
   * plain text or markdown output formats.
   *
   * Only populated when `OcrElementConfig.include_elements` is true.
   */
  readonly ocrElements?: OcrElement[];
  /**
   * Structured document tree (when document structure extraction is enabled).
   *
   * When `includeDocumentStructure` is true in `ExtractionConfig`, this holds
   * the full hierarchical representation of the document, including:
   * - Heading-driven section nesting
   * - Table grids with cell-level metadata
   * - Content layer classification (body, header, footer, footnote)
   * - Inline text annotations (formatting, links)
   * - Bounding boxes and page numbers
   *
   * Independent of `resultFormat` - can be combined with Unified or
   * ElementBased.
   */
  readonly document?: DocumentStructure;
  /**
   * Extracted keywords, present when keyword extraction is enabled.
   *
   * When keyword extraction (RAKE or YAKE) is configured, this holds the
   * extracted keywords with scores, algorithm info, and position data.
   * Previously stored in `metadata.additional["keywords"]`.
   */
  readonly extractedKeywords?: Keyword[];
  /**
   * Document quality score from quality analysis.
   *
   * A value between 0.0 and 1.0 indicating the overall text quality.
   * Previously stored in `metadata.additional["quality_score"]`.
   */
  readonly qualityScore?: number;
  /**
   * Non-fatal warnings collected during processing pipeline stages.
   *
   * Records errors from optional pipeline features (embedding, chunking,
   * language detection, output formatting) that do not prevent extraction
   * but may indicate degraded results.
   * Previously stored as individual keys in `metadata.additional`.
   */
  readonly processingWarnings?: ProcessingWarning[];
  /**
   * PDF annotations extracted from the document.
   *
   * With annotation extraction enabled via `PdfConfig::extract_annotations`,
   * this holds text notes, highlights, links, stamps, and other annotations
   * found in PDF documents.
   */
  readonly annotations?: PdfAnnotation[];
  /**
   * Nested extraction results from archive contents.
   *
   * When extracting archives, each processable file inside produces its own
   * full extraction result. Unset (`None`) for non-archive formats.
   * Use `maxArchiveDepth` in the config to control recursion depth.
   */
  readonly children?: ArchiveEntry[];
  /**
   * URIs/links discovered during document extraction.
   *
   * Holds hyperlinks, image references, citations, email addresses, and
   * other URI-like references found in the document. Always extracted when
   * present in the source document.
   */
  readonly uris?: ExtractedUri[];
  /**
   * Tracked changes embedded in the source document.
   *
   * Populated by per-format extractors that understand change-tracking
   * metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, ...).
   * Every extractor defaults to `None` until its format-specific
   * implementation is added. Extractors that do populate this field follow
   * the "accepted-changes" convention: inserted text is present in
   * `content`, deleted text is absent - the revision list is the separate
   * audit trail.
   */
  readonly revisions?: DocumentRevision[];
  /**
   * Structured extraction output from LLM-based JSON schema extraction.
   *
   * When `structuredExtraction` is configured in `ExtractionConfig`, the
   * extracted document content is sent to a VLM with the provided JSON
   * schema. The response is parsed and stored here as a JSON value matching
   * the schema.
   */
  readonly structuredOutput?: JsonValue;
  /**
   * Code intelligence results from tree-sitter analysis.
   *
   * Populated when extracting source code files with the `tree-sitter`
   * feature. Holds metrics, structural analysis, imports/exports, comments,
   * docstrings, symbols, diagnostics, and optionally chunked code segments.
   *
   * Stored as an opaque JSON value so that all language bindings (Go, Java,
   * C#, ...) can deserialize it as a raw JSON object rather than a typed
   * struct. The underlying type is
   * `tree_sitter_language_pack::ProcessResult`.
   */
  readonly codeIntelligence?: JsonValue;
  /**
   * LLM token usage and cost data for all LLM calls made during this
   * extraction.
   *
   * One entry per LLM call. Multiple entries are produced when VLM OCR,
   * structured extraction, or LLM embeddings run during the same extraction.
   *
   * Unset (`None`) when no LLM was used.
   */
  readonly llmUsage?: LlmUsage[];
  /**
   * Pre-rendered content in the requested output format.
   *
   * Populated during `derive_extraction_result` before tree derivation
   * consumes element data. `apply_output_format` swaps this into `content`
   * at the end of the pipeline, after post-processors have operated on plain
   * text.
   */
  readonly formattedContent?: string;
  /**
   * Structured hOCR document for the OCR+layout pipeline.
   *
   * When tesseract produces hOCR output, the parsed `InternalDocument`
   * carries paragraph structure with bounding boxes and confidence scores.
   * The layout classification step enriches these elements before final
   * rendering.
   *
   * NOTE(review): typed as `string` in this binding although the description
   * refers to a parsed document - presumably a serialized form; confirm.
   */
  readonly ocrInternalDocument?: string;
}
/** Metadata specific to FictionBook (FB2) documents. */
export interface FictionBookMetadata {
  /** Genre entries, as strings. */
  readonly genres?: string[];
  /** Sequence entries, as strings (presumably FB2 book-series names - confirm). */
  readonly sequences?: string[];
  /** Annotation text. */
  readonly annotation?: string;
}
/**
 * Per-file overrides of the extraction configuration for batch processing.
 *
 * Every field is optional; an unset (`None`) field means "use the
 * batch-level default." This type is used with `batch_extract_files` and
 * `batch_extract_bytes` to allow heterogeneous extraction settings within a
 * single batch.
 *
 * # Excluded Fields
 *
 * The following `ExtractionConfig` fields are batch-level only and cannot be
 * overridden per file:
 * - `maxConcurrentExtractions` - controls batch parallelism
 * - `useCache` - global caching policy
 * - `acceleration` - shared ONNX execution provider
 * - `securityLimits` - global archive security policy
 */
export interface FileExtractionConfig {
  /** Per-file override for quality post-processing. */
  readonly enableQualityProcessing?: boolean;
  /** Per-file override for the OCR configuration (unset = use batch default). */
  readonly ocr?: OcrConfig;
  /** Per-file override for force OCR. */
  readonly forceOcr?: boolean;
  /** Per-file override for force OCR pages (1-indexed page numbers). */
  readonly forceOcrPages?: number[];
  /** Per-file override for disable OCR. */
  readonly disableOcr?: boolean;
  /** Per-file override for the chunking configuration. */
  readonly chunking?: ChunkingConfig;
  /** Per-file override for the content filtering configuration. */
  readonly contentFilter?: ContentFilterConfig;
  /** Per-file override for the image extraction configuration. */
  readonly images?: ImageExtractionConfig;
  /** Per-file override for PDF options. */
  readonly pdfOptions?: PdfConfig;
  /** Per-file override for token reduction. */
  readonly tokenReduction?: TokenReductionOptions;
  /** Per-file override for language detection. */
  readonly languageDetection?: LanguageDetectionConfig;
  /** Per-file override for page extraction. */
  readonly pages?: PageConfig;
  /** Per-file override for keyword extraction. */
  readonly keywords?: KeywordConfig;
  /** Per-file override for the post-processor. */
  readonly postprocessor?: PostProcessorConfig;
  /** Per-file override for HTML conversion options. */
  readonly htmlOptions?: string;
  /** Per-file override for the result format. */
  readonly resultFormat?: ResultFormat;
  /** Per-file override for the output content format. */
  readonly outputFormat?: OutputFormat;
  /** Per-file override for document structure output. */
  readonly includeDocumentStructure?: boolean;
  /** Per-file override for layout detection. */
  readonly layout?: LayoutDetectionConfig;
  /**
   * Per-file override for the extraction timeout, in seconds.
   *
   * When set, extraction of this file is canceled after the given duration.
   * A timed-out file produces an error result without affecting other files
   * in the batch.
   */
  readonly timeoutSecs?: number;
  /** Per-file override for the tree-sitter configuration. */
  readonly treeSitter?: TreeSitterConfig;
  /**
   * Per-file override for the structured extraction configuration.
   *
   * When set, enables LLM-based structured extraction with a JSON schema for
   * this specific file. The extracted content is sent to a VLM/LLM and the
   * response is parsed according to the provided schema.
   */
  readonly structuredExtraction?: StructuredExtractionConfig;
}
/** A footnote in a Djot document. */
export interface Footnote {
  /** Label identifying the footnote. */
  readonly label: string;
  /** Blocks making up the footnote content. */
  readonly content: FormattedBlock[];
}
/**
 * Format-specific metadata, modelled as a discriminated union.
 *
 * `format_type` is the discriminant; the variant's payload sits under the
 * numeric key `0`. Only one format type can exist per extraction result,
 * which gives type-safe, clean metadata without nested optionals.
 *
 * NOTE(review): the `'code'` variant carries a plain `string` payload rather
 * than a metadata interface - presumably a serialized value; confirm.
 */
export type FormatMetadata =
  | { format_type: 'pdf', 0: PdfMetadata }
  | { format_type: 'docx', 0: DocxMetadata }
  | { format_type: 'excel', 0: ExcelMetadata }
  | { format_type: 'email', 0: EmailMetadata }
  | { format_type: 'pptx', 0: PptxMetadata }
  | { format_type: 'archive', 0: ArchiveMetadata }
  | { format_type: 'image', 0: ImageMetadata }
  | { format_type: 'xml', 0: XmlMetadata }
  | { format_type: 'text', 0: TextMetadata }
  | { format_type: 'html', 0: HtmlMetadata }
  | { format_type: 'ocr', 0: OcrMetadata }
  | { format_type: 'csv', 0: CsvMetadata }
  | { format_type: 'bibtex', 0: BibtexMetadata }
  | { format_type: 'citation', 0: CitationMetadata }
  | { format_type: 'fiction_book', 0: FictionBookMetadata }
  | { format_type: 'dbf', 0: DbfMetadata }
  | { format_type: 'jats', 0: JatsMetadata }
  | { format_type: 'epub', 0: EpubMetadata }
  | { format_type: 'pst', 0: PstMetadata }
  | { format_type: 'code', 0: string };
/**
 * A block-level element of a Djot document.
 *
 * Models structural elements such as headings, paragraphs, lists, and code
 * blocks.
 */
export interface FormattedBlock {
  /** Kind of block element. */
  readonly blockType: BlockType;
  /** Heading level (1-6) for headings, or nesting level for lists. */
  readonly level?: number;
  /** Inline content contained in the block. */
  readonly inlineContent: InlineElement[];
  /**
   * Element attributes (classes, IDs, key-value pairs).
   *
   * NOTE(review): typed as `string` - presumably a serialized form of the
   * attribute set; confirm the encoding.
   */
  readonly attributes?: string;
  /** Language identifier, for code blocks. */
  readonly language?: string;
  /** Raw code content, for code blocks. */
  readonly code?: string;
  /** Nested blocks, for containers (blockquotes, list items, divs). */
  readonly children: FormattedBlock[];
}
/** One cell of a grid, with its position and span metadata. */
export interface GridCell {
  /** Text content of the cell. */
  readonly content: string;
  /** Row position, zero-indexed. */
  readonly row: number;
  /** Column position, zero-indexed. */
  readonly col: number;
  /** How many rows the cell spans. */
  readonly rowSpan: number;
  /** How many columns the cell spans. */
  readonly colSpan: number;
  /** `true` when the cell is a header cell. */
  readonly isHeader: boolean;
  /** Bounding box of the cell, if available. */
  readonly bbox?: BoundingBox;
}
/** Metadata describing a header/heading element. */
export interface HeaderMetadata {
  /** Header level, from 1 (h1) through 6 (h6). */
  readonly level: number;
  /** Normalized text content of the header. */
  readonly text: string;
  /** HTML `id` attribute, if present. */
  readonly id?: string;
  /** Depth of the header element in the document tree. */
  readonly depth: number;
  /** Byte offset of the header in the original HTML document. */
  readonly htmlOffset: number;
}
/**
 * Heading context of a chunk inside a Markdown document.
 *
 * Holds the chain of headings leading from the document root down to the
 * section the chunk belongs to.
 */
export interface HeadingContext {
  /**
   * Heading hierarchy from the document root to this chunk's section.
   * Index 0 is the outermost heading (h1); the last element is the most
   * specific.
   */
  readonly headings: HeadingLevel[];
}
/** One heading within a heading hierarchy. */
export interface HeadingLevel {
  /** Depth of the heading (1 = h1, 2 = h2, etc.). */
  readonly level: number;
  /** Text content of the heading. */
  readonly text: string;
}
/**
 * A block of text together with its assigned hierarchy level.
 *
 * The semantic heading information comes from font size clustering and
 * hierarchical analysis.
 */
export interface HierarchicalBlock {
  /** Text content of the block. */
  readonly text: string;
  /** Font size of the text in the block. */
  readonly fontSize: number;
  /**
   * Hierarchy level of the block (H1-H6 or Body).
   *
   * Levels correspond to HTML heading tags:
   * - "h1": Top-level heading
   * - "h2": Secondary heading
   * - "h3": Tertiary heading
   * - "h4": Quaternary heading
   * - "h5": Quinary heading
   * - "h6": Senary heading
   * - "body": Body text (no heading level)
   */
  readonly level: string;
  /**
   * Bounding box of the block.
   *
   * Coordinates are given as (left, top, right, bottom) in PDF units.
   */
  readonly bbox?: number[];
}
/**
 * Configuration of hierarchy extraction for PDF text structure analysis.
 *
 * Turns on extraction of document hierarchy levels (H1-H6) based on font
 * size clustering and semantic analysis. When enabled, hierarchical blocks
 * are included in page content.
 */
export interface HierarchyConfig {
  /** Whether hierarchy extraction is enabled. */
  readonly enabled?: boolean;
  /**
   * Number of font size clusters used for hierarchy levels (1-7).
   *
   * Default: 6, which yields H1-H6 heading levels with body text.
   * Larger values produce more fine-grained hierarchy levels.
   */
  readonly kClusters?: number;
  /** Whether hierarchy blocks include bounding box information. */
  readonly includeBbox?: boolean;
  /**
   * OCR coverage threshold for smart OCR triggering (0.0-1.0).
   *
   * Decides when OCR is triggered based on text block coverage: OCR runs
   * when text blocks cover less than this fraction of the page.
   * Default: 0.5 (trigger OCR if less than 50% of the page has text).
   */
  readonly ocrCoverageThreshold?: number;
}
/**
 * Metadata extracted from HTML documents.
 *
 * Covers document-level metadata, Open Graph data, Twitter Card metadata,
 * and extracted structural elements (headers, links, images, structured
 * data).
 */
export interface HtmlMetadata {
  /** Document title, from the `<title>` tag. */
  readonly title?: string;
  /** Document description, from the `<meta name="description">` tag. */
  readonly description?: string;
  /** Document keywords, from the `<meta name="keywords">` tag, split on commas. */
  readonly keywords?: string[];
  /** Document author, from the `<meta name="author">` tag. */
  readonly author?: string;
  /** Canonical URL, from the `<link rel="canonical">` tag. */
  readonly canonicalUrl?: string;
  /** Base URL, from the `<base href="">` tag, for resolving relative URLs. */
  readonly baseHref?: string;
  /** Document language, from the `lang` attribute. */
  readonly language?: string;
  /** Document text direction, from the `dir` attribute. */
  readonly textDirection?: TextDirection;
  /**
   * Open Graph metadata (og:* properties) for social media.
   * Keys are names like "title", "description", "image", "url", etc.
   */
  readonly openGraph?: Record<string, string>;
  /**
   * Twitter Card metadata (twitter:* properties).
   * Keys are names like "card", "site", "creator", "title", "description",
   * "image", etc.
   */
  readonly twitterCard?: Record<string, string>;
  /**
   * Additional meta tags not covered by the specific fields.
   * Keys are meta name/property attributes; values are the content.
   */
  readonly metaTags?: Record<string, string>;
  /** Header elements extracted with their hierarchy. */
  readonly headers?: HeaderMetadata[];
  /** Hyperlinks extracted with type classification. */
  readonly links?: LinkMetadata[];
  /** Images extracted with source and dimensions. */
  readonly images?: ImageMetadataType[];
  /** Structured data blocks extracted from the document. */
  readonly structuredData?: StructuredData[];
}
/**
 * Settings for styled HTML output.
 *
 * When assigned to `ExtractionConfig.htmlOutput` together with an HTML
 * `outputFormat`, the pipeline builds a `StyledHtmlRenderer` instead of the
 * plain comrak-based renderer.
 */
export interface HtmlOutputConfig {
  /**
   * Inline CSS string injected into the output after the theme stylesheet.
   * When `cssFile` is also set, this is concatenated after its content.
   */
  readonly css?: string;
  /**
   * Path to a CSS file, loaded once at renderer construction time.
   * When `css` is also set, the file content is concatenated before it.
   */
  readonly cssFile?: string;
  /** Built-in colour/typography theme. Default: `HtmlTheme::Unstyled`. */
  readonly theme?: HtmlTheme;
  /**
   * Prefix applied to every emitted CSS class name.
   *
   * Default: `"kb-"`. Change it if the host application already uses
   * classes that start with `kb-`.
   */
  readonly classPrefix?: string;
  /**
   * When `true` (default), the resolved CSS is written into a `<style>`
   * block immediately after the opening `<div class="{prefix}doc">`.
   *
   * Set to `false` to emit only the structural markup and wire up your own
   * stylesheet targeting the `kb-*` class names.
   */
  readonly embedCss?: boolean;
}
/** Selection of built-in HTML themes. */
export declare enum HtmlTheme {
  /**
   * Sensible defaults: system font stack, neutral colours, readable line
   * measure. All CSS custom properties (`--kb-*`) are defined, so user CSS
   * can override individual values.
   */
  Default = 'default',
  /** Palette and spacing inspired by GitHub Markdown. */
  GitHub = 'github',
  /** Light text on a dark background. */
  Dark = 'dark',
  /** Minimal light theme with generous whitespace. */
  Light = 'light',
  /**
   * No built-in stylesheet is emitted. CSS custom properties are still
   * defined on `:root`, so user stylesheets can reference `var(--kb-*)`
   * tokens.
   */
  Unstyled = 'unstyled',
}
/** Settings that govern how images are extracted from documents. */
export interface ImageExtractionConfig {
  /** Whether images should be extracted from documents. */
  readonly extractImages?: boolean;
  /** DPI that images are normalized towards. */
  readonly targetDpi?: number;
  /** Upper bound for an image dimension (applies to width or height). */
  readonly maxImageDimension?: number;
  /**
   * Controls injection of image reference placeholders into markdown output.
   * With `true` (the default), references such as `![Image 1](embedded:p1_i0)`
   * are appended to the markdown. Use `false` to obtain the images as data
   * while keeping the markdown output free of placeholders.
   */
  readonly injectPlaceholders?: boolean;
  /** Adjust the DPI automatically, based on image content. */
  readonly autoAdjustDpi?: boolean;
  /** Lower DPI threshold. */
  readonly minDpi?: number;
  /** Upper DPI threshold. */
  readonly maxDpi?: number;
  /**
   * Cap on the number of image objects extracted from a single PDF page.
   *
   * Certain PDFs (for example technical diagrams stored as thousands of
   * raster fragments) can cause extremely long or indefinite extraction
   * times when every image object on a dense page is decoded individually
   * by the PDF extractor. Once the per-page count reaches this cap,
   * kreuzberg stops collecting individual images and emits a warning.
   *
   * When omitted (the default) there is no limit and all images are extracted.
   */
  readonly maxImagesPerPage?: number;
  /**
   * With `true` (the default), extracted images are classified by kind and
   * grouped into clusters where they appear to belong to one figure.
   */
  readonly classify?: boolean;
  /**
   * With `true`, the full-page renders produced during OCR preprocessing are
   * captured and returned as `ImageKind::PageRaster` entries in
   * `ExtractionResult.images`.
   *
   * **PDF + OCR only.** Nothing is captured for non-PDF inputs, nor when the
   * document-level OCR bypass (whole-document backend) is active. If OCR is
   * enabled and this flag is set, but the active backend skips per-page
   * rendering, a `ProcessingWarning` is emitted in
   * `ExtractionResult.processing_warnings`.
   *
   * Defaults to `false`. Turn it on when downstream consumers need page
   * thumbnails (e.g. citation previews, visual grounding).
   */
  readonly includePageRasters?: boolean;
  /**
   * Run OCR over extracted images and add the recognized text to the
   * document content.
   *
   * With `true` (the default) and `ExtractionConfig.ocr` configured, the
   * extracted images are processed by the configured OCR backend. Use
   * `false` to extract images without OCR even though OCR is enabled.
   */
  readonly runOcrOnImages?: boolean;
  /**
   * With `true`, image OCR results are rendered as plain text, without the
   * `![...](...)` markdown placeholder. Has an effect only when
   * `runOcrOnImages` is `true` as well.
   */
  readonly ocrTextOnly?: boolean;
  /**
   * With `true`, and `ocrTextOnly` being `false`, the OCR text is appended
   * after the image placeholder in the rendered output.
   */
  readonly appendOcrText?: boolean;
}
/** Heuristic guess at what an image most likely depicts. */
export declare enum ImageKind {
  /** Photographic image (natural scene, photograph). */
  Photograph = 'photograph',
  /** Technical or schematic diagram. */
  Diagram = 'diagram',
  /** Chart, graph, or plot. */
  Chart = 'chart',
  /** Freehand or technical drawing. */
  Drawing = 'drawing',
  /** Text-heavy image (scanned text, document). */
  TextBlock = 'text_block',
  /** Decorative element or border. */
  Decoration = 'decoration',
  /** Logo or brand mark. */
  Logo = 'logo',
  /** Small icon. */
  Icon = 'icon',
  /** Fragment of a larger tiled image (a tile of a technical drawing). */
  TileFragment = 'tile_fragment',
  /** Mask or transparency map. */
  Mask = 'mask',
  /** Full-page render produced during OCR preprocessing; serves as a citation thumbnail. */
  PageRaster = 'page_raster',
  /** No classification could be made with reasonable confidence. */
  Unknown = 'unknown',
}
/**
 * Metadata read from an image file.
 *
 * Covers the dimensions, the format and the EXIF data.
 */
export interface ImageMetadata {
  /** Width of the image, in pixels. */
  readonly width?: number;
  /** Height of the image, in pixels. */
  readonly height?: number;
  /** Format of the image (e.g., "PNG", "JPEG", "TIFF"). */
  readonly format?: string;
  /** EXIF metadata tags. */
  readonly exif?: Record<string, string>;
}
/** Metadata describing an image element. */
export interface ImageMetadataType {
  /** Source of the image (URL, data URI, or SVG content). */
  readonly src: string;
  /** Alternative text taken from the alt attribute. */
  readonly alt?: string;
  /** Value of the title attribute. */
  readonly title?: string;
  /** Dimensions of the image as (width, height), when available. */
  readonly dimensions?: number[];
  /** Classification of the image type. */
  readonly imageType: ImageType;
  /** Further attributes, given as key-value pairs. */
  readonly attributes: string[][];
}
/**
 * Configuration of the image preprocessing that precedes OCR.
 *
 * The settings determine how an image is prepared before OCR in order to
 * improve text recognition quality. Which preprocessing strategy works best
 * depends on the document type.
 */
export interface ImagePreprocessingConfig {
  /** DPI to target for the image (300 is standard, 600 for small text). */
  readonly targetDpi?: number;
  /** Detect the image rotation automatically and correct it. */
  readonly autoRotate?: boolean;
  /** Correct skew (tilted images). */
  readonly deskew?: boolean;
  /** Remove noise from the image. */
  readonly denoise?: boolean;
  /** Enhance contrast so that text is more visible. */
  readonly contrastEnhance?: boolean;
  /** Binarization method: "otsu", "sauvola", "adaptive". */
  readonly binarizationMethod?: string;
  /** Invert colors (white text on black → black on white). */
  readonly invertColors?: boolean;
}
/**
 * Metadata about image preprocessing.
 *
 * Records which transformations were applied to an image during OCR
 * preprocessing, including DPI normalization, resizing, and resampling.
 */
export interface ImagePreprocessingMetadata {
  /** Dimensions of the original image as (width, height), in pixels. */
  readonly originalDimensions: number[];
  /** DPI of the original image as (horizontal, vertical). */
  readonly originalDpi: number[];
  /** Target DPI taken from the configuration. */
  readonly targetDpi: number;
  /** Scaling factor that was applied to the image. */
  readonly scaleFactor: number;
  /** Whether the DPI was auto-adjusted based on content. */
  readonly autoAdjusted: boolean;
  /** DPI after processing finished. */
  readonly finalDpi: number;
  /** Dimensions after resizing (present if the image was resized). */
  readonly newDimensions?: number[];
  /** Resampling algorithm that was used ("LANCZOS3", "CATMULLROM", etc.). */
  readonly resampleMethod: string;
  /** Whether the dimensions were clamped to max_image_dimension. */
  readonly dimensionClamped: boolean;
  /** Calculated optimal DPI (if auto_adjust_dpi was enabled). */
  readonly calculatedDpi?: number;
  /** Whether the resize was skipped (dimensions already optimal). */
  readonly skippedResize: boolean;
  /** Error message, if the resize failed. */
  readonly resizeError?: string;
}
/** Classifies how an image is referenced. */
export declare enum ImageType {
  /** Image given as a data URI. */
  DataUri = 'data-uri',
  /** SVG embedded inline. */
  InlineSvg = 'inline-svg',
  /** Image at an external URL. */
  External = 'external',
  /** Image at a relative path. */
  Relative = 'relative',
}
/**
 * An inline element contained in a block.
 *
 * Stands for text with formatting, links, images, etc.
 */
export interface InlineElement {
  /** Kind of inline element. */
  readonly elementType: InlineType;
  /** Textual content. */
  readonly content: string;
  /** Attributes of the element. */
  readonly attributes?: string;
  /** Extra metadata (e.g., href for links, src/alt for images). */
  readonly metadata?: Record<string, string>;
}
/** The kinds of inline element that occur in Djot. */
export declare enum InlineType {
  Text = 'text',
  Strong = 'strong',
  Emphasis = 'emphasis',
  Highlight = 'highlight',
  Subscript = 'subscript',
  Superscript = 'superscript',
  Insert = 'insert',
  Delete = 'delete',
  Code = 'code',
  Link = 'link',
  Image = 'image',
  Span = 'span',
  Math = 'math',
  RawInline = 'raw_inline',
  FootnoteRef = 'footnote_ref',
  Symbol = 'symbol',
}
/**
 * Metadata for JATS (Journal Article Tag Suite) documents.
 *
 * NOTE(review): the generated source carries no per-field documentation; the
 * member notes below are inferred from the field names and types only.
 */
export interface JatsMetadata {
  /** Presumably the copyright statement — confirm. */
  readonly copyright?: string;
  /** Presumably the license text or identifier — confirm. */
  readonly license?: string;
  /** String-to-string map; presumably history dates keyed by type — confirm. */
  readonly historyDates?: Record<string, string>;
  /** List of `ContributorRole` entries. */
  readonly contributorRoles?: ContributorRole[];
}
/** A keyword produced by extraction, together with its metadata. */
export interface Keyword {
  /** Text of the keyword. */
  readonly text: string;
  /** Relevance score (higher is better; the range is algorithm-specific). */
  readonly score: number;
  /** Algorithm by which this keyword was extracted. */
  readonly algorithm: KeywordAlgorithm;
  /** Optional positions at which the keyword occurs in the text (character offsets). */
  readonly positions?: number[];
}
/** Chooses the keyword extraction algorithm. */
export declare enum KeywordAlgorithm {
  /** YAKE (Yet Another Keyword Extractor) — a statistical approach. */
  Yake = 'yake',
  /** RAKE (Rapid Automatic Keyword Extraction) — co-occurrence based. */
  Rake = 'rake',
}
/** Configuration of keyword extraction. */
export interface KeywordConfig {
  /** Algorithm used for the extraction. */
  readonly algorithm?: KeywordAlgorithm;
  /** Upper bound on the number of extracted keywords (default: 10). */
  readonly maxKeywords?: number;
  /**
   * Minimum score threshold (0.0-1.0, default: 0.0).
   *
   * Keywords scoring below the threshold are filtered out.
   * Note that score ranges differ between algorithms.
   */
  readonly minScore?: number;
  /**
   * N-gram range used for keyword extraction, as (min, max).
   *
   * (1, 1) = unigrams only
   * (1, 2) = unigrams and bigrams
   * (1, 3) = unigrams, bigrams, and trigrams (default)
   */
  readonly ngramRange?: number[];
  /**
   * Language code used for stopword filtering (e.g., "en", "de", "fr").
   *
   * When omitted, no stopword filtering is applied.
   */
  readonly language?: string;
  /** Tuning parameters specific to YAKE. */
  readonly yakeParams?: YakeParams;
  /** Tuning parameters specific to RAKE. */
  readonly rakeParams?: RakeParams;
}
/** Configuration of language detection. */
export interface LanguageDetectionConfig {
  /** Turns language detection on. */
  readonly enabled?: boolean;
  /** Minimum confidence threshold (0.0-1.0). */
  readonly minConfidence?: number;
  /** Detect several languages in the document. */
  readonly detectMultiple?: boolean;
}
/**
 * The 17 canonical classes of document layout.
 *
 * Every model backend (RT-DETR, YOLO, etc.) maps its native class IDs onto
 * this shared set. Models that have fewer classes (DocLayNet: 11,
 * PubLayNet: 5) map to the closest equivalent.
 *
 * The wire format is snake_case in all serializers (JSON, TOML, YAML).
 */
export declare enum LayoutClass {
  Caption = 'caption',
  Footnote = 'footnote',
  Formula = 'formula',
  ListItem = 'list_item',
  PageFooter = 'page_footer',
  PageHeader = 'page_header',
  Picture = 'picture',
  SectionHeader = 'section_header',
  Table = 'table',
  Text = 'text',
  Title = 'title',
  DocumentIndex = 'document_index',
  Code = 'code',
  CheckboxSelected = 'checkbox_selected',
  CheckboxUnselected = 'checkbox_unselected',
  Form = 'form',
  KeyValueRegion = 'key_value_region',
}
/**
 * One result of layout detection.
 *
 * NOTE(review): the generated source documents no individual fields; value
 * ranges and the coordinate space of `bbox` are not stated here.
 */
export interface LayoutDetection {
  /** Layout class of the detection. */
  readonly className: LayoutClass;
  /** Confidence value of the detection. */
  readonly confidence: number;
  /** Bounding box of the detection. */
  readonly bbox: BBox;
}
/**
 * Configuration of layout detection.
 *
 * Governs how layout detection behaves in the extraction pipeline.
 * Setting it on [`ExtractionConfig`](super::ExtractionConfig) enables layout
 * detection for PDF extraction.
 */
export interface LayoutDetectionConfig {
  /** Override for the confidence threshold (omitted = use the model default). */
  readonly confidenceThreshold?: number;
  /** Whether postprocessing heuristics are applied (default: true). */
  readonly applyHeuristics?: boolean;
  /**
   * Model used for table structure recognition.
   *
   * Determines which model detects table cells inside layout-detected
   * table regions. Defaults to [`TableModel::Tatr`].
   */
  readonly tableModel?: TableModel;
  /**
   * Hardware acceleration for ONNX models (layout detection + table structure).
   *
   * When provided, determines which execution provider (CPU, CUDA, CoreML,
   * TensorRT) is used for inference. When omitted (the default), the
   * provider is auto-selected per platform.
   */
  readonly acceleration?: AccelerationConfig;
}
/**
 * A layout region detected on a page.
 *
 * With layout detection enabled, a page may carry layout regions that
 * identify different content types (text, pictures, tables, etc.) along
 * with confidence scores and spatial positions.
 */
export interface LayoutRegion {
  /** Name of the layout class (e.g. "picture", "table", "text", "section_header"). */
  readonly className?: string;
  /** Confidence score reported by the layout detection model (0.0 to 1.0). */
  readonly confidence?: number;
  /** Bounding box, in document coordinate space. */
  readonly boundingBox?: BoundingBox;
  /** Fraction of the page area that this region covers (0.0 to 1.0). */
  readonly areaFraction?: number;
}
/** Metadata describing a link element. */
export interface LinkMetadata {
  /** Value of the href URL. */
  readonly href: string;
  /** Text content of the link (normalized). */
  readonly text: string;
  /** Optional title attribute. */
  readonly title?: string;
  /** Classification of the link type. */
  readonly linkType: LinkType;
  /** Values of the rel attribute. */
  readonly rel: string[];
  /** Further attributes, given as key-value pairs. */
  readonly attributes: string[][];
}
/** Classifies the kind of a link. */
export declare enum LinkType {
  /** Anchor link (#section). */
  Anchor = 'anchor',
  /** Internal link (same domain). */
  Internal = 'internal',
  /** External link (different domain). */
  External = 'external',
  /** Email link (mailto:). */
  Email = 'email',
  /** Phone link (tel:). */
  Phone = 'phone',
  /** Any other link type. */
  Other = 'other',
}
/** The kind of list that was detected. */
export declare enum ListType {
  /** Bullet points (-, *, •, etc.). */
  Bullet = 'Bullet',
  /** Numbered lists (1., 2., etc.). */
  Numbered = 'Numbered',
  /** Lettered lists (a., b., A., B., etc.). */
  Lettered = 'Lettered',
  /** Indented items. */
  Indented = 'Indented',
}
/**
 * Configuration of an LLM provider/model accessed through liter-llm.
 *
 * Every feature (VLM OCR, VLM embeddings, structured extraction) holds an
 * `LlmConfig` of its own, so each feature can use a different provider.
 * @example
 * ```toml
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * api_key = "sk-..." # or use KREUZBERG_LLM_API_KEY env var
 * ```
 */
export interface LlmConfig {
  /**
   * Provider/model string in liter-llm routing format.
   *
   * Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
   * `"groq/llama-3.1-70b-versatile"`.
   */
  readonly model?: string;
  /**
   * API key for the provider. When omitted, liter-llm falls back to the
   * provider's standard environment variable (e.g., `OPENAI_API_KEY`).
   */
  readonly apiKey?: string;
  /** Custom base URL that overrides the provider endpoint. */
  readonly baseUrl?: string;
  /** Timeout of a request, in seconds (default: 60). */
  readonly timeoutSecs?: number;
  /** Maximum number of retry attempts (default: 3). */
  readonly maxRetries?: number;
  /** Sampling temperature used for generation tasks. */
  readonly temperature?: number;
  /** Maximum number of tokens to generate. */
  readonly maxTokens?: number;
}
/**
 * Token usage and cost figures for one LLM call made during extraction.
 *
 * Filled in when VLM OCR, structured extraction, or LLM-based embeddings are
 * used. Several entries may exist when one extraction performs several LLM
 * calls (e.g. VLM OCR + structured extraction).
 */
export interface LlmUsage {
  /** Identifier of the LLM model (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514"). */
  readonly model?: string;
  /**
   * Pipeline stage by which this LLM call was triggered
   * (e.g. "vlm_ocr", "structured_extraction", "embeddings").
   */
  readonly source?: string;
  /** Count of input/prompt tokens consumed. */
  readonly inputTokens?: number;
  /** Count of output/completion tokens generated. */
  readonly outputTokens?: number;
  /** Total token count (input + output). */
  readonly totalTokens?: number;
  /** Estimated cost in USD, based on the provider's published pricing. */
  readonly estimatedCost?: number;
  /** Reason the model stopped generating (e.g. "stop", "length", "content_filter"). */
  readonly finishReason?: string;
}
/**
 * Metadata of an extraction result.
 *
 * Holds the fields common to all formats, the format-specific metadata as a
 * discriminated union, and any custom fields added by postprocessors.
 */
export interface Metadata {
  /** Title of the document. */
  readonly title?: string;
  /** Subject or description of the document. */
  readonly subject?: string;
  /** Primary author(s) — always a list, for consistency. */
  readonly authors?: string[];
  /** Keywords/tags — always a list, for consistency. */
  readonly keywords?: string[];
  /** Primary language (ISO 639 code). */
  readonly language?: string;
  /** Creation timestamp (ISO 8601 format). */
  readonly createdAt?: string;
  /** Timestamp of the last modification (ISO 8601 format). */
  readonly modifiedAt?: string;
  /** User by whom the document was created. */
  readonly createdBy?: string;
  /** User by whom the document was last modified. */
  readonly modifiedBy?: string;
  /** Page/slide/sheet structure, including boundaries. */
  readonly pages?: PageStructure;
  /**
   * Format-specific metadata (discriminated union).
   *
   * Carries the detailed metadata that is specific to the document format.
   * Serialized as a nested `"format"` object that has a `format_type`
   * discriminator field.
   */
  readonly format?: FormatMetadata;
  /** Image preprocessing metadata (present when OCR preprocessing was applied). */
  readonly imagePreprocessing?: ImagePreprocessingMetadata;
  /** JSON schema (for structured data extraction). */
  readonly jsonSchema?: JsonValue;
  /** Error metadata (for batch operations). */
  readonly error?: ErrorMetadata;
  /**
   * Duration of the extraction in milliseconds (for benchmarking).
   *
   * Batch extraction fills this in to provide per-file timing information.
   * It is absent for single-file extraction (which uses external timing).
   */
  readonly extractionDurationMs?: number;
  /** Category of the document (from frontmatter or classification). */
  readonly category?: string;
  /** Tags of the document (from frontmatter). */
  readonly tags?: string[];
  /** Version string of the document (from frontmatter). */
  readonly documentVersion?: string;
  /** Abstract or summary text (from frontmatter). */
  readonly abstractText?: string;
  /**
   * Identifier of the output format (e.g., "markdown", "html", "text").
   *
   * Written by the output format pipeline stage when a format conversion is
   * applied. Previously kept in `metadata.additional["output_format"]`.
   */
  readonly outputFormat?: string;
  /**
   * Whether OCR was used during extraction.
   *
   * `true` whenever the extraction pipeline ran an OCR backend (Tesseract,
   * PaddleOCR, VLM, etc.) and used its output as the primary or fallback
   * text. `false` means that native text extraction was used exclusively.
   */
  readonly ocrUsed?: boolean;
  /**
   * Additional custom fields coming from postprocessors.
   *
   * Serialized as a nested `"additional"` object (not flattened at root
   * level). On the Rust side the keys are `Cow<'static, str>`, so static
   * string keys avoid allocation.
   */
  readonly additional?: Record<string, JsonValue>;
}
/** The paths of all models that OCR needs, combined (backward compatibility). */
export interface ModelPaths {
  /** Directory path of the detection model. */
  readonly detModel: string;
  /** Directory path of the classification model. */
  readonly clsModel: string;
  /** Directory path of the recognition model. */
  readonly recModel: string;
  /** File path of the character dictionary. */
  readonly dictFile: string;
}
/**
 * Tagged union describing the content of a node. Every variant carries only
 * the data specific to its type.
 *
 * The discriminator is `node_type` (from `#[serde(tag = "node_type")]`), which
 * avoids a collision with the "type" keyword in Go/Java/TypeScript bindings.
 * Narrow on `node_type` before reading variant-specific fields.
 */
export type NodeContent =
  | { node_type: 'title'; text: string }
  | { node_type: 'heading'; level: number; text: string }
  | { node_type: 'paragraph'; text: string }
  | { node_type: 'list'; ordered: boolean }
  | { node_type: 'list_item'; text: string }
  | { node_type: 'table'; grid: TableGrid }
  | { node_type: 'image'; description: string; imageIndex: number; src: string }
  | { node_type: 'code'; text: string; language: string }
  | { node_type: 'quote' }
  | { node_type: 'formula'; text: string }
  | { node_type: 'footnote'; text: string }
  | { node_type: 'group'; label: string; headingLevel: number; headingText: string }
  | { node_type: 'page_break' }
  | { node_type: 'slide'; number: number; title: string }
  | { node_type: 'definition_list' }
  | { node_type: 'definition_item'; term: string; definition: string }
  | { node_type: 'citation'; key: string; text: string }
  | { node_type: 'admonition'; kind: string; title: string }
  | { node_type: 'raw_block'; format: string; content: string }
  | { node_type: 'metadata_block'; entries: string[][] };
/**
 * Contract for OCR backend plugins (generated from the Rust trait).
 *
 * Implement this interface to add custom OCR capabilities. An OCR backend
 * can be:
 * - A native Rust implementation (like Tesseract)
 * - An FFI bridge to a Python library (like EasyOCR, PaddleOCR)
 * - A cloud-based OCR service (Google Vision, AWS Textract, etc.)
 *
 * # Thread Safety
 *
 * On the Rust side, OCR backends must be thread-safe (`Send + Sync`) so that
 * concurrent processing is supported.
 *
 * NOTE(review): several methods below are declared as returning `string`
 * (or `Promise<string>`) although their descriptions speak of booleans,
 * lists or extraction results. Presumably the bridge expects an encoded
 * value — confirm against the binding generator before relying on it.
 */
export interface OcrBackend {
  /** Returns the backend's name. */
  name(): string;
  /**
   * Runs OCR over an image and extracts its text.
   * @param imageBytes - Raw image data (JPEG, PNG, TIFF, etc.)
   *
   * @param config - OCR configuration (language, PSM mode, etc.)
   *
   * @returns Described by the Rust trait as an `ExtractionResult` holding the
   * extracted text and metadata; declared here as `Promise<string>`.
   *
   * @throws `KreuzbergError::Ocr` - OCR processing failed
   * @throws `KreuzbergError::Validation` - Invalid image format or configuration
   * @throws `KreuzbergError::Io` - I/O errors (these always bubble up)
   *
   * # Reading `backend_options`
   *
   * A backend that supports runtime tuning can read `config.backend_options`
   * and deserialize only the keys it cares about. Unknown keys are silently
   * ignored, so several backends can coexist in a pipeline without key
   * conflicts.
   *
   * ```rust
   * async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
   *     // Read backend-specific options; unknown keys are silently ignored.
   *     let fast_mode = config.backend_options
   *         .as_ref()
   *         .and_then(|v| v.get("mode"))
   *         .and_then(|v| v.as_str())
   *         .map(|s| s == "fast")
   *         .unwrap_or(false);
   *
   *     if image_bytes.is_empty() {
   *         return Err(kreuzberg::KreuzbergError::Validation {
   *             message: "Empty image data".to_string(),
   *             source: None,
   *         });
   *     }
   *
   *     let text = if fast_mode {
   *         "Fast OCR result".to_string()
   *     } else {
   *         format!("Extracted text in language: {}", config.language)
   *     };
   *
   *     Ok(ExtractionResult {
   *         content: text,
   *         mime_type: Cow::Borrowed("text/plain"),
   *         ..Default::default()
   *     })
   * }
   * ```
   */
  processImage(imageBytes: Uint8Array, config?: OcrConfig | null): Promise<string>;
  /**
   * Runs OCR over a file and extracts its text.
   *
   * The default implementation reads the file and calls `processImage`.
   * Override it for custom file handling or optimizations.
   * @param path - Path to the image file
   *
   * @param config - OCR configuration
   *
   * @throws Same as `processImage`, plus file I/O errors.
   */
  processImageFile?(path: string, config?: OcrConfig | null): Promise<string>;
  /**
   * Tells whether this backend supports the given language code.
   * @param lang - ISO 639-2/3 language code (e.g., "eng", "deu", "fra")
   *
   * @returns Described as `true` if the language is supported and `false`
   * otherwise; the declared type is `string`.
   */
  supportsLanguage(lang: string): string;
  /**
   * Gets the identifier of the backend type.
   * @returns The backend type enum value (declared as `string`).
   */
  backendType(): string;
  /**
   * Optional: gets the list of all supported languages.
   *
   * Defaults to an empty list. Override it to provide comprehensive
   * language support information. The declared return type is `string`.
   */
  supportedLanguages?(): string;
  /**
   * Optional: tells whether the backend supports table detection.
   *
   * Defaults to `false`. Override it if your backend can detect and extract
   * tables. The declared return type is `string`.
   */
  supportsTableDetection?(): string;
  /**
   * Tells whether the backend supports direct document-level processing
   * (e.g. for PDFs).
   *
   * Defaults to `false`. Override it if the backend has optimized document
   * processing. The declared return type is `string`.
   */
  supportsDocumentProcessing?(): string;
  /**
   * Processes a document file directly via OCR.
   *
   * Called only if `supportsDocumentProcessing` reports `true`.
   * @param path - Path to the document file (e.g. .pdf)
   *
   * @param config - OCR configuration
   */
  processDocument?(path: string, config?: OcrConfig | null): Promise<string>;
}
/** The available kinds of OCR backend. */
export declare enum OcrBackendType {
  /** Tesseract OCR (native Rust binding). */
  Tesseract = 'Tesseract',
  /** EasyOCR (Python-based, via FFI). */
  EasyOCR = 'EasyOCR',
  /** PaddleOCR (Python-based, via FFI). */
  PaddleOCR = 'PaddleOCR',
  /** Custom/third-party OCR backend. */
  Custom = 'Custom',
}
/**
 * Geometry that bounds an OCR element, discriminated by `type`.
 *
 * Covers both axis-aligned rectangles (from Tesseract) and 4-point
 * quadrilaterals (from PaddleOCR and rotated text detection).
 *
 * NOTE(review): `points` of the quadrilateral variant is declared as a
 * `string`; presumably an encoded list of points — confirm the format.
 */
export type OcrBoundingGeometry =
  | { type: 'rectangle'; left: number; top: number; width: number; height: number }
  | { type: 'quadrilateral'; points: string };
/**
 * Confidence scores of an OCR element.
 *
 * Detection confidence (how sure the engine is that text exists at this
 * location) is kept apart from recognition confidence (how sure it is about
 * the actual text content).
 */
export interface OcrConfidence {
  /**
   * Detection confidence: how sure the OCR engine is that text exists here.
   *
   * PaddleOCR supplies it as `box_score`; Tesseract has no direct
   * equivalent. Range: 0.0 to 1.0 (absent when not available).
   */
  readonly detection?: number;
  /**
   * Recognition confidence: how sure the engine is about the text content.
   *
   * Range: 0.0 to 1.0.
   */
  readonly recognition?: number;
}
/** Configuration of OCR. */
export interface OcrConfig {
  /**
   * Whether OCR is enabled.
   *
   * `enabled: false` is a shorthand for `disable_ocr: true` on the parent
   * [`ExtractionConfig`](crate::core::config::ExtractionConfig). Images then
   * return metadata only, and PDFs use native text extraction without OCR
   * fallback.
   *
   * Defaults to `true`. With `false`, every other OCR setting is ignored.
   */
  readonly enabled?: boolean;
  /** OCR backend: tesseract, easyocr, paddleocr. */
  readonly backend?: string;
  /** Language code (e.g., "eng", "deu"). */
  readonly language?: string;
  /** Configuration specific to Tesseract (optional). */
  readonly tesseractConfig?: TesseractConfig;
  /** Output format of the OCR results (optional, for format conversion). */
  readonly outputFormat?: OutputFormat;
  /** Configuration specific to PaddleOCR (optional, JSON passthrough). */
  readonly paddleOcrConfig?: JsonValue;
  /**
   * Arbitrary per-call options, handed to the backend unchanged.
   *
   * Custom OCR backends, and built-in backends that support runtime tuning,
   * can read this value and deserialize the keys they care about. Keys the
   * backend does not know are silently ignored.
   *
   * This is the recommended extension point for per-call parameters that
   * the typed fields above do not cover (e.g. mode switching, preprocessing
   * flags, inference batch size).
   *
   * **Scope:** when `pipeline` is not set, this value is propagated to the
   * primary stage of the auto-constructed pipeline. When `pipeline` is
   * explicitly set, this field has **no effect** — the caller must instead
   * set `OcrPipelineStage.backend_options` directly on the relevant
   * stage(s).
   *
   * Example:
   * ```json
   * { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
   * ```
   */
  readonly backendOptions?: JsonValue;
  /** Configuration of OCR element extraction. */
  readonly elementConfig?: OcrElementConfig;
  /**
   * Quality thresholds used for the native-text-to-OCR fallback decision.
   * When omitted, the compiled defaults apply (matching the previous
   * hardcoded behavior).
   */
  readonly qualityThresholds?: OcrQualityThresholds;
  /**
   * Configuration of a multi-backend OCR pipeline. When provided, enables
   * weighted fallback across several OCR backends based on output quality.
   * When omitted, the single `backend` field is used (same as today).
   */
  readonly pipeline?: OcrPipelineConfig;
  /**
   * Enables automatic page rotation based on orientation detection.
   *
   * When enabled, Tesseract's `DetectOrientationScript()` is used to detect
   * the page orientation (0/90/180/270 degrees) before OCR. If the page is
   * rotated with high confidence, the image is corrected before
   * recognition. This is critical for handling rotated scanned documents.
   */
  readonly autoRotate?: boolean;
  /**
   * Configuration of VLM (Vision Language Model) OCR.
   *
   * Required when `backend` is `"vlm"`. Page images are sent through
   * liter-llm to a vision model for text extraction.
   */
  readonly vlmConfig?: LlmConfig;
  /**
   * Custom Jinja2 prompt template for VLM OCR.
   *
   * When omitted, the default template is used. Available variables:
   * - `{{ language }}` — The document language code (e.g., "eng", "deu").
   */
  readonly vlmPrompt?: string;
  /**
   * Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout
   * detection).
   *
   * Not user-configurable via config files — it is injected at runtime from
   * `ExtractionConfig::acceleration` before each `process_image` call.
   */
  readonly acceleration?: AccelerationConfig;
  /**
   * Tesseract `traineddata` bytes supplied by the caller, per language code.
   *
   * The primary use case is the WASM build, which has no filesystem and
   * cannot download tessdata at runtime. Native builds typically rely on
   * `TessdataManager` and ignore this field. When present, the WASM
   * Tesseract backend prefers these bytes over its compile-time-bundled
   * English data.
   *
   * Skipped by serde to keep config files small — supply it through the
   * typed API at runtime.
   */
  readonly tessdataBytes?: Record<string, Uint8Array>;
}
/**
 * A unified OCR element: detected text together with its full metadata.
 *
 * This is the primary type for structured OCR output; it preserves all the
 * information from both the Tesseract and PaddleOCR backends.
 */
export interface OcrElement {
  /** Text content that was recognized. */
  readonly text?: string;
  /** Bounding geometry (rectangle or quadrilateral). */
  readonly geometry?: OcrBoundingGeometry;
  /** Confidence scores for detection and recognition. */
  readonly confidence?: OcrConfidence;
  /** Hierarchical level (word, line, block, page). */
  readonly level?: OcrElementLevel;
  /** Rotation information (if detected). */
  readonly rotation?: OcrRotation;
  /** Page number (1-indexed). */
  readonly pageNumber?: number;
  /**
   * ID of the parent element, for hierarchical relationships.
   *
   * Used only for Tesseract output, which has a word -> line -> block
   * hierarchy.
   */
  readonly parentId?: string;
  /** Backend-specific metadata that does not fit the unified schema. */
  readonly backendMetadata?: Record<string, JsonValue>;
}
/**
 * Configuration of OCR element extraction.
 *
 * Governs how OCR elements are extracted and filtered.
 */
export interface OcrElementConfig {
  /**
   * Whether OCR elements are included in the extraction result.
   *
   * With `true`, the `ocr_elements` field of `ExtractionResult` is populated.
   */
  readonly includeElements?: boolean;
  /**
   * Lowest hierarchical level to include.
   *
   * Elements below this level (e.g., words when the minimum level is Line)
   * are excluded.
   */
  readonly minLevel?: OcrElementLevel;
  /**
   * Minimum recognition confidence threshold (0.0-1.0).
   *
   * Elements whose confidence lies below the threshold are filtered out.
   */
  readonly minConfidence?: number;
  /**
   * Whether hierarchical relationships between elements are built.
   *
   * With `true`, the `parent_id` fields are populated based on spatial
   * containment. Meaningful only for Tesseract output.
   */
  readonly buildHierarchy?: boolean;
}
/**
 * The hierarchical level of an OCR element.
 *
 * Corresponds to Tesseract's page segmentation hierarchy and offers
 * equivalent semantics for PaddleOCR.
 */
export declare enum OcrElementLevel {
  /** A single word. */
  Word = 'word',
  /** A line of text (default for PaddleOCR). */
  Line = 'line',
  /** A paragraph or text block. */
  Block = 'block',
  /** A page-level element. */
  Page = 'page',
}
/**
 * Result of an OCR extraction.
 *
 * Produced by performing OCR on an image or scanned document; it includes
 * the recognized text and the detected tables.
 */
export interface OcrExtractionResult {
  /** Text content that was recognized. */
  readonly content: string;
  /** Original MIME type of the processed image. */
  readonly mimeType: string;
  /** Metadata of the OCR processing (confidence scores, language, etc.). */
  readonly metadata: Record<string, JsonValue>;
  /** Tables that were detected and extracted via OCR. */
  readonly tables: OcrTable[];
  /**
   * Structured OCR elements with bounding boxes and confidence scores.
   * Available when TSV output is requested or table detection is enabled.
   */
  readonly ocrElements?: OcrElement[];
  /**
   * Structured document produced from hOCR parsing.
   * Carries the paragraph structure, bounding boxes, and confidence scores
   * that the flattened `content` string discards.
   *
   * NOTE(review): declared as `string`; presumably a serialized form of the
   * document — confirm the encoding.
   */
  readonly internalDocument?: string;
}
/**
 * Metadata of OCR processing.
 *
 * Records information about the OCR processing configuration and results.
 */
export interface OcrMetadata {
  /** OCR language code(s) that were used. */
  readonly language?: string;
  /** Tesseract Page Segmentation Mode (PSM). */
  readonly psm?: number;
  /** Output format (e.g., "text", "hocr"). */
  readonly outputFormat?: string;
  /** Number of detected tables. */
  readonly tableCount?: number;
  /** NOTE(review): undocumented in the generated source; presumably a table row count — confirm. */
  readonly tableRows?: number;
  /** NOTE(review): undocumented in the generated source; presumably a table column count — confirm. */
  readonly tableCols?: number;
}
/**
* Multi-backend OCR pipeline with quality-based fallback.
*
* Backends are tried in priority order (highest first). After each backend
* produces output, quality is evaluated. If it meets
* `qualityThresholds.pipelineMinQuality`, the result is accepted.
* Otherwise the next backend is tried.
*/
export interface OcrPipelineConfig {
/**
* List of backends to try. The order given here is not significant:
* stages are sorted by `priority` (descending) at runtime.
*/
readonly stages: Array<OcrPipelineStage>
/** Quality thresholds for deciding whether to accept a result or try the next backend. */
readonly qualityThresholds: OcrQualityThresholds
}
/**
* A single backend stage in the OCR pipeline.
*
* Only `backend` and `priority` are required; every other field is an
* optional per-stage override.
*/
export interface OcrPipelineStage {
/** Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name. */
readonly backend: string
/** Priority weight (higher = tried first). Stages are sorted by priority descending. */
readonly priority: number
/** Language override for this stage (omitted = use parent OcrConfig.language). */
readonly language?: string
/** Tesseract-specific config override for this stage. */
readonly tesseractConfig?: TesseractConfig
/**
* PaddleOCR-specific config for this stage.
*
* NOTE(review): typed as free-form `JsonValue`, not as the `PaddleOcrConfig`
* interface declared in this file — presumably it accepts the same keys;
* confirm the expected key casing against the native binding.
*/
readonly paddleOcrConfig?: JsonValue
/** VLM config override for this pipeline stage. */
readonly vlmConfig?: LlmConfig
/**
* Arbitrary per-call options passed through to the backend unchanged.
*
* Backends that support runtime tuning (mode switching, preprocessing
* flags, inference parameters, etc.) read this value and deserialize
* the keys they care about. Keys unknown to the backend are silently
* ignored, so options from different backends can coexist in the same
* config without conflict.
*
* Example (custom backend):
* ```json
* { "mode": "fast", "enable_layout": true }
* ```
*/
readonly backendOptions?: JsonValue
}
/**
* Quality thresholds for OCR fallback decisions and pipeline quality gating.
*
* Every field is optional. Per the upstream documentation, the defaults match
* the previous hardcoded behavior, so a default-constructed value
* (`OcrQualityThresholds::default()` in the Rust core) preserves existing
* semantics exactly.
*/
export interface OcrQualityThresholds {
/** Minimum total non-whitespace characters to consider text substantive. */
readonly minTotalNonWhitespace?: number
/** Minimum non-whitespace characters per page on average. */
readonly minNonWhitespacePerPage?: number
/** Minimum character count for a word to be "meaningful". */
readonly minMeaningfulWordLen?: number
/** Minimum count of meaningful words before text is accepted. */
readonly minMeaningfulWords?: number
/** Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric). */
readonly minAlnumRatio?: number
/** Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback. */
readonly minGarbageChars?: number
/** Maximum fraction of short (1-2 char) words before text is considered fragmented. */
readonly maxFragmentedWordRatio?: number
/**
* Critical fragmentation threshold — triggers OCR regardless of meaningful words.
* Normal English text has ~20-30% short words. 80%+ is definitive garbage.
*/
readonly criticalFragmentedWordRatio?: number
/** Minimum average word length. Below this with enough words indicates garbled extraction. */
readonly minAvgWordLength?: number
/** Minimum word count before average word length check applies. */
readonly minWordsForAvgLengthCheck?: number
/** Minimum consecutive word repetition ratio to detect column scrambling. */
readonly minConsecutiveRepeatRatio?: number
/** Minimum word count before consecutive repetition check is applied. */
readonly minWordsForRepeatCheck?: number
/** Minimum character count for "substantive markdown" OCR skip gate. */
readonly substantiveMinChars?: number
/** Minimum character count for "non-text content" OCR skip gate. */
readonly nonTextMinChars?: number
/** Alphanumeric+whitespace ratio threshold for skip decisions. */
readonly alnumWsRatioThreshold?: number
/**
* Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
* If the result from a backend scores below this, the next backend is tried.
*/
readonly pipelineMinQuality?: number
}
/** Rotation information for an OCR element. */
export interface OcrRotation {
/** Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR). */
readonly angleDegrees: number
/** Confidence score for the rotation detection; may be absent. */
readonly confidence?: number
}
/**
* Table detected via OCR.
*
* Represents a table structure recognized during OCR processing.
*/
export interface OcrTable {
/** Table cells as a 2D array (rows × columns): the outer array holds rows, each inner array holds that row's cell texts. */
readonly cells: Array<Array<string>>
/** Markdown representation of the table */
readonly markdown: string
/** Page number where the table was found (1-indexed) */
readonly pageNumber: number
/** Bounding box of the table in pixel coordinates (from OCR word positions); may be absent. */
readonly boundingBox?: OcrTableBoundingBox
}
/**
* Bounding box for an OCR-detected table in pixel coordinates.
*
* The box is described by its four edges rather than by origin plus size.
*/
export interface OcrTableBoundingBox {
/** Left x-coordinate (pixels) */
readonly left: number
/** Top y-coordinate (pixels) */
readonly top: number
/** Right x-coordinate (pixels) */
readonly right: number
/** Bottom y-coordinate (pixels) */
readonly bottom: number
}
/** Document orientation detection result. */
export interface OrientationResult {
/** Detected orientation in degrees (0, 90, 180, or 270). */
readonly degrees: number
/** Confidence score of the detection (0.0-1.0). */
readonly confidence: number
}
/**
* Output format for extraction results.
*
* Controls the format of the `content` field in `ExtractionResult`.
* When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
* `Plain` returns the raw extracted text.
* `Json` returns a JSON tree with heading-driven sections.
* `Structured` returns JSON with full OCR element data including bounding
* boxes and confidence scores.
* `Custom` selects a renderer registered via the RendererRegistry.
*/
export declare enum OutputFormat {
/** Plain text content only (default) */
Plain = "plain",
/** Markdown format */
Markdown = "markdown",
/** Djot markup format */
Djot = "djot",
/** HTML format */
Html = "html",
/** JSON tree format with heading-driven sections. */
Json = "json",
/** Structured JSON format with full OCR element metadata. */
Structured = "structured",
/**
* Custom renderer registered via the RendererRegistry.
* Upstream describes the associated string as the renderer name
* (e.g., "docx", "latex").
*
* NOTE(review): this member is the fixed string "custom" and cannot carry
* a renderer name; how the name is supplied from JavaScript is not visible
* in this declaration — confirm against the native binding.
*/
Custom = "custom",
}
/**
* Supported languages in PaddleOCR.
*
* Maps user-friendly language codes to paddle-ocr-rs language identifiers.
* Each enum value is the member name itself (PascalCase), not a short
* language code.
*/
export declare enum PaddleLanguage {
/** English */
English = "English",
/** Simplified Chinese */
Chinese = "Chinese",
/** Japanese */
Japanese = "Japanese",
/** Korean */
Korean = "Korean",
/** German */
German = "German",
/** French */
French = "French",
/** Latin script (covers most European languages) */
Latin = "Latin",
/** Cyrillic (Russian and related) */
Cyrillic = "Cyrillic",
/** Traditional Chinese */
TraditionalChinese = "TraditionalChinese",
/** Thai */
Thai = "Thai",
/** Greek */
Greek = "Greek",
/** East Slavic (Russian, Ukrainian, Belarusian) */
EastSlavic = "EastSlavic",
/** Arabic (Arabic, Persian, Urdu) */
Arabic = "Arabic",
/** Devanagari (Hindi, Marathi, Sanskrit, Nepali) */
Devanagari = "Devanagari",
/** Tamil */
Tamil = "Tamil",
/** Telugu */
Telugu = "Telugu",
}
/**
* Configuration for PaddleOCR backend.
*
* Configures PaddleOCR text detection and recognition with multi-language support.
* The builder pattern mentioned upstream belongs to the Rust API; this
* declaration is a plain object shape in which every field is optional.
*/
export interface PaddleOcrConfig {
/** Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra") */
readonly language?: string
/** Optional custom cache directory for model files */
readonly cacheDir?: string
/**
* Enable angle classification for rotated text (default: false).
* Can misfire on short text regions, rotating crops incorrectly before recognition.
*/
readonly useAngleCls?: boolean
/** Enable table structure detection (default: false) */
readonly enableTableDetection?: boolean
/**
* Database threshold for text detection (default: 0.3)
* Range: 0.0-1.0, higher values require more confident detections
*/
readonly detDbThresh?: number
/**
* Box threshold for text bounding box refinement (default: 0.5)
* Range: 0.0-1.0
*/
readonly detDbBoxThresh?: number
/**
* Unclip ratio for expanding text bounding boxes (default: 1.6)
* Controls the expansion of detected text regions
*/
readonly detDbUnclipRatio?: number
/**
* Maximum side length for detection image (default: 960)
* Larger images may be resized to this limit for faster inference
*/
readonly detLimitSideLen?: number
/**
* Batch size for recognition inference (default: 6)
* Number of text regions to process simultaneously
*/
readonly recBatchNum?: number
/**
* Padding in pixels added around the image before detection (default: 10).
* Large values can include surrounding content like table gridlines.
*/
readonly padding?: number
/**
* Minimum recognition confidence score for text lines (default: 0.5).
* Text regions with recognition confidence below this threshold are discarded.
* Matches PaddleOCR Python's `drop_score` parameter.
* Range: 0.0-1.0
*/
readonly dropScore?: number
/**
* Model tier controlling detection/recognition model size and accuracy trade-off.
* - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
* - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
*
* Declared as a plain `string`, so the compiler does not restrict it to
* these two values.
*/
readonly modelTier?: string
}
/**
* Byte offset boundary for a page.
*
* Tracks where a specific page's content starts and ends in the main content string,
* enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
* at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.)
* on the Rust side.
*
* NOTE(review): these are UTF-8 byte offsets. JavaScript strings are indexed
* in UTF-16 code units, so for non-ASCII content the offsets do not match
* string indices — convert (e.g. via an encoded byte buffer) before slicing.
*/
export interface PageBoundary {
/** Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive) */
readonly byteStart: number
/** Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive) */
readonly byteEnd: number
/** Page number (1-indexed) */
readonly pageNumber: number
}
/**
* Page extraction and tracking configuration.
*
* Controls how pages are extracted, tracked, and represented in the extraction results.
* When no page configuration is supplied (`None` in the Rust core), page
* tracking is disabled.
*
* Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
* when page boundaries are available and chunking is configured.
*/
export interface PageConfig {
/** Extract pages as separate array (ExtractionResult.pages) */
readonly extractPages?: boolean
/** Insert page markers in main content string */
readonly insertPageMarkers?: boolean
/**
* Page marker format; the `{page_num}` placeholder stands for the page number.
* Default: "\n\n<!-- PAGE {page_num} -->\n\n"
*/
readonly markerFormat?: string
}
/**
* Content for a single page/slide.
*
* When page extraction is enabled, documents are split into per-page content
* with associated tables and images mapped to each page.
*
* # Performance (Rust core)
*
* The notes below describe the in-memory representation in the Rust core and
* are not observable through this declaration:
* - `Vec<Arc<Table>>` enables zero-copy sharing of table data
* - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
* - Exact JSON compatibility is maintained via custom Serialize/Deserialize
*
* This reduces memory overhead for documents with shared tables/images
* by avoiding redundant copies during serialization.
*/
export interface PageContent {
/** Page number (1-indexed) */
readonly pageNumber: number
/** Text content for this page */
readonly content: string
/**
* Tables found on this page.
*
* Exposed here as a plain array of `Table`; the Rust core keeps them behind
* `Arc` for zero-copy sharing and serializes them as a plain list.
*/
readonly tables: Array<Table>
/**
* Indices into `ExtractionResult.images` for images found on this page.
*
* Each value is a zero-based index into the top-level `images` collection.
* Only populated when `extract_images = true` in the extraction config.
*/
readonly imageIndices: Array<number>
/**
* Hierarchy information for the page (when hierarchy extraction is enabled)
*
* Contains text hierarchy levels (H1-H6) extracted from the page content.
*/
readonly hierarchy?: PageHierarchy
/**
* Whether this page is blank (no meaningful text content)
*
* Determined during extraction based on text content analysis.
* A page is blank if it has fewer than 3 non-whitespace characters
* and contains no tables or images.
*/
readonly isBlank?: boolean
/**
* Layout detection regions for this page (when layout detection is enabled).
*
* Contains detected layout regions with class, confidence, bounding box,
* and area fraction. Only populated when layout detection is configured.
*/
readonly layoutRegions?: Array<LayoutRegion>
/**
* Speaker notes for this slide (PPTX only).
*
* Contains the text from the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
* Only populated when the source is a PPTX file and notes are present.
*/
readonly speakerNotes?: string
/**
* Section name this slide belongs to (PPTX only).
*
* PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
* `ppt/presentation.xml`). Only populated when the source is a PPTX file and
* the slide belongs to a named section.
*/
readonly sectionName?: string
/**
* Sheet name for this page (XLSX/ODS only).
*
* Each spreadsheet sheet maps to one `PageContent` entry. This field carries the
* sheet's display name as it appears in the workbook. Absent for all non-spreadsheet
* formats and for sheets with an empty name.
*/
readonly sheetName?: string
}
/**
* Page hierarchy structure containing heading levels and block information.
*
* Used when PDF text hierarchy extraction is enabled. Contains hierarchical
* blocks with heading levels (H1-H6) for semantic document structure.
*/
export interface PageHierarchy {
/**
* Number of hierarchy blocks on this page.
*
* NOTE(review): presumably equal to `blocks.length` — confirm.
*/
readonly blockCount: number
/** Hierarchical blocks with heading levels */
readonly blocks: Array<HierarchicalBlock>
}
/**
* Metadata for individual page/slide/sheet.
*
* Captures per-page information including dimensions, content counts,
* and visibility state (for presentations).
*/
export interface PageInfo {
/** Page number (1-indexed) */
readonly number: number
/** Page title (usually for presentations) */
readonly title?: string
/**
* Dimensions in points (PDF) or pixels (images): (width, height).
*
* Declared as a general number array rather than a two-element tuple.
*/
readonly dimensions?: Array<number>
/** Number of images on this page */
readonly imageCount?: number
/** Number of tables on this page */
readonly tableCount?: number
/** Whether this page is hidden (e.g., in presentations) */
readonly hidden?: boolean
/**
* Whether this page is blank (no meaningful text, no images, no tables)
*
* A page is considered blank if it has fewer than 3 non-whitespace characters
* and contains no tables or images. This is useful for filtering out empty pages
* in scanned documents or PDFs with blank separator pages.
*/
readonly isBlank?: boolean
/**
* Whether this page contains non-trivial vector graphics (paths, shapes, curves)
*
* Indicates the presence of vector-drawn content such as charts, diagrams,
* or geometric shapes (e.g., from Adobe InDesign, LaTeX TikZ). These are
* invisible to `ExtractionResult.images` since they are not embedded as raster
* XObjects. Set to `true` when path count exceeds a heuristic threshold,
* signaling that downstream consumers may want to rasterize the page to
* capture this content.
*
* Upstream states this is only populated for PDFs and is `None` for other
* document types.
*
* NOTE(review): unlike the other optional fields here, this one is declared
* as a required `boolean`, which contradicts the "`None` for other document
* types" statement — confirm whether non-PDF pages yield `false` or whether
* the field should be optional.
*/
readonly hasVectorGraphics: boolean
}
/**
* Unified page structure for documents.
*
* Supports different page types (PDF pages, PPTX slides, Excel sheets)
* with offset boundaries for chunk-to-page mapping.
*/
export interface PageStructure {
/** Total number of pages/slides/sheets */
readonly totalCount: number
/** Type of paginated unit */
readonly unitType: PageUnitType
/**
* Offset boundaries for each page
*
* Maps ranges in the extracted content to page numbers.
* Used for chunk page range calculation.
*
* NOTE(review): upstream calls these "character" offsets, but the entries
* are `PageBoundary` values whose fields are byte offsets
* (`byteStart`/`byteEnd`) — treat them as bytes unless confirmed otherwise.
*/
readonly boundaries?: Array<PageBoundary>
/** Detailed per-page metadata (optional, only when needed) */
readonly pages?: Array<PageInfo>
}
/**
* Type of paginated unit in a document.
*
* Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
* Each value is the lowercase unit name.
*/
export declare enum PageUnitType {
/** Standard document pages (PDF, DOCX, images) */
Page = "page",
/** Presentation slides (PPTX, ODP) */
Slide = "slide",
/** Spreadsheet sheets (XLSX, ODS) */
Sheet = "sheet",
}
/** A PDF annotation extracted from a document page. */
export interface PdfAnnotation {
/** The type of annotation. */
readonly annotationType: PdfAnnotationType
/** Text content of the annotation (e.g., comment text, link URL); may be absent. */
readonly content?: string
/** Page number where the annotation appears (1-indexed). */
readonly pageNumber: number
/** Bounding box of the annotation on the page; may be absent. */
readonly boundingBox?: BoundingBox
}
/**
* Type of PDF annotation.
*
* Values are snake_case strings (note `StrikeOut` is "strike_out").
*/
export declare enum PdfAnnotationType {
/** Sticky note / text annotation */
Text = "text",
/** Highlighted text region */
Highlight = "highlight",
/** Hyperlink annotation */
Link = "link",
/** Rubber stamp annotation */
Stamp = "stamp",
/** Underline text markup */
Underline = "underline",
/** Strikeout text markup */
StrikeOut = "strike_out",
/** Any other annotation type */
Other = "other",
}
/**
* PDF-specific configuration.
*
* Every field is optional.
*/
export interface PdfConfig {
/** Extract images from PDF */
readonly extractImages?: boolean
/**
* Extract tables from PDF.
*
* When `true` (default), runs pdf_oxide's native grid detector and, if it
* finds nothing, falls back to the heuristic text-layer reconstruction in
* `pdf::oxide::table::extract_tables_heuristic`. Set to `false` to skip
* both passes — `tables` will then be empty in the result.
*/
readonly extractTables?: boolean
/** List of passwords to try when opening encrypted PDFs */
readonly passwords?: Array<string>
/** Extract PDF metadata */
readonly extractMetadata?: boolean
/** Hierarchy extraction configuration (omitted = hierarchy extraction disabled) */
readonly hierarchy?: HierarchyConfig
/**
* Extract PDF annotations (text notes, highlights, links, stamps).
* Default: false
*/
readonly extractAnnotations?: boolean
/**
* Top margin fraction (0.0-1.0) of page height to exclude headers/running heads.
* Default: 0.06 (6%)
*/
readonly topMarginFraction?: number
/**
* Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers.
* Default: 0.05 (5%)
*/
readonly bottomMarginFraction?: number
/**
* Allow single-column pseudo tables in extraction results.
*
* By default, tables with fewer than 2 columns (layout-guided) or 3 columns
* (heuristic) are rejected. When `true`, the minimum column count is relaxed
* to 1, allowing single-column structured data (glossaries, itemized lists)
* to be emitted as tables. Other quality filters (density, sparsity, prose
* detection) still apply.
*/
readonly allowSingleColumnTables?: boolean
/**
* Perform OCR on inline images extracted from PDF pages and attach the
* recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
* to be available; if `ExtractionConfig.ocr` is not set the extractor
* falls back to `TesseractConfig::default()`. Per-image failures degrade
* gracefully (the image is returned without OCR text rather than failing
* the whole extraction). Default: `false`.
*/
readonly ocrInlineImages?: boolean
}
/**
* PDF-specific metadata.
*
* Contains metadata fields specific to PDF documents that are not in the common
* `Metadata` structure. Common fields like title, authors, keywords, and dates
* are at the `Metadata` level. Every field is optional.
*/
export interface PdfMetadata {
/** PDF version (e.g., "1.7", "2.0") */
readonly pdfVersion?: string
/** PDF producer (application that created the PDF) */
readonly producer?: string
/** Whether the PDF is encrypted/password-protected */
readonly isEncrypted?: boolean
/** First page width in points (1/72 inch) */
readonly width?: number
/** First page height in points (1/72 inch) */
readonly height?: number
/** Total number of pages in the PDF document */
readonly pageCount?: number
}
/**
* Base trait that all plugins must implement.
*
* This trait provides common functionality for plugin lifecycle management,
* identification, and metadata. Only `name` is required by this declaration;
* all other methods are optional.
*
* # Thread Safety
*
* All plugins must be `Send + Sync` to support concurrent usage across threads.
* (This and the other thread-safety notes below are carried over from the
* Rust trait documentation.)
*/
export interface Plugin {
/**
* Returns the unique name/identifier for this plugin.
*
* The name should be:
* - Unique across all plugins
* - Lowercase with hyphens (e.g., "my-custom-plugin")
* - URL-safe characters only
*/
name(): string
/**
* Returns the semantic version of this plugin.
*
* Should follow semver format: `MAJOR.MINOR.PATCH`
*/
version?(): string
/**
* Initialize the plugin.
*
* Called once when the plugin is registered. Use this to:
* - Load configuration
* - Initialize resources (connections, caches, etc.)
* - Validate dependencies
*
* # Thread Safety
*
* In the Rust trait this method takes `&self` instead of `&mut self` to work
* with `Arc<dyn Plugin>`. Plugins needing mutable state during initialization
* should use interior mutability patterns (Mutex, RwLock, OnceCell, etc.).
* @throws Should signal an error if initialization fails. The plugin will not be
* registered if this method fails.
*/
initialize?(): void
/**
* Shutdown the plugin.
*
* Called when the plugin is being unregistered or the application is shutting down.
* Use this to:
* - Close connections
* - Flush caches
* - Release resources
*
* # Thread Safety
*
* In the Rust trait this method takes `&self` instead of `&mut self` to work
* with `Arc<dyn Plugin>`. Plugins needing mutable state during shutdown
* should use interior mutability patterns (Mutex, RwLock, etc.).
* @throws Errors during shutdown are logged but don't prevent the shutdown process.
*/
shutdown?(): void
/**
* Optional plugin description for debugging and logging.
*
* Defaults to empty string if not overridden.
*/
description?(): string
/**
* Optional plugin author information.
*
* Defaults to empty string if not overridden.
*/
author?(): string
}
/**
* Trait for post-processor plugins.
*
* Post-processors transform or enrich extraction results after the initial
* extraction is complete. They can:
* - Clean and normalize text
* - Add metadata (language, keywords, entities)
* - Split content into chunks
* - Score quality
* - Apply custom transformations
*
* # Processing Order
*
* Post-processors are executed in stage order:
* 1. **Early** - Language detection, entity extraction
* 2. **Middle** - Keyword extraction, token reduction
* 3. **Late** - Custom hooks, final validation
*
* Within each stage, processors are executed in registration order.
*
* # Error Handling
*
* Post-processor errors are non-fatal by default - they're captured in metadata
* and execution continues. To make errors fatal, return an error from `process()`.
*
* # Thread Safety
*
* Post-processors must be thread-safe (`Send + Sync`).
*
* NOTE(review): this documentation is carried over from the Rust trait.
* Several members below declare a `string` return type where the documented
* contract is a boolean or a number — confirm the values the native bridge
* actually expects before implementing them.
*/
export interface PostProcessor {
/**
* Name of this post-processor.
*
* NOTE(review): undocumented upstream — presumably the same contract as
* `Plugin.name` (unique identifier); this interface does not extend
* `Plugin`, so confirm.
*/
name(): string
/**
* Process an extraction result.
*
* Transform or enrich the extraction result. Can modify:
* - `content` - The extracted text
* - `metadata` - Add or update metadata fields
* - `tables` - Modify or enhance table data
* @param result - The extraction result to process (a mutable reference in the Rust trait)
*
* @param config - Extraction configuration
*
* @returns A promise that resolves when processing succeeded (`Ok(())` in Rust) and rejects for fatal failures.
*
* @throws Return errors for fatal processing failures. Non-fatal errors should be
* captured in metadata directly on the result.
*
* NOTE(review): both parameters are declared optional and nullable, and the
* return type is `Promise<void>`, so no replacement result can be returned.
* Whether mutations made to `result` in JavaScript are observed by the native
* pipeline is not visible in this declaration — confirm.
*
* # Performance
*
* The Rust signature avoids unnecessary cloning of large extraction results by
* taking a mutable reference instead of ownership. Processors modify the
* result in place.
*
* # Example - Language Detection (Rust)
*
* ```rust
* async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
*     -> Result<()> {
*     // Detect language (simplified - use real detection library in practice)
*     let language = "en"; // Placeholder detection
*
*     // Add to metadata
*     result.metadata.additional.insert("detected_language".to_string().into(), serde_json::json!(language));
*
*     Ok(())
* }
* ```
*
* # Example - Text Cleaning (Rust)
*
* ```rust
* async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
*     -> Result<()> {
*     // Remove excessive whitespace
*     result.content = result
*         .content
*         .split_whitespace()
*         .collect::<Vec<_>>()
*         .join(" ");
*
*     Ok(())
* }
* ```
*/
process(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): Promise<void>
/**
* Get the processing stage for this post-processor.
*
* Determines when this processor runs in the pipeline.
* @returns The `ProcessingStage` (Early, Middle, or Late).
*
* NOTE(review): declared as `string`, not as the `ProcessingStage` enum —
* presumably one of that enum's values ("Early", "Middle", "Late"); confirm.
*/
processingStage(): string
/**
* Optional: Check if this processor should run for a given result.
*
* Allows conditional processing based on MIME type, metadata, or content.
* Defaults to `true` (always run).
* @param result - The extraction result to check
*
* @param config - Extraction configuration
*
* @returns `true` if the processor should run, `false` to skip.
*
* NOTE(review): the declared return type is `string` although the contract
* is a boolean — confirm the expected encoding with the native bridge.
*/
shouldProcess?(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): string
/**
* Optional: Estimate processing time in milliseconds.
*
* Used for logging and debugging. Defaults to 0 (unknown).
* @param result - The extraction result to estimate for
*
* @returns Estimated processing time in milliseconds.
*
* NOTE(review): the declared return type is `string` although the contract
* is a number of milliseconds — confirm the expected encoding.
*/
estimatedDurationMs?(result?: ExtractionResult | undefined | null): string
/**
* Execution priority within the processing stage.
*
* Higher values run first within the same `ProcessingStage`. Defaults to 50.
* Use 0-49 for fallback processors, 50 for normal processors, and 51-255
* for high-priority processors that should run early in their stage.
*
* NOTE(review): the declared return type is `string` although the contract
* is a number in 0-255 — confirm the expected encoding.
*/
priority?(): string
}
/**
* Post-processor configuration.
*
* Every field is optional.
*/
export interface PostProcessorConfig {
/** Enable post-processors */
readonly enabled?: boolean
/** Whitelist of processor names to run (omitted = all enabled) */
readonly enabledProcessors?: Array<string>
/** Blacklist of processor names to skip (omitted = none disabled) */
readonly disabledProcessors?: Array<string>
/**
* Pre-computed AHashSet for O(1) enabled processor lookup
*
* NOTE(review): presumably derived from `enabledProcessors` inside the Rust
* core and surfaced here as an array; callers likely do not need to set
* it — confirm.
*/
readonly enabledSet?: Array<string>
/**
* Pre-computed AHashSet for O(1) disabled processor lookup
*
* NOTE(review): presumably derived from `disabledProcessors` inside the Rust
* core and surfaced here as an array; callers likely do not need to set
* it — confirm.
*/
readonly disabledSet?: Array<string>
}
/**
* Application properties from docProps/app.xml for PPTX
*
* Contains PowerPoint-specific document metadata. Every field is optional.
*/
export interface PptxAppProperties {
/** Application name (e.g., "Microsoft Office PowerPoint") */
readonly application?: string
/** Application version */
readonly appVersion?: string
/** Total editing time in minutes */
readonly totalTime?: number
/** Company name */
readonly company?: string
/** Document security level */
readonly docSecurity?: number
/** Scale crop flag */
readonly scaleCrop?: boolean
/** Links up to date flag */
readonly linksUpToDate?: boolean
/** Shared document flag */
readonly sharedDoc?: boolean
/** Hyperlinks changed flag */
readonly hyperlinksChanged?: boolean
/** Number of slides */
readonly slides?: number
/** Number of notes */
readonly notes?: number
/** Number of hidden slides */
readonly hiddenSlides?: number
/** Number of multimedia clips */
readonly multimediaClips?: number
/** Presentation format (e.g., "Widescreen", "Standard") */
readonly presentationFormat?: string
/** Slide titles */
readonly slideTitles?: Array<string>
}
/**
* PowerPoint (PPTX) extraction result.
*
* Contains extracted slide content, metadata, and embedded images/tables.
*/
export interface PptxExtractionResult {
/** Extracted text content from all slides */
readonly content: string
/** Presentation metadata */
readonly metadata: PptxMetadata
/** Total number of slides */
readonly slideCount: number
/** Total number of embedded images */
readonly imageCount: number
/** Total number of tables */
readonly tableCount: number
/** Extracted images from the presentation */
readonly images: Array<ExtractedImage>
/** Slide structure with boundaries (when page tracking is enabled) */
readonly pageStructure?: PageStructure
/** Per-slide content (when page tracking is enabled) */
readonly pageContents?: Array<PageContent>
/** Structured document representation */
readonly document?: DocumentStructure
/**
* Hyperlinks discovered in slides as (url, optional_label) pairs.
*
* NOTE(review): declared as a flat `Array<string>`, which cannot express
* (url, label) pairs — how the pairs are encoded in the strings is not
* visible in this declaration; confirm against the native binding.
*/
readonly hyperlinks: Array<string>
/**
* Office metadata extracted from docProps/core.xml and docProps/app.xml.
*
* Contains keys like "title", "author", "created_by", "subject", "keywords",
* "modified_by", "created_at", "modified_at", etc.
*/
readonly officeMetadata: Record<string, string>
/**
* Slide comments as revisions.
*
* Each `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
* `DocumentRevision { kind: Comment }` with author (resolved from
* `ppt/commentAuthors.xml`), ISO-8601 timestamp, and
* `RevisionAnchor::Slide { index }`. Absent when no comment XML parts exist.
*/
readonly revisions?: Array<DocumentRevision>
}
/**
* PowerPoint presentation metadata.
*
* Extracted from PPTX files containing slide counts and presentation details.
* Every field is optional.
*/
export interface PptxMetadata {
/** Total number of slides in the presentation */
readonly slideCount?: number
/** Names of slides (if available) */
readonly slideNames?: Array<string>
/** Number of embedded images */
readonly imageCount?: number
/** Number of tables */
readonly tableCount?: number
}
/**
* Processing stages for post-processors.
*
* Post-processors are executed in stage order (Early → Middle → Late).
* Use stages to control the order of post-processing operations.
* Each value is the member name itself (capitalized).
*/
export declare enum ProcessingStage {
/**
* Early stage - foundational processing.
*
* Use for:
* - Language detection
* - Character encoding normalization
* - Entity extraction (NER)
* - Text quality scoring
*/
Early = "Early",
/**
* Middle stage - content transformation.
*
* Use for:
* - Keyword extraction
* - Token reduction
* - Text summarization
* - Semantic analysis
*/
Middle = "Middle",
/**
* Late stage - final enrichment.
*
* Use for:
* - Custom user hooks
* - Analytics/logging
* - Final validation
* - Output formatting
*/
Late = "Late",
}
/**
* A non-fatal warning from a processing pipeline stage.
*
* Captures errors from optional features that don't prevent extraction
* but may indicate degraded results.
*/
export interface ProcessingWarning {
/**
* The pipeline stage or feature that produced this warning
* (e.g., "embedding", "chunking", "language_detection", "output_format").
* Declared as a free-form string rather than an enum.
*/
readonly source: string
/** Human-readable description of what went wrong. */
readonly message: string
}
/**
* Page Segmentation Mode for Tesseract OCR.
*
* Each value is the member name itself. The member descriptions below are
* inferred from the names' correspondence to Tesseract's `PSM_*` constants —
* NOTE(review): confirm against the Rust definition.
*/
export declare enum PSMMode {
/** Presumably `PSM_OSD_ONLY`: orientation and script detection only. */
OsdOnly = "OsdOnly",
/** Presumably `PSM_AUTO_OSD`: automatic page segmentation with orientation and script detection. */
AutoOsd = "AutoOsd",
/** Presumably `PSM_AUTO_ONLY`: automatic page segmentation, but no OSD or OCR. */
AutoOnly = "AutoOnly",
/** Presumably `PSM_AUTO`: fully automatic page segmentation, but no OSD. */
Auto = "Auto",
/** Presumably `PSM_SINGLE_COLUMN`: a single column of text of variable sizes. */
SingleColumn = "SingleColumn",
/** Presumably `PSM_SINGLE_BLOCK_VERT_TEXT`: a single uniform block of vertically aligned text. */
SingleBlockVertical = "SingleBlockVertical",
/** Presumably `PSM_SINGLE_BLOCK`: a single uniform block of text. */
SingleBlock = "SingleBlock",
/** Presumably `PSM_SINGLE_LINE`: a single text line. */
SingleLine = "SingleLine",
/** Presumably `PSM_SINGLE_WORD`: a single word. */
SingleWord = "SingleWord",
/** Presumably `PSM_CIRCLE_WORD`: a single word in a circle. */
CircleWord = "CircleWord",
/** Presumably `PSM_SINGLE_CHAR`: a single character. */
SingleChar = "SingleChar",
}
/** Outlook PST archive metadata. */
export interface PstMetadata {
/**
* Message count.
*
* NOTE(review): undocumented upstream — presumably the number of messages
* found in the archive; confirm.
*/
readonly messageCount?: number
}
/**
* RAKE-specific parameters.
*
* Both fields are optional; the documented defaults apply when omitted.
*/
export interface RakeParams {
/** Minimum word length to consider (default: 1). */
readonly minWordLength?: number
/** Maximum words in a keyword phrase (default: 3). */
readonly maxWordsPerPhrase?: number
}
/**
* Pre-computed table markdown for a table detection region.
*
* Produced by the TATR-based table structure recognizer and surfaced as part of
* layout-aware OCR results. In the Rust core the struct lives under
* `layout-types` (pure-Rust) so that consumers who do not enable
* `layout-detection` (ORT) can still reference the type in their own code.
*/
export interface RecognizedTable {
/** Detection bbox that this table corresponds to (for matching). */
readonly detectionBbox: BBox
/** Table cells as a 2D array (rows × columns): the outer array holds rows, each inner array holds that row's cell texts. */
readonly cells: Array<Array<string>>
/** Rendered markdown table. */
readonly markdown: string
}
/**
 * Token reduction level, used as the type of `TokenReductionConfig.level`.
 *
 * NOTE(review): members are presumably ordered from no reduction (`Off`) to
 * the strongest reduction (`Maximum`); the exact effect of each level is not
 * described in this declaration file.
 */
export declare enum ReductionLevel {
  Off = "Off",
  Light = "Light",
  Moderate = "Moderate",
  Aggressive = "Aggressive",
  Maximum = "Maximum",
}
/**
 * Semantic kind of a relationship between document elements.
 *
 * Each description reads "source -> target". The string values are snake_case.
 */
export declare enum RelationshipKind {
  /** Footnote marker -> footnote definition. */
  FootnoteReference = "footnote_reference",
  /** Citation marker -> bibliography entry. */
  CitationReference = "citation_reference",
  /** Internal anchor link (`#id`) -> target heading/element. */
  InternalLink = "internal_link",
  /** Caption paragraph -> figure/table it describes. */
  Caption = "caption",
  /** Label -> labeled element (HTML `<label for>`, LaTeX `\label{}`). */
  Label = "label",
  /** TOC entry -> target section. */
  TocEntry = "toc_entry",
  /** Cross-reference (LaTeX `\ref{}`, DOCX cross-reference field). */
  CrossReference = "cross_reference",
}
/**
 * Contract for document renderers that convert an [`InternalDocument`] to an
 * output string.
 *
 * Renderers are typically stateless converters that transform the internal
 * document representation into a specific output format (Markdown, HTML,
 * Djot, plain text, etc.). They participate in the standard [`Plugin`]
 * lifecycle so custom renderers can be registered from any supported binding
 * language.
 *
 * The format name is exposed via [`Plugin::name`]. For stateless renderers
 * the [`Plugin`] lifecycle methods (`version`, `initialize`, `shutdown`) all
 * take no-op defaults and need not be overridden.
 *
 * # Thread Safety
 *
 * On the Rust side, renderers must be `Send + Sync` (inherited from [`Plugin`]).
 */
export interface Renderer {
  /** Name of this renderer; per the description above, this is the output format name. */
  name(): string
  /**
   * Render an [`InternalDocument`] to the output format.
   * @param doc - The internal document to render
   *
   * @returns The rendered output as a string.
   *
   * @throws If rendering fails.
   */
  render(doc: InternalDocument): string
}
/**
 * Result-shape selection for extraction results.
 *
 * Distinct from `OutputFormat`, which controls rendering (Plain, Markdown,
 * HTML, etc.). `ResultFormat` controls the *shape* of the result: a unified
 * content blob vs. an element-based decomposition.
 */
export declare enum ResultFormat {
  /** Unified format with all content in the `content` field. */
  Unified = "unified",
  /** Element-based format with semantic element extraction. */
  ElementBased = "element_based",
}
/**
 * Best-effort document location for a revision.
 *
 * A discriminated union tagged by `type`; narrow on `type` before reading the
 * variant-specific fields.
 *
 * NOTE(review): whether the `index` / `row` / `col` values are zero- or
 * one-based is not stated here — confirm against the Rust source.
 */
export type RevisionAnchor =
  | { type: 'paragraph'; index: number }
  | { type: 'table_cell'; row: number; col: number; tableIndex: number }
  | { type: 'page'; index: number }
  | { type: 'slide'; index: number }
  | { type: 'sheet'; index: number; name: string }
/**
 * The content changes that make up a single revision.
 *
 * For insertions and deletions, `content` carries the added/removed lines as
 * `DiffLine::Added` / `DiffLine::Removed` entries. For format changes,
 * `content` is empty: the property diff is not populated yet (left as a TODO
 * for a later enrichment pass).
 */
export interface RevisionDelta {
  /** Line-level content changes for this revision. */
  readonly content?: Array<DiffLine>
  /** Cell-level table changes for this revision. */
  readonly tableChanges?: Array<CellChange>
}
/** Semantic classification of a tracked change. String values are snake_case. */
export declare enum RevisionKind {
  /** Text or content was inserted. */
  Insertion = "insertion",
  /** Text or content was deleted. */
  Deletion = "deletion",
  /** Run-level formatting (font, size, colour, …) was changed. */
  FormatChange = "format_change",
  /** A reviewer comment or annotation. */
  Comment = "comment",
}
/**
 * Configuration for security limits across extractors.
 *
 * All limits are intentionally conservative to prevent DoS attacks
 * while still supporting legitimate documents.
 *
 * NOTE(review): the values in parentheses below are presumably the defaults
 * applied when a field is omitted — confirm against the Rust source.
 */
export interface SecurityLimits {
  /** Maximum uncompressed size for archives (500 MB) */
  readonly maxArchiveSize?: number
  /** Maximum compression ratio before flagging as potential bomb (100:1) */
  readonly maxCompressionRatio?: number
  /** Maximum number of files in archive (10,000) */
  readonly maxFilesInArchive?: number
  /** Maximum nesting depth for structures (100) */
  readonly maxNestingDepth?: number
  /**
   * Maximum length of any single XML entity / attribute / token (1 MiB).
   * This is a per-token cap, NOT a total cap — billion-laughs class
   * attacks where a single entity expands to hundreds of MB are caught
   * here, while normal long text content (a paragraph, a CDATA block) is
   * caught by `maxContentSize` instead.
   */
  readonly maxEntityLength?: number
  /** Maximum string growth per document (100 MB) */
  readonly maxContentSize?: number
  /** Maximum iterations per operation */
  readonly maxIterations?: number
  /** Maximum XML depth (100 levels) */
  readonly maxXmlDepth?: number
  /** Maximum cells per table (100,000) */
  readonly maxTableCells?: number
}
/**
 * API server configuration.
 *
 * Holds all configuration options for the Kreuzberg API server,
 * including host/port settings, CORS configuration, and upload limits.
 *
 * Defaults:
 *
 * - `host`: "127.0.0.1" (localhost only)
 * - `port`: 8000
 * - `corsOrigins`: empty array (allows all origins)
 * - `maxRequestBodyBytes`: 104_857_600 (100 MB)
 * - `maxMultipartFieldBytes`: 104_857_600 (100 MB)
 */
export interface ServerConfig {
  /** Server host address (e.g., "127.0.0.1", "0.0.0.0") */
  readonly host?: string
  /** Server port number */
  readonly port?: number
  /**
   * CORS allowed origins. An empty array means allow all origins.
   *
   * If this is an empty array, the server will accept requests from any origin.
   * If populated with specific origins (e.g., `"https://example.com"`), only
   * those origins will be allowed.
   */
  readonly corsOrigins?: Array<string>
  /** Maximum size of request body in bytes (default: 100 MB) */
  readonly maxRequestBodyBytes?: number
  /** Maximum size of multipart fields in bytes (default: 100 MB) */
  readonly maxMultipartFieldBytes?: number
}
/** A block of structured data (Schema.org, microdata, RDFa). */
export interface StructuredData {
  /** Type of structured data */
  readonly dataType: StructuredDataType
  /** Raw JSON string representation (not parsed) */
  readonly rawJson: string
  /** Schema type if detectable (e.g., "Article", "Event", "Product") */
  readonly schemaType?: string
}
/**
 * Result of extracting a structured-data document.
 *
 * NOTE(review): this type is undocumented in the generator output; the field
 * descriptions below are inferred from the field names and types only —
 * confirm against the Rust source.
 */
export interface StructuredDataResult {
  /** Presumably the extracted text content. */
  readonly content: string
  /** Presumably the name of the source format. */
  readonly format: string
  /** String-to-string metadata map. */
  readonly metadata: Record<string, string>
  /** List of strings; presumably the text-bearing fields that were found. */
  readonly textFields: Array<string>
}
/** Structured data type classification. String values are lowercase. */
export declare enum StructuredDataType {
  /** JSON-LD structured data */
  JsonLd = "json-ld",
  /** Microdata */
  Microdata = "microdata",
  /** RDFa */
  RDFa = "rdfa",
}
/**
 * Configuration for LLM-based structured data extraction.
 *
 * Sends extracted document content to a VLM with a JSON schema,
 * returning structured data that conforms to the schema.
 * @example TOML configuration
 * ```toml
 * [structured_extraction]
 * schema_name = "invoice_data"
 * strict = true
 *
 * [structured_extraction.schema]
 * type = "object"
 * properties.vendor = { type = "string" }
 * properties.total = { type = "number" }
 * required = ["vendor", "total"]
 *
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * ```
 */
export interface StructuredExtractionConfig {
  /** JSON Schema defining the desired output structure. */
  readonly schema: JsonValue
  /** Schema name passed to the LLM's structured output mode. */
  readonly schemaName: string
  /** Optional schema description for the LLM. */
  readonly schemaDescription?: string
  /** Enable strict mode — output must exactly match the schema. */
  readonly strict: boolean
  /**
   * Custom Jinja2 extraction prompt template. When omitted, a default template is used.
   *
   * Available template variables:
   * - `{{ content }}` — The extracted document text.
   * - `{{ schema }}` — The JSON schema as a formatted string.
   * - `{{ schema_name }}` — The schema name.
   * - `{{ schema_description }}` — The schema description (may be empty).
   */
  readonly prompt?: string
  /** LLM configuration for the extraction. */
  readonly llm: LlmConfig
}
/**
 * A supported document format entry.
 *
 * Pairs a file extension with the corresponding MIME type that Kreuzberg can process.
 */
export interface SupportedFormat {
  /** File extension (without leading dot), e.g., "pdf", "docx" */
  readonly extension: string
  /** MIME type string, e.g., "application/pdf" */
  readonly mimeType: string
}
/**
 * Extracted table structure.
 *
 * Represents a table detected and extracted from a document (PDF, image, etc.).
 * Tables are exposed both as structured cell data and as Markdown.
 */
export interface Table {
  /** Table cells as a 2D array (rows × columns) */
  readonly cells?: Array<Array<string>>
  /** Markdown representation of the table */
  readonly markdown?: string
  /** Page number where the table was found (1-indexed) */
  readonly pageNumber?: number
  /**
   * Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
   * Only populated for PDF-extracted tables when position data is available.
   */
  readonly boundingBox?: BoundingBox
}
/**
 * Individual table cell with content and optional styling.
 *
 * Described by the generator as a future extension point for rich table
 * support with cell-level metadata.
 */
export interface TableCell {
  /** Cell content as text */
  readonly content?: string
  /** Row span (number of rows this cell spans) */
  readonly rowSpan?: number
  /** Column span (number of columns this cell spans) */
  readonly colSpan?: number
  /** Whether this is a header cell */
  readonly isHeader?: boolean
}
/** Cell-level changes for a pair of tables that share the same index. */
export interface TableDiff {
  /** Zero-based index of the table in both `a.tables` and `b.tables`. */
  readonly fromIndex: number
  /** Zero-based index in `b.tables` (equal to `fromIndex` for same-dimension tables). */
  readonly toIndex: number
  /** Cell-level changes within the table. */
  readonly cellChanges: Array<CellChange>
}
/**
 * Structured table grid with cell-level metadata.
 *
 * Stores the row/column dimensions and a flat (not nested) list of cells,
 * each carrying its own position info.
 */
export interface TableGrid {
  /** Number of rows in the table. */
  readonly rows?: number
  /** Number of columns in the table. */
  readonly cols?: number
  /** All cells in row-major order. */
  readonly cells?: Array<GridCell>
}
/**
 * Which table structure recognition model to use.
 *
 * Controls the model used for table cell detection within layout-detected
 * table regions. The wire format is snake_case in all serializers (JSON, TOML,
 * YAML), matching the string values below.
 */
export declare enum TableModel {
  /** TATR (Table Transformer) -- default, 30MB, DETR-based row/column detection. */
  Tatr = "tatr",
  /** SLANeXT wired variant -- 365MB, optimized for bordered tables. */
  SlanetWired = "slanet_wired",
  /** SLANeXT wireless variant -- 365MB, optimized for borderless tables. */
  SlanetWireless = "slanet_wireless",
  /** SLANet-plus -- 7.78MB, lightweight general-purpose. */
  SlanetPlus = "slanet_plus",
  /**
   * Classifier-routed SLANeXT: auto-select wired/wireless per table.
   * Uses PP-LCNet classifier (6.78MB) + both SLANeXT variants (730MB total).
   */
  SlanetAuto = "slanet_auto",
  /** Disable table structure model inference entirely; use heuristic path only. */
  Disabled = "disabled",
}
/**
 * Tesseract OCR configuration.
 *
 * Provides fine-grained control over Tesseract OCR engine parameters.
 * Most users can use the defaults, but these settings allow optimization
 * for specific document types (invoices, handwriting, etc.).
 * Every field is optional.
 */
export interface TesseractConfig {
  /** Language code (e.g., "eng", "deu", "fra") */
  readonly language?: string
  /**
   * Page Segmentation Mode (0-13).
   *
   * This is the numeric mode, not a `PSMMode` enum value.
   *
   * Common values:
   * - 3: Fully automatic page segmentation (native default)
   * - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
   * - 11: Sparse text with no particular order
   */
  readonly psm?: number
  /** Output format ("text" or "markdown") */
  readonly outputFormat?: string
  /**
   * OCR Engine Mode (0-3).
   *
   * - 0: Legacy engine only
   * - 1: Neural nets (LSTM) only (usually best)
   * - 2: Legacy + LSTM
   * - 3: Default (based on what's available)
   */
  readonly oem?: number
  /**
   * Minimum confidence threshold (0.0-100.0).
   *
   * Words with confidence below this threshold may be rejected or flagged.
   */
  readonly minConfidence?: number
  /**
   * Image preprocessing configuration.
   *
   * Controls how images are preprocessed before OCR. Can significantly
   * improve quality for scanned documents or low-quality images.
   */
  readonly preprocessing?: ImagePreprocessingConfig
  /** Enable automatic table detection and reconstruction */
  readonly enableTableDetection?: boolean
  /** Minimum confidence threshold for table detection (0.0-1.0) */
  readonly tableMinConfidence?: number
  /** Column threshold for table detection (pixels) */
  readonly tableColumnThreshold?: number
  /** Row threshold ratio for table detection (0.0-1.0) */
  readonly tableRowThresholdRatio?: number
  /** Enable OCR result caching */
  readonly useCache?: boolean
  /** Use pre-adapted templates for character classification */
  readonly classifyUsePreAdaptedTemplates?: boolean
  /** Enable N-gram language model */
  readonly languageModelNgramOn?: boolean
  /** Don't reject good words during block-level processing */
  readonly tesseditDontBlkrejGoodWds?: boolean
  /** Don't reject good words during row-level processing */
  readonly tesseditDontRowrejGoodWds?: boolean
  /** Enable dictionary correction */
  readonly tesseditEnableDictCorrection?: boolean
  /** Whitelist of allowed characters (empty = all allowed) */
  readonly tesseditCharWhitelist?: string
  /** Blacklist of forbidden characters (empty = none forbidden) */
  readonly tesseditCharBlacklist?: string
  /** Use primary language params model */
  readonly tesseditUsePrimaryParamsModel?: boolean
  /** Variable-width space detection */
  readonly textordSpaceSizeIsVariable?: boolean
  /** Use adaptive thresholding method (a boolean switch, despite the field name) */
  readonly thresholdingMethod?: boolean
}
/**
 * Inline text annotation — byte-range based formatting and links.
 *
 * Annotations reference byte offsets (not character or UTF-16 code-unit
 * indices) into the node's text content, forming the half-open range
 * [`start`, `end`).
 */
export interface TextAnnotation {
  /** Start byte offset in the node's text content (inclusive). */
  readonly start: number
  /** End byte offset in the node's text content (exclusive). */
  readonly end: number
  /** Annotation type. */
  readonly kind: AnnotationKind
}
/** Text direction enumeration for HTML documents. */
export declare enum TextDirection {
  /** Left-to-right text direction */
  LeftToRight = "ltr",
  /** Right-to-left text direction */
  RightToLeft = "rtl",
  /** Automatic text direction detection */
  Auto = "auto",
}
/**
 * Plain text and Markdown extraction result.
 *
 * Contains the extracted text along with statistics and,
 * for Markdown files, structural elements like headers and links.
 *
 * The "tuple" fields below are typed as arrays of string arrays;
 * presumably each inner array has exactly two entries — TODO confirm.
 */
export interface TextExtractionResult {
  /** Extracted text content */
  readonly content: string
  /** Number of lines */
  readonly lineCount: number
  /** Number of words */
  readonly wordCount: number
  /** Number of characters */
  readonly characterCount: number
  /** Markdown headers (text only, Markdown files only) */
  readonly headers?: Array<string>
  /** Markdown links as (text, URL) tuples (Markdown files only) */
  readonly links?: Array<Array<string>>
  /** Code blocks as (language, code) tuples (Markdown files only) */
  readonly codeBlocks?: Array<Array<string>>
}
/**
 * Text/Markdown metadata.
 *
 * Extracted from plain text and Markdown files. Includes word counts and,
 * for Markdown, structural elements like headers and links.
 *
 * The "tuple" fields below are typed as arrays of string arrays;
 * presumably each inner array has exactly two entries — TODO confirm.
 */
export interface TextMetadata {
  /** Number of lines in the document */
  readonly lineCount?: number
  /** Number of words */
  readonly wordCount?: number
  /** Number of characters */
  readonly characterCount?: number
  /** Markdown headers (headings text only, for Markdown files) */
  readonly headers?: Array<string>
  /** Markdown links as (text, url) tuples (for Markdown files) */
  readonly links?: Array<Array<string>>
  /** Code blocks as (language, code) tuples (for Markdown files) */
  readonly codeBlocks?: Array<Array<string>>
}
/**
 * Options controlling token reduction. Every field is optional.
 *
 * NOTE(review): this type is undocumented in the generator output. Only
 * `level` is described below, because its type is visible here; the meaning,
 * units and defaults of the remaining fields should be confirmed against the
 * Rust source before documenting them.
 */
export interface TokenReductionConfig {
  /** Reduction level (see `ReductionLevel`). */
  readonly level?: ReductionLevel
  readonly languageHint?: string
  readonly preserveMarkdown?: boolean
  readonly preserveCode?: boolean
  readonly semanticThreshold?: number
  readonly enableParallel?: boolean
  readonly useSimd?: boolean
  /** Map from a string key (presumably a language code — TODO confirm) to a list of stopwords. */
  readonly customStopwords?: Record<string, Array<string>>
  readonly preservePatterns?: Array<string>
  readonly targetReduction?: number
  readonly enableSemanticClustering?: boolean
}
/** Token reduction configuration. Both fields are optional. */
export interface TokenReductionOptions {
  /**
   * Reduction mode: "off", "light", "moderate", "aggressive", "maximum".
   *
   * Typed as a plain string here, not as the `ReductionLevel` enum.
   */
  readonly mode?: string
  /** Preserve important words (capitalized, technical terms) */
  readonly preserveImportantWords?: boolean
}
/**
 * Configuration for tree-sitter language pack integration.
 *
 * Controls grammar download behavior and code analysis options.
 *
 * # Example (TOML)
 *
 * ```toml
 * [tree_sitter]
 * languages = ["python", "rust"]
 * groups = ["web"]
 *
 * [tree_sitter.process]
 * structure = true
 * comments = true
 * docstrings = true
 * ```
 */
export interface TreeSitterConfig {
  /**
   * Enable code intelligence processing (default: true).
   *
   * When `false`, tree-sitter analysis is completely skipped even if
   * the config section is present.
   */
  readonly enabled?: boolean
  /**
   * Custom cache directory for downloaded grammars.
   *
   * When omitted, uses the default: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
   */
  readonly cacheDir?: string
  /** Languages to pre-download on init (e.g., `["python", "rust"]`). */
  readonly languages?: Array<string>
  /** Language groups to pre-download (e.g., `["web", "systems", "scripting"]`). */
  readonly groups?: Array<string>
  /** Processing options for code analysis. */
  readonly process?: TreeSitterProcessConfig
}
/**
 * Processing options for tree-sitter code analysis.
 *
 * Controls which analysis features are enabled when extracting code files.
 * Every field is optional; the stated defaults apply when a field is omitted.
 */
export interface TreeSitterProcessConfig {
  /** Extract structural items (functions, classes, structs, etc.). Default: true. */
  readonly structure?: boolean
  /** Extract import statements. Default: true. */
  readonly imports?: boolean
  /** Extract export statements. Default: true. */
  readonly exports?: boolean
  /** Extract comments. Default: false. */
  readonly comments?: boolean
  /** Extract docstrings. Default: false. */
  readonly docstrings?: boolean
  /** Extract symbol definitions. Default: false. */
  readonly symbols?: boolean
  /** Include parse diagnostics. Default: false. */
  readonly diagnostics?: boolean
  /** Maximum chunk size in bytes. Omitting it disables chunking. */
  readonly chunkMaxSize?: number
  /** Content rendering mode for code extraction. */
  readonly contentMode?: CodeContentMode
}
/** Semantic classification of an extracted URI. String values are lowercase. */
export declare enum UriKind {
  /** A clickable hyperlink (web URL, file link). */
  Hyperlink = "hyperlink",
  /** An image or media resource reference. */
  Image = "image",
  /** An internal anchor or cross-reference target. */
  Anchor = "anchor",
  /** A citation or bibliographic reference (DOI, academic ref). */
  Citation = "citation",
  /** A general reference (e.g. `\ref{}` in LaTeX, `:ref:` in RST). */
  Reference = "reference",
  /** An email address (`mailto:` link or bare email). */
  Email = "email",
}
/**
 * Contract for validator plugins.
 *
 * Validators check extraction results for quality, completeness, or correctness.
 * Unlike post-processors, validator errors **fail fast** - if a validator returns
 * an error, the extraction fails immediately.
 *
 * # Use Cases
 *
 * - **Quality Gates**: Ensure extracted content meets minimum quality standards
 * - **Compliance**: Verify content meets regulatory requirements
 * - **Content Filtering**: Reject documents containing unwanted content
 * - **Format Validation**: Verify extracted content structure
 * - **Security Checks**: Scan for malicious content
 *
 * # Error Handling
 *
 * Validator errors are **fatal** - they cause the extraction to fail and bubble up
 * to the caller. Use validators for hard requirements that must be met.
 *
 * For non-fatal checks, use post-processors instead.
 *
 * # Thread Safety
 *
 * On the Rust side, validators must be thread-safe (`Send + Sync`).
 */
export interface Validator {
  /**
   * Name of this validator.
   *
   * NOTE(review): presumably the name reported by `listValidators()` and
   * accepted by `unregisterValidator(name)` — confirm.
   */
  name(): string
  /**
   * Validate an extraction result.
   *
   * Check the extraction result and signal success or failure. The examples
   * below are the Rust trait implementations this contract mirrors.
   * @param result - The extraction result to validate
   *
   * @param config - Extraction configuration
   *
   * @returns A promise that resolves (with no value) if validation passes.
   * NOTE(review): failure is presumably signalled by rejecting the promise /
   * throwing, which makes the extraction fail — confirm against the bridge.
   *
   * @throws - `KreuzbergError::Validation` - Validation failed
   * - Any other error type appropriate for the failure
   *
   * # Example - Content Length Validation
   *
   * ```rust
   * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
   * -> Result<()> {
   * let length = result.content.len();
   *
   * if length < self.min {
   * return Err(KreuzbergError::validation(format!(
   * "Content too short: {} < {} characters",
   * length, self.min
   * )));
   * }
   *
   * if length > self.max {
   * return Err(KreuzbergError::validation(format!(
   * "Content too long: {} > {} characters",
   * length, self.max
   * )));
   * }
   *
   * Ok(())
   * }
   * ```
   *
   * # Example - Quality Score Validation
   *
   * ```rust
   * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
   * -> Result<()> {
   * // Check if quality_score exists in metadata
   * let score = result.metadata
   * .additional
   * .get("quality_score")
   * .and_then(|v| v.as_f64())
   * .unwrap_or(0.0);
   *
   * if score < self.min_score {
   * return Err(KreuzbergError::validation(format!(
   * "Quality score too low: {} < {}",
   * score, self.min_score
   * )));
   * }
   *
   * Ok(())
   * }
   * ```
   *
   * # Example - Security Validation
   *
   * ```rust
   * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
   * -> Result<()> {
   * // Check for blocked patterns
   * for pattern in &self.blocked_patterns {
   * if result.content.contains(pattern) {
   * return Err(KreuzbergError::validation(format!(
   * "Content contains blocked pattern: {}",
   * pattern
   * )));
   * }
   * }
   *
   * Ok(())
   * }
   * ```
   */
  validate(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): Promise<void>
  /**
   * Optional: Check if this validator should run for a given result.
   *
   * Allows conditional validation based on MIME type, metadata, or content.
   * Defaults to `true` (always run).
   *
   * NOTE(review): the declared return type is `string`, although the contract
   * described here is boolean. Either the generator emitted the wrong type or
   * the bridge expects a string-encoded boolean — confirm before relying on it.
   * @param result - The extraction result to check
   *
   * @param config - Extraction configuration
   *
   * @returns `true` if the validator should run, `false` to skip.
   */
  shouldValidate?(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): string
  /**
   * Optional: Get the validation priority.
   *
   * Higher priority validators run first. Useful for ordering validation checks
   * (e.g., run cheap validations before expensive ones).
   *
   * Default priority is 50.
   *
   * NOTE(review): the declared return type is `string`, although the priority
   * is described as a numeric value. Either the generator emitted the wrong
   * type or the bridge expects a string-encoded number — confirm.
   * @returns Priority value (higher = runs earlier).
   */
  priority?(): string
}
/**
 * Application properties from `docProps/app.xml` for XLSX files.
 *
 * Contains Excel-specific document metadata. Every field is optional.
 */
export interface XlsxAppProperties {
  /** Application name (e.g., "Microsoft Excel") */
  readonly application?: string
  /** Application version */
  readonly appVersion?: string
  /** Document security level */
  readonly docSecurity?: number
  /** Scale crop flag */
  readonly scaleCrop?: boolean
  /** Links up to date flag */
  readonly linksUpToDate?: boolean
  /** Shared document flag */
  readonly sharedDoc?: boolean
  /** Hyperlinks changed flag */
  readonly hyperlinksChanged?: boolean
  /** Company name */
  readonly company?: string
  /** Worksheet names */
  readonly worksheetNames?: Array<string>
}
/**
 * XML extraction result.
 *
 * Contains the text content extracted from an XML file along with
 * structural statistics about the XML document.
 */
export interface XmlExtractionResult {
  /** Extracted text content (XML structure filtered out) */
  readonly content: string
  /** Total number of XML elements processed */
  readonly elementCount: number
  /** List of unique element names found (sorted) */
  readonly uniqueElements: Array<string>
}
/**
 * XML metadata extracted during XML parsing.
 *
 * Provides statistics about the XML document structure. Both fields are optional.
 */
export interface XmlMetadata {
  /** Total number of XML elements processed */
  readonly elementCount?: number
  /** List of unique element tag names (sorted) */
  readonly uniqueElements?: Array<string>
}
/** YAKE-specific keyword extraction parameters. */
export interface YakeParams {
  /**
   * Window size for co-occurrence analysis (default: 2).
   *
   * Controls the context window used when computing co-occurrence statistics.
   */
  readonly windowSize?: number
}
/**
 * Year range for bibliographic metadata.
 *
 * NOTE(review): the fields are undocumented in the generator output; the
 * descriptions below are inferred from names and types — confirm.
 */
export interface YearRange {
  /** Presumably the earliest year in the range; optional. */
  readonly min?: number
  /** Presumably the latest year in the range; optional. */
  readonly max?: number
  /** List of years; always present (ordering and uniqueness not stated here). */
  readonly years: Array<number>
}
/**
 * List the names of all registered document extractors.
 * @returns An array of document extractor names.
 */
export declare function listDocumentExtractors(): Array<string>;
/**
 * List the names of all registered embedding backends.
 *
 * Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
 * bindings.
 * @returns An array of embedding backend names.
 */
export declare function listEmbeddingBackends(): Array<string>;
/**
 * List the names of all available embedding presets.
 *
 * Rust-side note: the names are returned as owned `String`s so the values are
 * safe to pass across FFI boundaries.
 * @returns An array of embedding preset names.
 */
export declare function listEmbeddingPresets(): Array<string>;
/**
 * List all registered OCR backends.
 *
 * Returns the names of all OCR backends currently registered in the global registry.
 * @returns An array of OCR backend names.
 */
export declare function listOcrBackends(): Array<string>;
/**
 * List all registered post-processor names.
 *
 * Returns the names of all post-processors currently registered in the
 * global registry.
 * @returns An array of post-processor names.
 *
 * @throws If the registry lock is poisoned.
 */
export declare function listPostProcessors(): Array<string>;
/**
 * List the names of all registered renderers.
 * @returns An array of renderer names.
 *
 * @throws If the registry lock is poisoned.
 */
export declare function listRenderers(): Array<string>;
/**
 * List the names of all registered validators.
 * @returns An array of validator names.
 */
export declare function listValidators(): Array<string>;
/**
 * Register a custom document extractor implementation.
 * @param impl - Object implementing the `DocumentExtractor` interface
 */
export declare function registerDocumentExtractor(impl: DocumentExtractor): void;
/**
 * Register a custom embedding backend implementation.
 * @param impl - Object implementing the `EmbeddingBackend` interface
 */
export declare function registerEmbeddingBackend(impl: EmbeddingBackend): void;
/**
 * Register a custom OCR backend implementation.
 * @param impl - Object implementing the `OcrBackend` interface
 */
export declare function registerOcrBackend(impl: OcrBackend): void;
/**
 * Register a custom post-processor implementation.
 * @param impl - Object implementing the `PostProcessor` interface
 */
export declare function registerPostProcessor(impl: PostProcessor): void;
/**
 * Register a custom renderer implementation.
 * @param impl - Object implementing the `Renderer` interface
 */
export declare function registerRenderer(impl: Renderer): void;
/**
 * Register a custom validator implementation.
 * @param impl - Object implementing the `Validator` interface
 */
export declare function registerValidator(impl: Validator): void;
/**
 * Render a single PDF page to PNG bytes.
 *
 * Returns raw PNG-encoded bytes for the specified page at the given DPI.
 * Uses pdf_oxide with tiny-skia for pure-Rust rendering.
 * @param pdfBytes - Raw PDF file bytes
 *
 * @param pageIndex - Zero-based page index
 *
 * @param dpi - Resolution in dots per inch (default: 150)
 *
 * @param password - Optional password for encrypted PDFs
 *
 * @returns The PNG-encoded bytes of the rendered page.
 *
 * @throws `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
 * or rendered, or if `pageIndex` is out of range.
 */
export declare function renderPdfPageToPng(pdfBytes: Uint8Array, pageIndex: number, dpi?: number | undefined | null, password?: string | undefined | null): Uint8Array;
/**
 * Unregister a document extractor by name.
 * @param name - Name of the document extractor to remove (presumably as reported by `listDocumentExtractors()` — confirm)
 */
export declare function unregisterDocumentExtractor(name: string): void;
/**
 * Unregister an embedding backend by name.
 * @param name - Name of the embedding backend to remove (presumably as reported by `listEmbeddingBackends()` — confirm)
 */
export declare function unregisterEmbeddingBackend(name: string): void;
/**
 * Unregister an OCR backend by name.
 * @param name - Name of the OCR backend to remove (presumably as reported by `listOcrBackends()` — confirm)
 */
export declare function unregisterOcrBackend(name: string): void;
/**
 * Unregister a post-processor by name.
 * @param name - Name of the post-processor to remove (presumably as reported by `listPostProcessors()` — confirm)
 */
export declare function unregisterPostProcessor(name: string): void;
/**
 * Unregister a renderer by name.
 * @param name - Name of the renderer to remove (presumably as reported by `listRenderers()` — confirm)
 */
export declare function unregisterRenderer(name: string): void;
/**
 * Unregister a validator by name.
 * @param name - Name of the validator to remove (presumably as reported by `listValidators()` — confirm)
 */
export declare function unregisterValidator(name: string): void;