Files
fil/crates/kreuzberg-node/index.d.ts

5489 lines
189 KiB
TypeScript
Raw Normal View History

2026-06-01 23:40:55 +02:00
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
/* eslint-disable */
/**
 * Any value representable in JSON: `null`, a boolean, a number, a string,
 * an array of JSON values, or an object whose property values are JSON values.
 */
export type JsonValue =
  | null
  | boolean
  | number
  | string
  | JsonValue[]
  | { [key: string]: JsonValue };
/**
* Extract content from multiple in-memory byte arrays concurrently.
*
* Items are processed in parallel, with concurrency managed automatically to
* prevent resource exhaustion. The concurrency limit can be configured via
* `ExtractionConfig::max_concurrent_extractions` and otherwise defaults to
* `(num_cpus * 1.5).ceil()`.
*
* Each item can optionally carry a `FileExtractionConfig` (its `config` field)
* that overrides specific fields of the batch-level `config`. Leave an item's
* `config` unset to use the batch-level defaults for that item.
*
* @param items - Array of `BatchBytesItem` objects, each containing content bytes, a MIME type, and optional per-item configuration overrides.
* @param config - Batch-level extraction configuration.
* @returns A promise resolving to an array of `ExtractionResult` in the same order as the input items.
*/
export declare function batchExtractBytes(items: Array<BatchBytesItem>, config?: ExtractionConfig | undefined | null): Promise<Array<ExtractionResult>>;
/**
* Synchronous counterpart of `batchExtractBytes`.
*
* Uses the global Tokio runtime for optimal performance.
* With the `tokio-runtime` feature, this blocks the current thread using the global
* Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
* that iterates through the items and extracts each one synchronously.
*
* @param items - Array of `BatchBytesItem` objects to extract.
* @param config - Batch-level extraction configuration.
* @returns An array of `ExtractionResult`, returned directly rather than through a promise.
*/
export declare function batchExtractBytesSync(items: Array<BatchBytesItem>, config?: ExtractionConfig | undefined | null): Array<ExtractionResult>;
/**
* Extract content from multiple files concurrently.
*
* Files are processed in parallel, with concurrency managed automatically to
* prevent resource exhaustion. The concurrency limit can be configured via
* `ExtractionConfig::max_concurrent_extractions` and otherwise defaults to
* `(num_cpus * 1.5).ceil()`.
*
* Each file can optionally carry a `FileExtractionConfig` (its `config` field)
* that overrides specific fields of the batch-level `config`. Leave a file's
* `config` unset to use the batch defaults. Batch-level settings like
* `max_concurrent_extractions` and `use_cache` are always taken from the
* batch-level `config`.
*
* @param items - Array of `BatchFileItem` objects, each containing a path and optional per-file configuration overrides.
* @param config - Batch-level extraction configuration (provides defaults and batch settings).
* @returns A promise resolving to an array of `ExtractionResult` in the same order as the input items.
* @throws On system errors (IO, RuntimeError equivalents), which bubble up and fail
* the entire batch. Individual file errors are captured in the result metadata instead.
*/
export declare function batchExtractFiles(items: Array<BatchFileItem>, config?: ExtractionConfig | undefined | null): Promise<Array<ExtractionResult>>;
/**
* Synchronous counterpart of `batchExtractFiles`.
*
* Uses the global Tokio runtime for optimal performance.
* Only available with `tokio-runtime` (WASM has no filesystem).
*
* @param items - Array of `BatchFileItem` objects to extract.
* @param config - Batch-level extraction configuration.
* @returns An array of `ExtractionResult`, returned directly rather than through a promise.
*/
export declare function batchExtractFilesSync(items: Array<BatchFileItem>, config?: ExtractionConfig | undefined | null): Array<ExtractionResult>;
/**
* Clear all document extractors from the global registry.
*
* Calls `shutdown()` on every registered extractor, then empties the registry.
*
* @throws Any error returned by an extractor's `shutdown()` method. The first error
* encountered stops processing of the remaining extractors.
*/
export declare function clearDocumentExtractors(): void;
/**
* Clear all embedding backends from the global registry.
*
* Calls `shutdown()` on every registered backend, then empties the registry.
*
* @throws Any error returned by a backend's `shutdown()` method. The first error
* encountered stops processing of the remaining backends.
*/
export declare function clearEmbeddingBackends(): void;
/**
* Clear all OCR backends from the global registry.
*
* Removes all OCR backends and calls their `shutdown()` methods.
* Returns nothing when all backends were cleared successfully.
*
* @throws If any backend's `shutdown()` method failed (the `Err(...)` case of the
* underlying Rust function).
*/
export declare function clearOcrBackends(): void;
/** Remove all registered post-processors. Takes no arguments and returns nothing. */
export declare function clearPostProcessors(): void;
/**
* Clear all renderers from the global registry.
*
* Removes every renderer, including the built-in defaults (markdown, html,
* djot, plain). After this call no renderers are registered; re-register
* the ones you need.
*
* @throws If the registry lock is poisoned.
*/
export declare function clearRenderers(): void;
/** Remove all registered validators. Takes no arguments and returns nothing. */
export declare function clearValidators(): void;
/**
* Compare two extraction results and return a structured diff.
*
* The comparison is purely structural: no I/O, no side effects. All fields
* of `ExtractionDiff` are populated according to the provided `DiffOptions`.
*
* NOTE(review): `a` and `b` are declared optional in this signature even though
* they are the two inputs being compared; confirm against the generator
* whether omitting them is actually supported.
*
* @param a - The "before" extraction result.
* @param b - The "after" extraction result.
* @param opts - Controls which sections are compared and optional truncation.
* @returns The structured diff as an `ExtractionDiff`.
*/
export declare function compare(a?: ExtractionResult | undefined | null, b?: ExtractionResult | undefined | null, opts?: DiffOptions | undefined | null): ExtractionDiff;
/**
* Detect the MIME type of a file at the given path.
*
* Uses the file extension and optionally the file content to determine the MIME type.
*
* @param path - Path of the file whose MIME type should be detected.
* @param checkExists - Set to `true` to verify the file exists before detection.
* @returns The detected MIME type string.
*/
export declare function detectMimeType(path: string, checkExists: boolean): string;
/**
* Detect MIME type from raw file bytes.
*
* Uses magic byte signatures to detect the file type from content and
* falls back to the `infer` crate for comprehensive detection.
*
* For ZIP-based files, inspects the contents to distinguish Office Open XML
* formats (DOCX, XLSX, PPTX) from plain ZIP archives.
*
* @param content - Raw file bytes.
* @returns The detected MIME type string.
* @throws `KreuzbergError::UnsupportedFormat` if the MIME type cannot be determined.
*/
export declare function detectMimeTypeFromBytes(content: Uint8Array): string;
/**
* Embed a list of texts using the configured embedding model.
*
* @param texts - The strings to embed.
* @param config - Embedding configuration.
* @returns A two-dimensional array in which each inner array is the embedding for the corresponding text.
*/
export declare function embedTexts(texts: Array<string>, config?: EmbeddingConfig | undefined | null): Array<Array<number>>;
/**
* Generate embeddings asynchronously for a list of text strings.
*
* This is the async counterpart to `embedTexts`. It offloads the blocking
* ONNX inference work to a dedicated blocking thread pool via Tokio's
* `spawn_blocking`, keeping the async executor free.
*
* @param texts - Array of strings to embed (owned, sent to the blocking thread).
* @param config - Embedding configuration specifying model, batch size, and normalization.
* @returns A promise resolving to one embedding vector per input text, in the same order.
* @throws `KreuzbergError::MissingDependency` if ONNX Runtime is not installed.
* @throws `KreuzbergError::Embedding` if the preset name is unknown, the model download fails,
* or the blocking inference task panics.
*/
export declare function embedTextsAsync(texts: Array<string>, config?: EmbeddingConfig | undefined | null): Promise<Array<Array<number>>>;
/**
* Extract content from a byte array.
*
* This is the main entry point for in-memory extraction. It performs the following steps:
* 1. Validate MIME type
* 2. Handle legacy format conversion if needed
* 3. Select appropriate extractor from registry
* 4. Extract content
* 5. Run post-processing pipeline
*
* @param content - The byte array to extract.
* @param mimeType - MIME type of the content.
* @param config - Extraction configuration.
* @returns A promise resolving to an `ExtractionResult` containing the extracted content and metadata.
* @throws `KreuzbergError::Validation` if the MIME type is invalid.
* @throws `KreuzbergError::UnsupportedFormat` if the MIME type is not supported.
*/
export declare function extractBytes(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): Promise<ExtractionResult>;
/**
* Synchronous counterpart of `extractBytes`.
*
* Uses the global Tokio runtime for 100x+ performance improvement over creating
* a new runtime per call.
*
* With the `tokio-runtime` feature, this blocks the current thread using the global
* Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
*
* @param content - The byte array to extract.
* @param mimeType - MIME type of the content.
* @param config - Extraction configuration.
* @returns The `ExtractionResult`, returned directly rather than through a promise.
*/
export declare function extractBytesSync(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): ExtractionResult;
/**
* Extract content from a file.
*
* This is the main entry point for file-based extraction. It performs the following steps:
* 1. Check cache for existing result (if caching enabled)
* 2. Detect or validate MIME type
* 3. Select appropriate extractor from registry
* 4. Extract content
* 5. Run post-processing pipeline
* 6. Store result in cache (if caching enabled)
*
* @param path - Path to the file to extract.
* @param mimeType - Optional MIME type override. When omitted or `null`, the MIME type is auto-detected.
* @param config - Extraction configuration.
* @returns A promise resolving to an `ExtractionResult` containing the extracted content and metadata.
* @throws `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
* @throws `KreuzbergError::UnsupportedFormat` if the MIME type is not supported.
*/
export declare function extractFile(path: string, mimeType?: string | undefined | null, config?: ExtractionConfig | undefined | null): Promise<ExtractionResult>;
/**
* Synchronous counterpart of `extractFile`.
*
* This is a convenience function that blocks the current thread until extraction completes.
* For async code, use `extractFile` directly.
*
* Uses the global Tokio runtime for 100x+ performance improvement over creating
* a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
*
* This function is only available with the `tokio-runtime` feature. For WASM targets,
* use a truly synchronous extraction approach instead.
*
* @param path - Path to the file to extract.
* @param mimeType - Optional MIME type override.
* @param config - Extraction configuration.
* @returns The `ExtractionResult`, returned directly rather than through a promise.
*/
export declare function extractFileSync(path: string, mimeType?: string | undefined | null, config?: ExtractionConfig | undefined | null): ExtractionResult;
/**
* Get an embedding preset by name.
*
* @param name - Name of the preset to look up.
* @returns The matching `EmbeddingPreset`, or `null` if no preset with the given
* name exists. The value is an owned clone, so it is safe to pass across FFI boundaries.
*/
export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;
/**
* Get file extensions for a given MIME type.
*
* Returns all known file extensions that map to the specified MIME type.
*
* @param mimeType - The MIME type to look up.
* @returns An array of file extensions (without leading dot) for the MIME type.
*/
export declare function getExtensionsForMime(mimeType: string): Array<string>;
/**
 * Selects the hardware execution provider used by ONNX Runtime models.
 *
 * The chosen provider (CPU, CoreML, CUDA, TensorRT) applies to inference in
 * layout detection and embedding generation.
 */
export interface AccelerationConfig {
  /** Which execution provider ONNX inference runs on. */
  readonly provider?: ExecutionProviderType;
  /** GPU device ID, used for CUDA/TensorRT and ignored for CPU/CoreML/Auto. */
  readonly deviceId?: number;
}
/**
 * The kinds of inline text annotation, discriminated by `annotation_type`.
 *
 * Most kinds carry no payload; `link`, `color`, `font_size` and `custom`
 * carry additional string fields.
 */
export type AnnotationKind =
  | { annotation_type: 'bold' }
  | { annotation_type: 'italic' }
  | { annotation_type: 'underline' }
  | { annotation_type: 'strikethrough' }
  | { annotation_type: 'code' }
  | { annotation_type: 'subscript' }
  | { annotation_type: 'superscript' }
  | {
      annotation_type: 'link';
      url: string;
      title: string;
    }
  | { annotation_type: 'highlight' }
  | {
      annotation_type: 'color';
      value: string;
    }
  | {
      annotation_type: 'font_size';
      value: string;
    }
  | {
      annotation_type: 'custom';
      name: string;
      value: string;
    };
/**
 * One file extracted from an archive.
 *
 * When an archive (ZIP, TAR, 7Z, GZIP) is extracted with recursive extraction
 * enabled, every processable file inside it yields its own full
 * `ExtractionResult`.
 */
export interface ArchiveEntry {
  /** File path relative to the archive root (e.g. "folder/document.pdf"). */
  readonly path: string;
  /** MIME type detected for the file. */
  readonly mimeType: string;
  /** The complete extraction result for the file. */
  readonly result: ExtractionResult;
}
/**
 * Metadata for an archive (ZIP/TAR/7Z).
 *
 * Taken from compressed archive files; holds the file list and size
 * information.
 */
export interface ArchiveMetadata {
  /** Archive format name ("ZIP", "TAR", "7Z", etc.). */
  readonly format?: string;
  /** How many files the archive contains in total. */
  readonly fileCount?: number;
  /** Paths of the files inside the archive. */
  readonly fileList?: string[];
  /** Total uncompressed size, in bytes. */
  readonly totalSize?: number;
  /** Compressed size in bytes, when available. */
  readonly compressedSize?: number;
}
/**
 * One entry of a byte-array batch extraction job.
 *
 * Passed to `batchExtractBytes` and `batchExtractBytesSync`.
 */
export interface BatchBytesItem {
  /** Bytes of the content to extract from. */
  readonly content: Uint8Array;
  /** MIME type of the content (e.g. "application/pdf", "text/html"). */
  readonly mimeType: string;
  /** Per-item configuration overrides; leave unset to use the batch-level defaults. */
  readonly config?: FileExtractionConfig;
}
/**
 * One entry of a file batch extraction job.
 *
 * Passed to `batchExtractFiles` and `batchExtractFilesSync`.
 */
export interface BatchFileItem {
  /** Path of the file to extract from. */
  readonly path: string;
  /** Per-file configuration overrides; leave unset to use the batch-level defaults. */
  readonly config?: FileExtractionConfig;
}
/**
 * Bounding box in original image coordinates: `(x1, y1)` is the top-left
 * corner and `(x2, y2)` is the bottom-right corner.
 */
export interface BBox {
  /** X coordinate of the top-left corner. */
  readonly x1: number;
  /** Y coordinate of the top-left corner. */
  readonly y1: number;
  /** X coordinate of the bottom-right corner. */
  readonly x2: number;
  /** Y coordinate of the bottom-right corner. */
  readonly y2: number;
}
/**
 * Metadata for a BibTeX bibliography.
 *
 * NOTE(review): only `entryCount` is documented in the source; the remaining
 * fields are described by name only and should be confirmed.
 */
export interface BibtexMetadata {
  /** How many entries the bibliography contains. */
  readonly entryCount?: number;
  readonly citationKeys?: string[];
  readonly authors?: string[];
  readonly yearRange?: YearRange;
  /** Map from string keys to numbers; presumably a count per entry type. */
  readonly entryTypes?: Record<string, number>;
}
/**
* Types of block-level elements in Djot.
*
* String-valued enum: each member's value is the snake_case form of its name.
*/
export declare enum BlockType {
Paragraph = "paragraph",
Heading = "heading",
Blockquote = "blockquote",
CodeBlock = "code_block",
ListItem = "list_item",
OrderedList = "ordered_list",
BulletList = "bullet_list",
TaskList = "task_list",
DefinitionList = "definition_list",
DefinitionTerm = "definition_term",
DefinitionDescription = "definition_description",
Div = "div",
Section = "section",
ThematicBreak = "thematic_break",
RawBlock = "raw_block",
MathDisplay = "math_display",
}
/**
 * Coordinates of a bounding box, used to position an element.
 *
 * All four coordinates are optional.
 */
export interface BoundingBox {
  /** X coordinate of the left edge. */
  readonly x0?: number;
  /** Y coordinate of the bottom edge. */
  readonly y0?: number;
  /** X coordinate of the right edge. */
  readonly x1?: number;
  /** Y coordinate of the top edge. */
  readonly y1?: number;
}
/**
 * Cache statistics.
 *
 * NOTE(review): the source carries no documentation for this type. The field
 * descriptions below are inferred from the field names alone (including the
 * `Mb` and `Days` unit suffixes) and should be confirmed against the Rust source.
 */
export interface CacheStats {
  /** Presumably the number of files held in the cache. */
  readonly totalFiles: number;
  /** Presumably the combined size of the cached files, in megabytes. */
  readonly totalSizeMb: number;
  /** Presumably the free space still available, in megabytes. */
  readonly availableSpaceMb: number;
  /** Presumably the age of the oldest cached file, in days. */
  readonly oldestFileAgeDays: number;
  /** Presumably the age of the newest cached file, in days. */
  readonly newestFileAgeDays: number;
}
/**
 * One changed cell inside a table.
 *
 * On the Rust side this type is defined outside `crate::diff` so that
 * `RevisionDelta` can reference it without requiring the `diff` Cargo
 * feature; `crate::diff` re-exports it verbatim.
 */
export interface CellChange {
  /** Row index, zero-based. */
  readonly row: number;
  /** Column index, zero-based. */
  readonly col: number;
  /** The cell's value before the change. */
  readonly from: string;
  /** The cell's value after the change. */
  readonly to: string;
}
/**
 * A piece of document text, with an optional embedding and metadata.
 *
 * Chunks are produced when chunking is enabled in `ExtractionConfig`. Each
 * one holds its text, an embedding vector if embedding generation is
 * configured, and metadata about where it sits in the document.
 */
export interface Chunk {
  /** Text of the chunk. */
  readonly content: string;
  /**
   * Semantic structural classification of the chunk.
   *
   * Set by the heuristic classifier from content patterns and heading
   * context; `ChunkType::Unknown` when no rule matches.
   */
  readonly chunkType: ChunkType;
  /**
   * Embedding vector of the chunk, if any.
   *
   * Present only when an `EmbeddingConfig` is provided in the chunking
   * configuration. Its dimensionality depends on the chosen embedding model.
   */
  readonly embedding?: number[];
  /** Position and properties of the chunk. */
  readonly metadata: ChunkMetadata;
}
/**
* Type of text chunker to use.
*
* `Semantic` behaves differently depending on whether an `EmbeddingConfig` is
* supplied (see its member comment). For best results, pair it with an
* embedding model.
*/
export declare enum ChunkerType {
/** Generic text splitter; splits on whitespace and punctuation. */
Text = "text",
/** Markdown-aware splitter; preserves formatting and structure. */
Markdown = "markdown",
/** YAML-aware splitter; creates one chunk per top-level key. */
Yaml = "yaml",
/**
* Topic-aware chunker. With an `EmbeddingConfig`, splits at embedding-based
* topic shifts tuned by `topic_threshold` (default 0.75, lower = more splits).
* Without an embedding, falls back to a structural-boundary heuristic
* (ALL-CAPS headers, numbered sections, blank-line paragraphs) and merges
* groups into chunks capped at `max_characters` (default 1000);
* `topic_threshold` has no effect in the fallback path.
*/
Semantic = "semantic",
}
/**
 * Settings for splitting document content into chunks: chunk size, overlap,
 * whitespace trimming, and optional embeddings.
 *
 * Every field is optional.
 */
export interface ChunkingConfig {
  /**
   * Upper bound on the size of a chunk, in the unit selected by `sizing`:
   * a character count when `sizing` is `Characters` (the default), a token
   * count when token-based sizing is used.
   *
   * Default: 1000
   */
  readonly maxCharacters?: number;
  /**
   * Overlap between chunks, in the unit selected by `sizing`.
   *
   * Default: 200
   */
  readonly overlap?: number;
  /**
   * Whether whitespace is trimmed from chunk boundaries.
   *
   * Default: true
   */
  readonly trim?: boolean;
  /**
   * Which chunker to use; see `ChunkerType` for the available kinds.
   *
   * Default: Text
   */
  readonly chunkerType?: ChunkerType;
  /** Embedding configuration used to produce chunk embeddings, if wanted. */
  readonly embedding?: EmbeddingConfig;
  /** Name of a preset configuration; when provided it overrides individual settings. */
  readonly preset?: string;
  /**
   * Unit in which chunk size is measured.
   *
   * Default: `Characters` (Unicode character count).
   * Token-based sizing requires the `chunking-tiktoken` or
   * `chunking-tokenizers` feature.
   */
  readonly sizing?: ChunkSizing;
  /**
   * When `true` and `chunker_type` is `Markdown`, the heading hierarchy path
   * (e.g. `"# Title > ## Section\n\n"`) is prepended to each chunk's content
   * string.
   *
   * Useful for RAG pipelines in which every chunk needs self-contained
   * context about its position in the document structure.
   *
   * Default: `false`
   */
  readonly prependHeadingContext?: boolean;
  /**
   * Cosine similarity threshold for semantic topic boundary detection.
   *
   * Used only when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
   * provided. It rarely needs setting: when omitted it defaults to `0.75`,
   * which works well for most documents. Lower values detect more topic
   * boundaries (more, smaller chunks); higher values detect fewer.
   * Range: `0.0..=1.0`.
   */
  readonly topicThreshold?: number;
}
/** Describes where a chunk sits in the original document. */
export interface ChunkMetadata {
  /** Byte offset in the original text at which the chunk starts (UTF-8 valid boundary). */
  readonly byteStart: number;
  /** Byte offset in the original text at which the chunk ends (UTF-8 valid boundary). */
  readonly byteEnd: number;
  /**
   * Token count of the chunk, if available.
   *
   * Computed by the embedding model's tokenizer when embeddings are enabled.
   */
  readonly tokenCount?: number;
  /** Index of the chunk within the document, zero-based. */
  readonly chunkIndex: number;
  /** How many chunks the document has in total. */
  readonly totalChunks: number;
  /**
   * First page the chunk spans (1-indexed).
   *
   * Set only when page tracking is enabled in the extraction configuration.
   */
  readonly firstPage?: number;
  /**
   * Last page the chunk spans (1-indexed); equals `first_page` for
   * single-page chunks.
   *
   * Set only when page tracking is enabled in the extraction configuration.
   */
  readonly lastPage?: number;
  /**
   * Heading hierarchy the chunk falls under.
   *
   * Set only when `ChunkerType::Markdown` is used.
   */
  readonly headingContext?: HeadingContext;
  /**
   * Zero-based indices into the top-level `ExtractionResult.images`
   * collection, one for every image whose `page_number` lies within
   * `[first_page, last_page]`.
   *
   * Empty when image extraction is disabled or the chunk spans no pages
   * with images.
   */
  readonly imageIndices: number[];
}
/**
 * The unit in which chunk size is measured, discriminated by `type`.
 *
 * The default is `characters` (Unicode character count). With `tokenizer`,
 * chunks are sized by token count according to the named tokenizer.
 *
 * Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any
 * tokenizer available on HuggingFace Hub can be used, including
 * OpenAI-compatible ones (e.g. `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 */
export type ChunkSizing =
  | { type: 'characters' }
  | {
      type: 'tokenizer';
      model: string;
      cacheDir: string;
    };
/**
* Semantic structural classification of a text chunk.
*
* Assigned by the heuristic classifier in `chunking::classifier`; `Unknown`
* is used when no rule matches. The set of members is designed to be
* extended in future versions without breaking changes, so consumers should
* not assume this list is final.
*/
export declare enum ChunkType {
/** Section heading or document title. */
Heading = "heading",
/** Party list: names, addresses, and signatories. */
PartyList = "party_list",
/** Definition clause ("X means…", "X shall mean…"). */
Definitions = "definitions",
/** Operative clause containing legal/contractual action verbs. */
OperativeClause = "operative_clause",
/** Signature block with signatures, names, and dates. */
SignatureBlock = "signature_block",
/** Schedule, annex, appendix, or exhibit section. */
Schedule = "schedule",
/** Table-like content with aligned columns or repeated patterns. */
TableLike = "table_like",
/** Mathematical formula or equation. */
Formula = "formula",
/** Code block or preformatted content. */
CodeBlock = "code_block",
/** Embedded or referenced image content. */
Image = "image",
/** Organizational chart or hierarchy diagram. */
OrgChart = "org_chart",
/** Diagram, figure, or visual illustration. */
Diagram = "diagram",
/** Unclassified or mixed content; the fallback when no rule matches. */
Unknown = "unknown",
}
/**
 * Metadata for a citation file (RIS, PubMed, EndNote).
 *
 * NOTE(review): the source documents none of the individual fields; they are
 * left undescribed here rather than guessed.
 */
export interface CitationMetadata {
  readonly citationCount?: number;
  readonly format?: string;
  readonly authors?: string[];
  readonly yearRange?: YearRange;
  readonly dois?: string[];
  readonly keywords?: string[];
}
/**
* Content rendering mode for code extraction.
*
* Controls how extracted code content is represented in the `content` field
* of `ExtractionResult`. `Chunks` is the default.
*/
export declare enum CodeContentMode {
/** Use TSLP semantic chunks as content (default). */
Chunks = "chunks",
/** Use raw source code as content. */
Raw = "raw",
/** Emit function/class headings + docstrings (no code bodies). */
Structure = "structure",
}
/**
 * Content filtering configuration shared by all extractors.
 *
 * Decides whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is kept in or stripped from extraction
 * results. It applies to every extractor (PDF, DOCX, RTF, ODT, HTML, etc.),
 * each with a format-specific implementation.
 *
 * When this is not set on `ExtractionConfig`, each extractor keeps its
 * current default behavior unchanged.
 */
export interface ContentFilterConfig {
  /**
   * Keep running headers in the extraction output.
   *
   * - PDF: Disables top-margin furniture stripping and prevents the layout
   *   model from treating `PageHeader`-classified regions as furniture.
   * - DOCX: Includes document headers in text output.
   * - RTF/ODT: Headers are already included; this is a no-op when true.
   * - HTML/EPUB: Keeps `<header>` element content.
   *
   * Default: `false` (headers are stripped or excluded).
   */
  readonly includeHeaders?: boolean;
  /**
   * Keep running footers in the extraction output.
   *
   * - PDF: Disables bottom-margin furniture stripping and prevents the layout
   *   model from treating `PageFooter`-classified regions as furniture.
   * - DOCX: Includes document footers in text output.
   * - RTF/ODT: Footers are already included; this is a no-op when true.
   * - HTML/EPUB: Keeps `<footer>` element content.
   *
   * Default: `false` (footers are stripped or excluded).
   */
  readonly includeFooters?: boolean;
  /**
   * Turn the heuristic cross-page repeating-text detector on or off.
   *
   * When `true` (the default), text repeated verbatim across a supermajority
   * of pages is classified as furniture and stripped. Turn it off if brand
   * names or repeated headings are being removed incorrectly.
   *
   * Note: when a layout-detection model is active, the model may
   * independently classify page-header / page-footer regions as furniture
   * on a per-page basis. To preserve those regions, also set
   * `include_headers = true`, `include_footers = true`, or both.
   *
   * Primarily affects PDF extraction.
   *
   * Default: `true`.
   */
  readonly stripRepeatingText?: boolean;
  /**
   * Keep watermark text in the extraction output.
   *
   * - PDF: Keeps watermark artifacts and arXiv identifiers.
   * - Other formats: No effect currently.
   *
   * Default: `false` (watermarks are stripped).
   */
  readonly includeWatermarks?: boolean;
}
/**
* Content layer classification for document nodes.
*
* Each node carries its own layer, which replaces separate body/furniture
* arrays with per-node granularity.
*/
export declare enum ContentLayer {
/** Main document body content. */
Body = "body",
/** Page/section header (running header). */
Header = "header",
/** Page/section footer (running footer). */
Footer = "footer",
/** Footnote content. */
Footnote = "footnote",
}
/** A JATS contributor together with an optional role. */
export interface ContributorRole {
  /** The contributor's name. */
  readonly name: string;
  /** The contributor's role; may be absent. */
  readonly role?: string;
}
/**
 * Dublin Core metadata read from `docProps/core.xml`.
 *
 * Holds the standard Dublin Core fields plus Office-specific extensions.
 * Every field is optional.
 */
export interface CoreProperties {
  /** Title of the document. */
  readonly title?: string;
  /** Subject or topic of the document. */
  readonly subject?: string;
  /** Creator/author of the document. */
  readonly creator?: string;
  /** Keywords or tags. */
  readonly keywords?: string;
  /** Description or abstract of the document. */
  readonly description?: string;
  /** The user who last modified the document. */
  readonly lastModifiedBy?: string;
  /** Revision number. */
  readonly revision?: string;
  /** When the document was created (ISO 8601). */
  readonly created?: string;
  /** When the document was last modified (ISO 8601). */
  readonly modified?: string;
  /** Category of the document. */
  readonly category?: string;
  /** Content status (Draft, Final, etc.). */
  readonly contentStatus?: string;
  /** Language of the document. */
  readonly language?: string;
  /** Unique identifier. */
  readonly identifier?: string;
  /** Version of the document. */
  readonly version?: string;
  /** When the document was last printed (ISO 8601). */
  readonly lastPrinted?: string;
}
/**
 * Metadata for a CSV/TSV file.
 *
 * NOTE(review): the source documents none of the individual fields; they are
 * left undescribed here rather than guessed.
 */
export interface CsvMetadata {
  readonly rowCount?: number;
  readonly columnCount?: number;
  readonly delimiter?: string;
  readonly hasHeader?: boolean;
  readonly columnTypes?: string[];
}
/** Describes one field of a dBASE file: its name and its type, both as strings. */
export interface DbfFieldInfo {
  readonly name: string;
  readonly fieldType: string;
}
/**
 * Metadata for a dBASE (DBF) file.
 *
 * NOTE(review): the source documents none of the individual fields; `fields`
 * is typed as an array of `DbfFieldInfo`.
 */
export interface DbfMetadata {
  readonly recordCount?: number;
  readonly fieldCount?: number;
  readonly fields?: DbfFieldInfo[];
}
/** Detection result for one page: every detection on it plus the page's metadata. */
export interface DetectionResult {
  /** Width of the page (units are not stated in the source). */
  readonly pageWidth: number;
  /** Height of the page (units are not stated in the source). */
  readonly pageHeight: number;
  /** The detections found on the page. */
  readonly detections: LayoutDetection[];
}
/** Response of a MIME type detection. */
export interface DetectResponse {
  /** The MIME type that was detected. */
  readonly mimeType: string;
  /** The original filename, if one was provided. */
  readonly filename?: string;
}
/** One contiguous hunk of a unified diff. */
export interface DiffHunk {
  /** Line number in the old content at which the hunk starts (0-indexed). */
  readonly fromLine: number;
  /** How many lines of the old content the hunk covers. */
  readonly fromCount: number;
  /** Line number in the new content at which the hunk starts (0-indexed). */
  readonly toLine: number;
  /** How many lines of the new content the hunk covers. */
  readonly toCount: number;
  /** The lines making up the hunk. */
  readonly lines: DiffLine[];
}
/**
 * One line of a unified-diff hunk, discriminated by `kind`
 * (`context`, `added` or `removed`).
 *
 * Each variant carries a string under the property key `0`; presumably the
 * text of the line (confirm against the Rust tuple variant it is generated
 * from).
 *
 * On the Rust side this type is defined outside `crate::diff` so that
 * `RevisionDelta` can reference it without requiring the `diff` Cargo
 * feature; `crate::diff` re-exports it verbatim.
 */
export type DiffLine =
  | {
      kind: 'context';
      0: string;
    }
  | {
      kind: 'added';
      0: string;
    }
  | {
      kind: 'removed';
      0: string;
    };
/** Options that control how two `ExtractionResult` values are compared. */
export interface DiffOptions {
  /** Whether metadata changes are part of the diff. Default: `true`. */
  readonly includeMetadata?: boolean;
  /** Whether embedded-children changes are part of the diff. Default: `true`. */
  readonly includeEmbedded?: boolean;
  /**
   * Content is truncated to this many characters before diffing.
   *
   * Handy for very large documents where only the first N characters
   * matter. Leave unset for no truncation.
   */
  readonly maxContentChars?: number;
}
/**
 * Full Djot document structure with its semantics preserved.
 *
 * Captures the richness of Djot markup, including:
 * - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - Attributes (classes, IDs, key-value pairs)
 * - Links, images, footnotes
 * - Math expressions (inline and display)
 * - Tables with full structure
 *
 * Available when the `djot` feature is enabled.
 */
export interface DjotContent {
  /** Plain-text rendering, kept for backwards compatibility. */
  readonly plainText: string;
  /** The structured block-level content. */
  readonly blocks: FormattedBlock[];
  /** Metadata taken from the YAML frontmatter. */
  readonly metadata: Metadata;
  /** Tables extracted as structured data. */
  readonly tables: Table[];
  /** Images extracted together with their metadata. */
  readonly images: DjotImage[];
  /** Links extracted together with their URLs. */
  readonly links: DjotLink[];
  /** The footnote definitions. */
  readonly footnotes: Footnote[];
  /**
   * Attributes mapped by element identifier (if present).
   *
   * NOTE(review): the declared type is an array of strings rather than a
   * map; confirm the serialized form against the generator.
   */
  readonly attributes: string[];
}
/** An image element in a Djot document. */
export interface DjotImage {
  /** Source URL or path of the image. */
  readonly src: string;
  /** Alternative text of the image. */
  readonly alt: string;
  /** Title of the image, if any. */
  readonly title?: string;
  /** Attributes of the element, as a single string. */
  readonly attributes?: string;
}
/** A link element in a Djot document. */
export interface DjotLink {
  /** URL the link points to. */
  readonly url: string;
  /** Text content of the link. */
  readonly text: string;
  /** Title of the link, if any. */
  readonly title?: string;
  /** Attributes of the element, as a single string. */
  readonly attributes?: string;
}
/**
* Contract for document extractor plugins (described as a trait in the upstream docs).
*
* Implement this interface to add support for new document formats or to override
* built-in extraction behavior with custom logic.
*
* # Return Type
*
* Extractors return `InternalDocument`, a flat intermediate representation.
* The pipeline converts this into the public `ExtractionResult` via the
* derivation step.
*
* NOTE(review): every method below is declared with a `string` (or `Promise<string>`)
* result, even where the prose describes a document, a list, a number, or a boolean.
* Presumably values cross the native bridge in a serialized form - confirm the
* expected encoding before implementing this interface.
*
* # Priority System
*
* When multiple extractors support the same MIME type, the registry selects
* the extractor with the highest priority value. Use this to:
* - Override built-in extractors (priority > 50)
* - Provide fallback extractors (priority < 50)
* - Implement specialized extractors for specific use cases
*
* Default priority is 50.
*
* # Thread Safety
*
* Extractors must be thread-safe (`Send + Sync`) to support concurrent extraction.
*/
export interface DocumentExtractor {
/** Returns the name of this extractor. */
name(): string
/**
* Extract content from a byte array.
*
* This is the core extraction method that processes in-memory document data.
* @param content - Raw document bytes
*
* @param mimeType - MIME type of the document (already validated)
*
* @param config - Extraction configuration; may be omitted, `undefined`, or `null`
*
* @returns An `InternalDocument` containing the extracted elements, metadata, and tables.
* The pipeline will convert this into the public `ExtractionResult`.
* NOTE(review): the declared result is `Promise<string>` - presumably a serialized
* `InternalDocument`; confirm.
*
* @throws - `KreuzbergError::Parsing` - Document parsing failed
* - `KreuzbergError::Validation` - Invalid document structure
* - `KreuzbergError::Io` - I/O errors (these always bubble up)
* - `KreuzbergError::MissingDependency` - Required dependency not available
*/
extractBytes(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): Promise<string>
/**
* Extract content from a file.
*
* Optional member. The default implementation reads the file and calls `extractBytes`
* (`extract_bytes` in the upstream docs).
* Override for custom file handling, streaming, or memory optimizations.
* @param path - Path to the document file
*
* @param mimeType - MIME type of the document (already validated)
*
* @param config - Extraction configuration; may be omitted, `undefined`, or `null`
*
* @returns An `InternalDocument` containing the extracted elements, metadata, and tables.
* NOTE(review): declared as `Promise<string>` - presumably serialized; confirm.
*
* @throws Same as `extractBytes`, plus file I/O errors.
*/
extractFile?(path: string, mimeType: string, config?: ExtractionConfig | undefined | null): Promise<string>
/**
* Get the list of MIME types supported by this extractor.
*
* Can include exact MIME types and prefix patterns:
* - Exact: `"application/pdf"`, `"text/plain"`
* - Prefix: `"image/*"` (matches any image type)
* @returns The supported MIME type strings.
* NOTE(review): declared as a single `string` although the prose describes a list -
* the expected encoding of the list is not visible here; confirm.
*/
supportedMimeTypes(): string
/**
* Get the priority of this extractor.
*
* Optional member. Higher priority extractors are preferred when multiple extractors
* support the same MIME type.
*
* # Priority Guidelines
*
* - **0-25**: Fallback/low-quality extractors
* - **26-49**: Alternative extractors
* - **50**: Default priority (built-in extractors)
* - **51-75**: Premium/enhanced extractors
* - **76-100**: Specialized/high-priority extractors
* @returns Priority value (default: 50).
* NOTE(review): declared as `string` although the value is numeric - confirm the
* expected representation.
*/
priority?(): string
/**
* Optional: Check if this extractor can handle a specific file.
*
* Allows for more sophisticated detection beyond MIME types.
* Defaults to `true` (rely on MIME type matching).
* @param path - Path to the file to check
*
* @param mimeType - Detected MIME type
*
* @returns `true` if the extractor can handle this file, `false` otherwise.
* NOTE(review): declared as `string` although the prose describes a boolean - confirm
* the expected representation.
*/
canHandle?(path: string, mimeType: string): string
}
/**
 * One node of the document tree.
 *
 * A node carries a deterministic `id`, typed `content`, optional `parent` /
 * `children` links that form the tree, and metadata such as page number,
 * bounding box, and content layer.
 */
export interface DocumentNode {
  /** Deterministic identifier: a hash of content and position. */
  readonly id: string;
  /** Content of the node, a tagged enum holding type-specific data only. */
  readonly content: NodeContent;
  /** Index of the parent node; omitted for a root-level node. */
  readonly parent?: number;
  /** Indices of the child nodes, in reading order. */
  readonly children: number[];
  /** Classification of the content layer. */
  readonly contentLayer: ContentLayer;
  /** 1-indexed number of the page on which this node starts. */
  readonly page?: number;
  /** Number of the page on which this node ends (multi-page tables/sections). */
  readonly pageEnd?: number;
  /** Bounding box, in document coordinates. */
  readonly bbox?: BoundingBox;
  /**
   * Inline annotations (formatting, links) applied to this node's text content.
   *
   * Meaningful only for nodes that carry text; empty for containers.
   */
  readonly annotations: TextAnnotation[];
  /**
   * Key-value attributes specific to the source format.
   *
   * An extensible bag for miscellaneous data that has no dedicated typed field:
   * CSS classes, LaTeX environment names, Excel cell formulas, slide layout
   * names, etc.
   */
  readonly attributes?: Record<string, string>;
}
/** A resolved relationship that links two nodes of the document tree. */
export interface DocumentRelationship {
  /** Index of the source node, i.e. the node that makes the reference. */
  readonly source: number;
  /** Index of the target node, i.e. the node being referenced. */
  readonly target: number;
  /** Semantic kind of this relationship. */
  readonly kind: RelationshipKind;
}
/**
 * One tracked change embedded in a document.
 *
 * Filled in by per-format extractors that understand change-tracking metadata
 * (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, etc.). Every
 * extractor defaults to `ExtractionResult.revisions = None` until a
 * format-specific implementation is added.
 */
export interface DocumentRevision {
  /**
   * Revision identifier, specific to the source format.
   *
   * For DOCX this is the value of the `w:id` attribute on the change element
   * (e.g. `"42"`). When the attribute is absent a synthetic fallback is
   * generated (`"docx-ins-0"`, `"docx-del-3"`, etc.).
   */
  readonly revisionId: string;
  /** Display name of the author of the change, when available. */
  readonly author?: string;
  /**
   * ISO-8601 timestamp of the change, when available.
   *
   * Kept as a plain string so that the type stays FFI-friendly and is
   * available unconditionally, without the optional `chrono` dependency.
   * DOCX fills this from the `w:date` attribute (e.g.
   * `"2024-03-15T10:30:00Z"`).
   */
  readonly timestamp?: string;
  /** Semantic kind of the revision. */
  readonly kind: RevisionKind;
  /**
   * Best-effort location of the revision within the document.
   *
   * Resolution depends on the format; the field is omitted when the location
   * cannot be determined (e.g. changes inside table cells before table-cell
   * anchor support is added).
   */
  readonly anchor?: RevisionAnchor;
  /** Content changes that make up the revision. */
  readonly delta: RevisionDelta;
}
/**
 * Top-level structured representation of a document.
 *
 * A flat array of nodes whose index-based parent/child references form a tree.
 * Root-level nodes have no `parent`. The upstream docs point to `body_roots()`
 * and `furniture_roots()` for iterating over top-level content by layer; those
 * helpers are not declared on this interface.
 *
 * # Validation
 *
 * The upstream docs recommend calling `validate()` after construction to verify
 * that all node indices are in bounds and that parent-child relationships are
 * bidirectionally consistent; `validate()` is likewise not declared here.
 */
export interface DocumentStructure {
  /** Every node, in document/reading order. */
  readonly nodes?: DocumentNode[];
  /**
   * Identifier of the origin format (e.g. "docx", "pptx", "html", "pdf").
   *
   * Lets renderers apply format-aware heuristics when converting the document
   * tree to output formats.
   */
  readonly sourceFormat?: string;
  /**
   * Resolved relationships between nodes (footnote refs, citations, anchor
   * links, etc.).
   *
   * Filled in during derivation from the internal document representation.
   * Empty when no relationships are detected.
   */
  readonly relationships?: DocumentRelationship[];
  /**
   * Sorted, deduplicated list of the node type names present in this document.
   *
   * Each value is the snake_case `node_type` tag of the corresponding
   * [`NodeContent`] variant (e.g. `"paragraph"`, `"heading"`, `"table"`, etc.).
   *
   * Computed from [`nodes`] via [`DocumentStructure::finalize_node_types`].
   * Empty until that method is called (internal construction paths call it
   * at the end of derivation).
   */
  readonly nodeTypes?: string[];
}
/**
 * DOCX application properties, read from docProps/app.xml.
 *
 * Holds Word-specific document statistics and metadata. Every field is optional.
 */
export interface DocxAppProperties {
  /** Name of the application (e.g., "Microsoft Office Word"). */
  readonly application?: string;
  /** Version of the application. */
  readonly appVersion?: string;
  /** Filename of the template. */
  readonly template?: string;
  /** Total editing time, in minutes. */
  readonly totalTime?: number;
  /** Page count. */
  readonly pages?: number;
  /** Word count. */
  readonly words?: number;
  /** Character count, spaces excluded. */
  readonly characters?: number;
  /** Character count, spaces included. */
  readonly charactersWithSpaces?: number;
  /** Line count. */
  readonly lines?: number;
  /** Paragraph count. */
  readonly paragraphs?: number;
  /** Name of the company. */
  readonly company?: string;
  /** Security level of the document. */
  readonly docSecurity?: number;
  /** Scale crop flag. */
  readonly scaleCrop?: boolean;
  /** Flag telling whether links are up to date. */
  readonly linksUpToDate?: boolean;
  /** Shared document flag. */
  readonly sharedDoc?: boolean;
  /** Flag telling whether hyperlinks were changed. */
  readonly hyperlinksChanged?: boolean;
}
/**
 * Metadata of a Word document.
 *
 * Extracted from DOCX files through the shared Office Open XML metadata
 * extraction. Integrates with the `office_metadata` module for
 * core/app/custom properties.
 */
export interface DocxMetadata {
  /**
   * Core properties (Dublin Core metadata) from docProps/core.xml.
   *
   * Holds title, creator, subject, keywords, dates, etc.
   * The format is shared across DOCX/PPTX/XLSX documents.
   */
  readonly coreProperties?: CoreProperties;
  /**
   * Application properties (Word-specific statistics) from docProps/app.xml.
   *
   * Holds word count, page count, paragraph count, editing time, etc.
   * This is the DOCX-specific variant of Office application properties.
   */
  readonly appProperties?: DocxAppProperties;
  /**
   * Custom, user-defined properties from docProps/custom.xml.
   *
   * Holds key-value pairs defined by users or applications.
   * Values can be strings, numbers, booleans, or dates.
   */
  readonly customProperties?: Record<string, JsonValue>;
}
/**
 * A semantic element extracted from a document.
 *
 * Models one logical unit of content, with a semantic classification, a
 * unique identifier, and metadata that tracks origin and position.
 */
export interface Element {
  /** Unique identifier of the element. */
  readonly elementId: string;
  /** Semantic type of the element. */
  readonly elementType: ElementType;
  /** Text content of the element. */
  readonly text: string;
  /** Metadata describing the element. */
  readonly metadata: ElementMetadata;
}
/** Metadata attached to a semantic element. */
export interface ElementMetadata {
  /** 1-indexed page number. */
  readonly pageNumber?: number;
  /** Filename of the source, or name of the document. */
  readonly filename?: string;
  /** Bounding box coordinates, when available. */
  readonly coordinates?: BoundingBox;
  /** Index of the element's position in the element sequence. */
  readonly elementIndex?: number;
  /** Additional, custom metadata. */
  readonly additional: Record<string, string>;
}
/**
* Semantic element type classification.
*
* Categorizes text content into semantic units for downstream processing.
* Supports the element types commonly found in Unstructured documents.
*
* Each member's value is the snake_case string tag shown next to it. This is an
* ambient (`declare`) enum, so this declaration file itself emits no JavaScript.
*/
export declare enum ElementType {
/** Document title */
Title = "title",
/** Main narrative text body */
NarrativeText = "narrative_text",
/** Section heading */
Heading = "heading",
/** List item (bullet, numbered, etc.) */
ListItem = "list_item",
/** Table element */
Table = "table",
/** Image element */
Image = "image",
/** Page break marker */
PageBreak = "page_break",
/** Code block */
CodeBlock = "code_block",
/** Block quote */
BlockQuote = "block_quote",
/** Footer text */
Footer = "footer",
/** Header text */
Header = "header",
}
/**
 * Representation of an email attachment.
 *
 * Holds the attachment's metadata and, optionally, its content.
 */
export interface EmailAttachment {
  /** Name of the attachment, taken from the Content-Disposition header. */
  readonly name?: string;
  /** Filename of the attachment. */
  readonly filename?: string;
  /** MIME type of the attachment. */
  readonly mimeType?: string;
  /** Size, in bytes. */
  readonly size?: number;
  /** Tells whether the attachment is an image. */
  readonly isImage: boolean;
  /**
   * Data of the attachment (if extracted).
   * Upstream uses `bytes::Bytes` for cheap cloning of large buffers.
   */
  readonly data?: Uint8Array;
}
/** Settings that control email extraction. */
export interface EmailConfig {
  /**
   * Windows codepage number used when an MSG file carries no codepage property.
   * Defaults to `None`, which falls back to windows-1252.
   *
   * When an unrecognized or invalid codepage number is supplied (including 0),
   * the behavior silently falls back to windows-1252 - the same as when the
   * MSG file itself contains an unrecognized codepage. No error or warning is
   * emitted. Users should verify the output when supplying unusual values.
   *
   * Common values:
   * - 1250: Central European (Polish, Czech, Hungarian, etc.)
   * - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
   * - 1252: Western European (default)
   * - 1253: Greek
   * - 1254: Turkish
   * - 1255: Hebrew
   * - 1256: Arabic
   * - 932: Japanese (Shift-JIS)
   * - 936: Simplified Chinese (GBK)
   */
  readonly msgFallbackCodepage?: number;
}
/**
 * Result of extracting an email.
 *
 * A complete representation of an extracted email message (.eml or .msg),
 * covering headers, body content, and attachments.
 */
export interface EmailExtractionResult {
  /** Subject line of the email. */
  readonly subject?: string;
  /** Email address of the sender. */
  readonly fromEmail?: string;
  /** Email addresses of the primary recipients. */
  readonly toEmails: string[];
  /** Email addresses of the CC recipients. */
  readonly ccEmails: string[];
  /** Email addresses of the BCC recipients. */
  readonly bccEmails: string[];
  /** Date/timestamp of the email. */
  readonly date?: string;
  /** Value of the Message-ID header. */
  readonly messageId?: string;
  /** Plain text version of the body. */
  readonly plainText?: string;
  /** HTML version of the body. */
  readonly htmlContent?: string;
  /** Cleaned/processed text content. Aliased as `cleaned_text` for back-compat. */
  readonly content: string;
  /** Attachments of the email. */
  readonly attachments: EmailAttachment[];
  /** Additional email headers and metadata. */
  readonly metadata: Record<string, string>;
}
/**
 * Metadata extracted from .eml and .msg email files.
 *
 * Covers sender/recipient information, the message ID, and the attachment list.
 */
export interface EmailMetadata {
  /** Email address of the sender. */
  readonly fromEmail?: string;
  /** Display name of the sender. */
  readonly fromName?: string;
  /** Primary recipients. */
  readonly toEmails?: string[];
  /** CC recipients. */
  readonly ccEmails?: string[];
  /** BCC recipients. */
  readonly bccEmails?: string[];
  /** Value of the Message-ID header. */
  readonly messageId?: string;
  /** Filenames of the attachments. */
  readonly attachments?: string[];
}
/** Differences in embedded archive children between two results, `a` and `b`. */
export interface EmbeddedChanges {
  /** Children found in `b` but missing from `a` (matched by `path`). */
  readonly added: ArchiveEntry[];
  /** Children found in `a` but missing from `b` (matched by `path`). */
  readonly removed: ArchiveEntry[];
  /**
   * Children found in both results whose content differs (matched by `path`).
   *
   * Every entry carries the diff of the nested `ExtractionResult`.
   */
  readonly changed: EmbeddedDiff[];
}
/** Diff of one embedded archive entry that is present in both results. */
export interface EmbeddedDiff {
  /** Archive-relative path that identifies the entry. */
  readonly path: string;
  /** Recursive diff of the entry's extraction result. */
  readonly diff: ExtractionDiff;
}
/** Descriptor of an embedded file extracted from the PDF name tree. */
export interface EmbeddedFile {
  /** Filename, as stored in the PDF name tree. */
  readonly name: string;
  /** Raw bytes of the file from the embedded stream (already decompressed by lopdf). */
  readonly data: Uint8Array;
  /**
   * Byte count of the original stream while still compressed (before decompression).
   *
   * Callers use it to compute the decompression ratio and to detect
   * zip-bomb-style attacks that embed a tiny compressed stream expanding to
   * gigabytes of data.
   */
  readonly compressedSize: number;
  /** MIME type when the filespec specifies one; omitted otherwise. */
  readonly mimeType?: string;
}
/**
* Contract for in-process embedding backend plugins (described as a trait in the
* upstream docs).
*
* Async to match the convention used by `OcrBackend`,
* `DocumentExtractor`, and `PostProcessor`.
* Host-language bridges (PyO3, napi-rs, Rustler, extendr, magnus, ext-php-rs,
* C FFI, etc.) wrap their synchronous host callables in `spawn_blocking` or the
* equivalent to satisfy the async signature.
*
* # Thread safety
*
* Backends must be `Send + Sync + 'static`. They are stored in
* `Arc<dyn EmbeddingBackend>` and called concurrently from kreuzberg's chunking
* pipeline. If the backend's underlying model isn't thread-safe, the backend
* itself must serialize access internally (e.g. via `Mutex<Inner>`).
*
* # Contract
*
* - `embed(texts)` MUST return exactly `texts.len()` vectors, each of length
* `self.dimensions()`. The dispatcher in `embed_texts`
* validates this before returning to downstream consumers; a non-conforming
* backend surfaces as a `KreuzbergError::Validation`, not a panic.
* - `embed` may be called from any thread. Its future must be `Send`
* (enforced by `async_trait` when `#[async_trait]` is used on non-WASM targets).
* - `dimensions()` is called exactly once at registration, immediately after
* `initialize()` succeeds. The returned value is cached by the registry and
* used for all subsequent shape validation. Lazy-loading implementations can
* defer model loading into `initialize()` and report the real dimension
* afterwards. Later mutations of the backend's reported dimension are not
* observed by kreuzberg - implementations that need to change dimension
* must unregister and re-register.
* - `shutdown()` (inherited from `Plugin`) may be invoked
* concurrently with an in-flight `embed()` call. Implementations must
* tolerate this - e.g. by letting in-flight calls finish using resources
* held via the `Arc<dyn EmbeddingBackend>` reference, and only releasing
* shared state that isn't needed by `embed`.
*
* NOTE(review): `initialize()` and `shutdown()` are mentioned above but are not
* declared on this interface; only `name`, `dimensions`, and `embed` are.
*
* # Runtime
*
* The synchronous `embed_texts` entry uses
* [`tokio::task::block_in_place`] to await the trait's async `embed`, which
* requires a multi-thread tokio runtime. Callers running inside a
* `current_thread` runtime (e.g. `#[tokio::test]` without `flavor = "multi_thread"`,
* or `tokio::runtime::Builder::new_current_thread()`) must use
* `embed_texts_async` instead, which awaits directly without
* `block_in_place`.
*/
export interface EmbeddingBackend {
/** Returns the name of this backend. */
name(): string
/**
* Embedding vector dimension. Must be `> 0` and must match the length of
* every vector returned by `embed`.
*
* NOTE(review): declared as `string` although the value is numeric - confirm the
* representation the native bridge expects.
*/
dimensions(): string
/**
* Embed a batch of texts, returning one vector per input in order.
* @param texts - The texts to embed
* @returns One vector per input text, in input order.
* NOTE(review): declared as `Promise<string>` - presumably the vectors are passed in
* a serialized form; confirm the expected encoding.
* @throws Implementations should return `Plugin` for
* backend-specific failures. The dispatcher layers its own validation
* (length, per-vector dimension) on top.
*/
embed(texts: Array<string>): Promise<string>
}
/**
 * Configuration of embedding generation for text chunks.
 *
 * Embeddings are generated with ONNX models through the vendored embedding
 * engine. The `embeddings` feature must be enabled.
 */
export interface EmbeddingConfig {
  /** Embedding model to use (the "balanced" preset is the default when unspecified). */
  readonly model?: EmbeddingModelType;
  /** Whether embedding vectors are normalized (recommended for cosine similarity). */
  readonly normalize?: boolean;
  /** Batch size used for embedding generation. */
  readonly batchSize?: number;
  /** Whether model download progress is shown. */
  readonly showDownloadProgress?: boolean;
  /**
   * Custom cache directory for the model files.
   *
   * Falls back to `~/.cache/kreuzberg/embeddings/` when not specified.
   * Gives full control over where models are downloaded.
   */
  readonly cacheDir?: string;
  /**
   * Hardware acceleration for the embedding ONNX model.
   *
   * When set, it controls which execution provider (CPU, CUDA, CoreML,
   * TensorRT) performs inference. Defaults to `None` (auto-select per platform).
   */
  readonly acceleration?: AccelerationConfig;
  /**
   * Maximum wall-clock duration (in seconds) of a single `embed()` call when
   * [`EmbeddingModelType::Plugin`] is used.
   *
   * Applies only to the in-process plugin path - it protects against hung
   * host-language backends (e.g. a Python callback deadlocked on the GIL,
   * a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
   * returns `Plugin` instead of blocking forever.
   *
   * `None` disables the timeout. The default (60 seconds) is conservative
   * for common in-process inference; raise it for large batches on slow
   * hardware.
   */
  readonly maxEmbedDurationSecs?: number;
}
/**
* Embedding model types supported by Kreuzberg.
*
* A discriminated union keyed by the `type` field: check `type` first, then read
* the fields that belong to that variant.
*/
export type EmbeddingModelType =
// Preset variant: identified by `name`.
| { type: 'preset'; name: string }
// Custom variant: carries a `modelId` together with its `dimensions`.
| { type: 'custom'; modelId: string; dimensions: number }
// LLM variant: carries an `LlmConfig`.
| { type: 'llm'; llm: LlmConfig }
// Plugin variant: identified by `name`.
| { type: 'plugin'; name: string }
/**
 * Preset configuration for a common RAG use case.
 *
 * A preset bundles chunk size, overlap, and embedding model into an optimized
 * configuration for a specific scenario.
 *
 * All string fields are owned `String` values for FFI compatibility -
 * instances are safe to clone and pass across language boundaries.
 */
export interface EmbeddingPreset {
  /** Name of the preset. */
  readonly name: string;
  /** Chunk size of the preset. */
  readonly chunkSize: number;
  /** Overlap of the preset. */
  readonly overlap: number;
  /** Name of the HuggingFace repository that hosts the model. */
  readonly modelRepo: string;
  /** Pooling strategy, either "cls" or "mean". */
  readonly pooling: string;
  /** Path of the ONNX model file inside the repo. */
  readonly modelFile: string;
  /** Dimensions value of the preset. */
  readonly dimensions: number;
  /** Description of the preset. */
  readonly description: string;
}
/**
 * Metadata of an EPUB (Dublin Core extensions).
 *
 * Every field is an optional string.
 * NOTE(review): `dcFormat` and `dcType` presumably hold the Dublin Core
 * `format` and `type` elements - confirm against the extractor.
 */
export interface EpubMetadata {
  readonly coverage?: string;
  readonly dcFormat?: string;
  readonly relation?: string;
  readonly source?: string;
  readonly dcType?: string;
  readonly coverImage?: string;
}
/** Metadata describing an error (used by batch operations). */
export interface ErrorMetadata {
  /** Type of the error, as a string. */
  readonly errorType: string;
  /** Message of the error. */
  readonly message: string;
}
/**
 * Format metadata for Excel/spreadsheet documents.
 *
 * Marks the document as a spreadsheet source through the
 * `FormatMetadata::Excel` discriminant. The sheet count and the sheet names
 * live inside this struct.
 */
export interface ExcelMetadata {
  /** How many sheets the workbook contains. */
  readonly sheetCount?: number;
  /** Names of every sheet of the workbook. */
  readonly sheetNames?: string[];
}
/**
 * One worksheet of an Excel workbook.
 *
 * Holds the sheet's content converted to Markdown, together with its
 * dimensional statistics.
 */
export interface ExcelSheet {
  /** Name of the sheet, as shown in Excel. */
  readonly name: string;
  /** Content of the sheet, converted to Markdown tables. */
  readonly markdown: string;
  /** Row count. */
  readonly rowCount: number;
  /** Column count. */
  readonly colCount: number;
  /** Total count of non-empty cells. */
  readonly cellCount: number;
  /**
   * Pre-extracted table cells (a 2D list of cell values).
   * Filled in during markdown generation so that the markdown need not be re-parsed.
   * Omitted for empty sheets.
   */
  readonly tableCells?: string[][];
}
/**
 * Representation of an Excel workbook.
 *
 * Holds every sheet of an Excel file (.xlsx, .xls, etc.) along with the
 * extracted content and metadata.
 */
export interface ExcelWorkbook {
  /** Every sheet of the workbook. */
  readonly sheets: ExcelSheet[];
  /** Workbook-level metadata (author, creation date, etc.). */
  readonly metadata: Record<string, string>;
  /**
   * Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
   *
   * Filled in for legacy shared-workbook `.xlsx` files that contain the
   * `xl/revisions/` directory. Each `<header>` element maps to one
   * `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
   * (-> `revision_id`), `userName` (-> `author`), and `dateTime` (-> `timestamp`).
   * `anchor` and `delta` are `None`/empty for v1 (per-cell log parsing is a
   * follow-up). Omitted when `xl/revisions/revisionHeaders.xml` is absent.
   */
  readonly revisions?: DocumentRevision[];
}
/**
* ONNX Runtime execution provider type.
*
* Determines which hardware backend is used for model inference.
* `Auto` (default) selects the best available provider per platform.
*
* Each member's value is the lowercase string tag shown next to it. This is an
* ambient (`declare`) enum, so this declaration file itself emits no JavaScript.
*/
export declare enum ExecutionProviderType {
/** Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere. */
Auto = "auto",
/** CPU execution provider (always available). */
Cpu = "cpu",
/** Apple CoreML (macOS/iOS Neural Engine + GPU). */
CoreMl = "coreml",
/** NVIDIA CUDA GPU acceleration. */
Cuda = "cuda",
/** NVIDIA TensorRT (optimized CUDA inference). */
TensorRt = "tensorrt",
}
/**
 * An image extracted from a document.
 *
 * Holds the raw image data, its metadata, and optionally a nested OCR result.
 * Raw bytes keep the type compatible across languages - users can convert
 * them to PIL.Image (Python), Sharp (Node.js), or other formats as needed.
 */
export interface ExtractedImage {
  /**
   * Raw image data (bytes of a PNG, JPEG, WebP, etc.).
   * Upstream uses `bytes::Bytes` for cheap cloning of large buffers.
   */
  readonly data: Uint8Array;
  /**
   * Format of the image (e.g., "jpeg", "png", "webp").
   * Upstream uses Cow<'static, str> to avoid allocation for static literals.
   */
  readonly format: string;
  /** Zero-indexed position of the image within the document/page. */
  readonly imageIndex: number;
  /** 1-indexed number of the page/slide on which the image was found. */
  readonly pageNumber?: number;
  /** Width of the image, in pixels. */
  readonly width?: number;
  /** Height of the image, in pixels. */
  readonly height?: number;
  /** Colorspace information (e.g., "RGB", "CMYK", "Gray"). */
  readonly colorspace?: string;
  /** Bits per color component (e.g., 8, 16). */
  readonly bitsPerComponent?: number;
  /** Tells whether the image is a mask image. */
  readonly isMask: boolean;
  /** Description of the image, if any. */
  readonly description?: string;
  /**
   * Nested OCR extraction result (if the image was OCRed).
   *
   * When OCR runs on this image, its result is embedded here instead of in a
   * separate collection, which makes the relationship explicit.
   */
  readonly ocrResult?: ExtractionResult;
  /**
   * Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
   * Filled in only for PDF-extracted images, when the PDF extractor provides position data.
   */
  readonly boundingBox?: BoundingBox;
  /**
   * Original source path of the image inside the document archive (e.g., "media/image1.png" in DOCX).
   * Used to render image references when the binary data is not extracted.
   */
  readonly sourcePath?: string;
  /**
   * Heuristic classification of what the image likely depicts.
   * Omitted if classification was disabled or inconclusive.
   */
  readonly imageKind?: ImageKind;
  /** Confidence score for `image_kind`, between 0.0 and 1.0. */
  readonly kindConfidence?: number;
  /**
   * Identifier shared by images that together form a single logical figure
   * (e.g. all raster tiles of one technical drawing). Omitted for singletons.
   */
  readonly clusterId?: number;
}
/**
 * A URI found in a document.
 *
 * Stands for any link, reference, or resource pointer encountered during
 * extraction. The `kind` field gives the URI's semantic classification, and
 * `label` holds optional human-readable display text.
 */
export interface ExtractedUri {
  /** URL or path string. */
  readonly url: string;
  /** Display text / label of the link, if any. */
  readonly label?: string;
  /** 1-indexed number of the page on which the URI was found, if known. */
  readonly page?: number;
  /** Semantic classification of the URI. */
  readonly kind: UriKind;
}
/**
* Main extraction configuration.
*
* This struct contains all configuration options for the extraction process.
* It can be loaded from TOML, YAML, or JSON files, or created programmatically.
*/
export interface ExtractionConfig {
/** Enable caching of extraction results */
readonly useCache?: boolean
/** Enable quality post-processing */
readonly enableQualityProcessing?: boolean
/** OCR configuration (None = OCR disabled) */
readonly ocr?: OcrConfig
/** Force OCR even for searchable PDFs */
readonly forceOcr?: boolean
/**
* Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
*
* When set, only the listed pages are OCR'd regardless of text layer quality.
* Unlisted pages use native text extraction. Ignored when `force_ocr` is `true`.
* Only applies to PDF documents. Duplicates are automatically deduplicated.
* An `ocr` config is recommended for backend/language selection; defaults are used if absent.
*/
readonly forceOcrPages?: Array<number>
/**
* Disable OCR entirely, even for images.
*
* When `true`, OCR is skipped for all document types. Images return metadata
* only (dimensions, format, EXIF) without text extraction. PDFs use only
* native text extraction without OCR fallback.
*
* Cannot be `true` simultaneously with `force_ocr`.
*
* *Added in v4.7.0.*
*/
readonly disableOcr?: boolean
/** Text chunking configuration (None = chunking disabled) */
readonly chunking?: ChunkingConfig
/**
* Content filtering configuration (None = use extractor defaults).
*
* Controls whether document "furniture" (headers, footers, watermarks,
* repeating text) is included in or stripped from extraction results.
* See [`ContentFilterConfig`] for per-field documentation.
*/
readonly contentFilter?: ContentFilterConfig
/** Image extraction configuration (None = no image extraction) */
readonly images?: ImageExtractionConfig
/** PDF-specific options (None = use defaults) */
readonly pdfOptions?: PdfConfig
/** Token reduction configuration (None = no token reduction) */
readonly tokenReduction?: TokenReductionOptions
/** Language detection configuration (None = no language detection) */
readonly languageDetection?: LanguageDetectionConfig
/** Page extraction configuration (None = no page tracking) */
readonly pages?: PageConfig
/** Keyword extraction configuration (None = no keyword extraction) */
readonly keywords?: KeywordConfig
/** Post-processor configuration (None = use defaults) */
readonly postprocessor?: PostProcessorConfig
/**
* HTML to Markdown conversion options (None = use defaults)
*
* Configure how HTML documents are converted to Markdown, including heading styles,
* list formatting, code block styles, and preprocessing options.
*/
readonly htmlOptions?: string
/**
* Styled HTML output configuration.
*
* When set alongside `output_format = OutputFormat::Html`, the extraction
* pipeline uses [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer)
* which emits stable `kb-*` CSS class hooks on every structural element
* and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
*
* When `None`, the existing plain comrak-based HTML renderer is used.
*/
readonly htmlOutput?: HtmlOutputConfig
/**
* Default per-file timeout in seconds for batch extraction.
*
* When set, each file in a batch will be canceled after this duration
* unless overridden by [`FileExtractionConfig::timeout_secs`].
*
* Defaults to `Some(60)` to prevent pathological files (e.g. deeply
* nested archives, documents with millions of cells) from running
* indefinitely and exhausting caller resources. Set to `None` to
* disable the timeout for trusted input or long-running workloads.
*/
readonly extractionTimeoutSecs?: number
/**
* Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5).ceil()).
*
* Limits parallelism to prevent resource exhaustion when processing
* large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
*/
readonly maxConcurrentExtractions?: number
/**
* Result structure format
*
* Controls whether results are returned in unified format (default) with all
* content in the `content` field, or element-based format with semantic
* elements (for Unstructured-compatible output).
*/
readonly resultFormat?: ResultFormat
/**
* Security limits for archive extraction.
*
* Controls maximum archive size, compression ratio, file count, and other
* security thresholds to prevent decompression bomb attacks. Also caps
* nesting depth, iteration count, entity / token length, total
* content size, and table cell count for every extraction path that
* ingests user-controlled bytes.
* When `None`, default limits are used.
*/
readonly securityLimits?: SecurityLimits
/**
* Maximum uncompressed size in bytes for a single embedded file before
* recursive extraction is attempted (default: 50 MiB).
*
* Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
* to email attachments processed via recursive extraction. Files that
* exceed this limit are skipped with a `ProcessingWarning` rather than
* passed to the extraction pipeline, preventing a single oversized
* embedded object from consuming unbounded memory or time.
*
* Set to `None` to disable the per-embedded-file cap (falls back to
* `security_limits.max_archive_size` as the only guard).
*/
readonly maxEmbeddedFileBytes?: number
/**
* Content text format (default: Plain).
*
* Controls the format of the extracted content:
* - `Plain`: Raw extracted text (default)
* - `Markdown`: Markdown formatted output
* - `Djot`: Djot markup format (requires djot feature)
* - `Html`: HTML formatted output
*
* When set to a structured format, extraction results will include
* formatted output. The `formatted_content` field may be populated
* when format conversion is applied.
*/
readonly outputFormat?: OutputFormat
/**
* Layout detection configuration (None = layout detection disabled).
*
* When set, PDF pages and images are analyzed for document structure
* (headings, code, formulas, tables, figures, etc.) using RT-DETR models
* via ONNX Runtime. For PDFs, layout hints override paragraph classification
* in the markdown pipeline. For images, per-region OCR is performed with
* markdown formatting based on detected layout classes.
* Requires the `layout-detection` feature to run inference; the field is
* present whenever the `layout-types` feature is active (which includes
* `layout-detection` as well as the no-ORT target groups).
*/
readonly layout?: LayoutDetectionConfig
/**
* Run layout detection on the non-OCR PDF markdown path.
*
* When `true` and `layout` is `Some(_)`, layout regions inform heading,
* table, list, and figure detection in the structure pipeline that would
* otherwise rely on font-clustering heuristics alone. Significantly
* improves SF1 (structural F1) at the cost of inference latency
* (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
* Requires the `layout-detection` feature.
*/
readonly useLayoutForMarkdown?: boolean
/**
* Enable structured document tree output.
*
* When true, populates the `document` field on `ExtractionResult` with a
* hierarchical `DocumentStructure` containing heading-driven section nesting,
* table grids, content layer classification, and inline annotations.
*
* Independent of `result_format` can be combined with Unified or ElementBased.
*/
readonly includeDocumentStructure?: boolean
/**
* Hardware acceleration configuration for ONNX Runtime models.
*
* Controls execution provider selection for layout detection and embedding
* models. When `None`, uses platform defaults (CoreML on macOS, CUDA on
* Linux, CPU on Windows).
*/
readonly acceleration?: AccelerationConfig
/**
* Cache namespace for tenant isolation.
*
* When set, cache entries are stored under `{cache_dir}/{namespace}/`.
* Must be alphanumeric, hyphens, or underscores only (max 64 chars).
* Different namespaces have isolated cache spaces on the same filesystem.
*/
readonly cacheNamespace?: string
/**
* Per-request cache TTL in seconds.
*
* Overrides the global `max_age_days` for this specific extraction.
* When `0`, caching is completely skipped (no read or write).
* When `None`, the global TTL applies.
*/
readonly cacheTtlSecs?: number
/**
* Email extraction configuration (None = use defaults).
*
* Currently supports configuring the fallback codepage for MSG files
* that do not specify one. See `EmailConfig` for details.
*/
readonly email?: EmailConfig
/**
* Concurrency limits for constrained environments (None = use defaults).
*
* Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
* (when `max_concurrent_extractions` is unset) the batch concurrency
* semaphore. See `ConcurrencyConfig` for details.
*/
readonly concurrency?: string
/**
* Maximum recursion depth for archive extraction (default: 3).
* Set to 0 to disable recursive extraction (legacy behavior).
*/
readonly maxArchiveDepth?: number
/**
* Tree-sitter language pack configuration (None = tree-sitter disabled).
*
* When set, enables code file extraction using tree-sitter parsers.
* Controls grammar download behavior and code analysis options.
*/
readonly treeSitter?: TreeSitterConfig
/**
* Structured extraction via LLM (None = disabled).
*
* When set, the extracted document content is sent to an LLM with the
* provided JSON schema. The structured response is stored in
* `ExtractionResult::structured_output`.
*/
readonly structuredExtraction?: StructuredExtractionConfig
/**
* Cancellation token for this extraction (None = no external cancellation).
*
* Pass a [`CancellationToken`] clone here and call [`CancellationToken::cancel`]
* from another thread / task to abort the extraction in progress. The extractor
* checks the token at safe checkpoints (before lock acquisition, between pages,
* between batch items) and returns [`KreuzbergError::Cancelled`] when set.
*
* The field is excluded from serialization because `CancellationToken` is a
* runtime handle, not a configuration value.
*/
readonly cancelToken?: string
}
/** The complete diff between two `ExtractionResult` values, referred to below as `a` and `b`. */
export interface ExtractionDiff {
  /**
   * Unified-diff hunks for the `content` field.
   *
   * Empty when the content is identical.
   */
  readonly contentDiff: Array<DiffHunk>
  /** Tables present in `b` but not in `a` (by index position, excess right-side tables). */
  readonly tablesAdded: Array<Table>
  /** Tables present in `a` but not in `b` (by index position, excess left-side tables). */
  readonly tablesRemoved: Array<Table>
  /** Cell-level changes for table pairs that share the same index and dimensions. */
  readonly tablesChanged: Array<TableDiff>
  /**
   * Metadata difference, encoded as a JSON object with three top-level keys:
   * `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
   * but not `b`), and `changed` (keys whose values differ; each entry is
   * `{ "from": <value-in-a>, "to": <value-in-b> }`).
   *
   * This is NOT RFC 6902 JSON Patch; we deliberately chose a flatter shape
   * to avoid pulling in a json-patch crate. If you need RFC 6902 semantics
   * (with JSON Pointer paths), feed `a.metadata` and `b.metadata` to your
   * preferred json-patch impl directly.
   */
  readonly metadataChanged: JsonValue
  /** Changes to embedded archive children. */
  readonly embeddedChanges: EmbeddedChanges
}
/**
 * How the extracted text was produced.
 *
 * Surfaced on `ExtractionResult.extractionMethod`.
 */
export declare enum ExtractionMethod {
  /** Text came from native text extraction (no OCR). */
  Native = "native",
  /** Text came from OCR only. */
  Ocr = "ocr",
  /** Text is a mix of native extraction and OCR output. */
  Mixed = "mixed",
}
/**
 * General extraction result used by the core extraction API.
 *
 * This is the main result type returned by all extraction functions.
 * Every field is declared optional in this binding, so consumers must
 * handle `undefined` for each of them.
 */
export interface ExtractionResult {
  /** Extracted text content of the document. */
  readonly content?: string
  /** MIME type associated with this result (presumably that of the source document; confirm). */
  readonly mimeType?: string
  /** Document metadata; see `Metadata`. */
  readonly metadata?: Metadata
  /**
   * Extraction strategy used to produce the returned text.
   *
   * Populated when the extractor can reliably distinguish native text extraction,
   * OCR-only extraction, or mixed native/OCR output.
   */
  readonly extractionMethod?: ExtractionMethod
  /** Tables extracted from the document. */
  readonly tables?: Array<Table>
  /**
   * Languages detected in the document.
   *
   * NOTE(review): presumably only populated when language detection is
   * configured, and presumably language codes; confirm the exact format.
   */
  readonly detectedLanguages?: Array<string>
  /**
   * Text chunks when chunking is enabled.
   *
   * When chunking configuration is provided, the content is split into
   * overlapping chunks for efficient processing. Each chunk contains the text,
   * optional embeddings (if enabled), and metadata about its position.
   */
  readonly chunks?: Array<Chunk>
  /**
   * Extracted images from the document.
   *
   * When image extraction is enabled via `ImageExtractionConfig`, this field
   * contains all images found in the document with their raw data and metadata.
   * Each image may optionally contain a nested `ocr_result` if OCR was performed.
   */
  readonly images?: Array<ExtractedImage>
  /**
   * Per-page content when page extraction is enabled.
   *
   * When page extraction is configured, the document is split into per-page content
   * with tables and images mapped to their respective pages.
   */
  readonly pages?: Array<PageContent>
  /**
   * Semantic elements when element-based result format is enabled.
   *
   * When result_format is set to ElementBased, this field contains semantic
   * elements with type classification, unique identifiers, and metadata for
   * Unstructured-compatible element-based processing.
   */
  readonly elements?: Array<Element>
  /**
   * Rich Djot content structure (when extracting Djot documents).
   *
   * When extracting Djot documents with structured extraction enabled,
   * this field contains the full semantic structure including:
   * - Block-level elements with nesting
   * - Inline formatting with attributes
   * - Links, images, footnotes
   * - Math expressions
   * - Complete attribute information
   *
   * The `content` field still contains plain text for backward compatibility.
   *
   * Always `None` for non-Djot documents.
   */
  readonly djotContent?: DjotContent
  /**
   * OCR elements with full spatial and confidence metadata.
   *
   * When OCR is performed with element extraction enabled, this field contains
   * the structured representation of detected text including:
   * - Bounding geometry (rectangles or quadrilaterals)
   * - Confidence scores (detection and recognition)
   * - Rotation information
   * - Hierarchical relationships (Tesseract only)
   *
   * This field preserves all metadata that would otherwise be lost when
   * converting to plain text or markdown output formats.
   *
   * Only populated when `OcrElementConfig.include_elements` is true.
   */
  readonly ocrElements?: Array<OcrElement>
  /**
   * Structured document tree (when document structure extraction is enabled).
   *
   * When `include_document_structure` is true in `ExtractionConfig`, this field
   * contains the full hierarchical representation of the document including:
   * - Heading-driven section nesting
   * - Table grids with cell-level metadata
   * - Content layer classification (body, header, footer, footnote)
   * - Inline text annotations (formatting, links)
   * - Bounding boxes and page numbers
   *
   * Independent of `result_format`; can be combined with Unified or ElementBased.
   */
  readonly document?: DocumentStructure
  /**
   * Extracted keywords when keyword extraction is enabled.
   *
   * When keyword extraction (RAKE or YAKE) is configured, this field contains
   * the extracted keywords with scores, algorithm info, and position data.
   * Previously stored in `metadata.additional["keywords"]`.
   */
  readonly extractedKeywords?: Array<Keyword>
  /**
   * Document quality score from quality analysis.
   *
   * A value between 0.0 and 1.0 indicating the overall text quality.
   * Previously stored in `metadata.additional["quality_score"]`.
   */
  readonly qualityScore?: number
  /**
   * Non-fatal warnings collected during processing pipeline stages.
   *
   * Captures errors from optional pipeline features (embedding, chunking,
   * language detection, output formatting) that don't prevent extraction
   * but may indicate degraded results.
   * Previously stored as individual keys in `metadata.additional`.
   */
  readonly processingWarnings?: Array<ProcessingWarning>
  /**
   * PDF annotations extracted from the document.
   *
   * When annotation extraction is enabled via `PdfConfig::extract_annotations`,
   * this field contains text notes, highlights, links, stamps, and other
   * annotations found in PDF documents.
   */
  readonly annotations?: Array<PdfAnnotation>
  /**
   * Nested extraction results from archive contents.
   *
   * When extracting archives, each processable file inside produces its own
   * full extraction result. Set to `None` for non-archive formats.
   * Use `max_archive_depth` in config to control recursion depth.
   */
  readonly children?: Array<ArchiveEntry>
  /**
   * URIs/links discovered during document extraction.
   *
   * Contains hyperlinks, image references, citations, email addresses, and
   * other URI-like references found in the document. Always extracted when
   * present in the source document.
   */
  readonly uris?: Array<ExtractedUri>
  /**
   * Tracked changes embedded in the source document.
   *
   * Populated by per-format extractors that understand change-tracking
   * metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
   * etc.). Every extractor defaults to `None` until its format-specific
   * implementation is added. Extractors that do populate this field follow
   * the "accepted-changes" convention: inserted text is present in
   * `content`, deleted text is absent; the revision list is the separate
   * audit trail.
   */
  readonly revisions?: Array<DocumentRevision>
  /**
   * Structured extraction output from LLM-based JSON schema extraction.
   *
   * When `structured_extraction` is configured in `ExtractionConfig`, the
   * extracted document content is sent to a VLM with the provided JSON schema.
   * The response is parsed and stored here as a JSON value matching the schema.
   */
  readonly structuredOutput?: JsonValue
  /**
   * Code intelligence results from tree-sitter analysis.
   *
   * Populated when extracting source code files with the `tree-sitter` feature.
   * Contains metrics, structural analysis, imports/exports, comments,
   * docstrings, symbols, diagnostics, and optionally chunked code segments.
   *
   * Stored as an opaque JSON value so that all language bindings (Go, Java,
   * C#, etc.) can deserialize it as a raw JSON object rather than a typed struct.
   * The underlying type is `tree_sitter_language_pack::ProcessResult`.
   */
  readonly codeIntelligence?: JsonValue
  /**
   * LLM token usage and cost data for all LLM calls made during this extraction.
   *
   * Contains one entry per LLM call. Multiple entries are produced when
   * VLM OCR, structured extraction, or LLM embeddings run during
   * the same extraction.
   *
   * `None` when no LLM was used.
   */
  readonly llmUsage?: Array<LlmUsage>
  /**
   * Pre-rendered content in the requested output format.
   *
   * Populated during `derive_extraction_result` before tree derivation consumes
   * element data. `apply_output_format` swaps this into `content` at the end
   * of the pipeline, after post-processors have operated on plain text.
   */
  readonly formattedContent?: string
  /**
   * Structured hOCR document for the OCR+layout pipeline.
   *
   * When tesseract produces hOCR output, the parsed `InternalDocument` carries
   * paragraph structure with bounding boxes and confidence scores. The layout
   * classification step enriches these elements before final rendering.
   *
   * NOTE(review): exposed in this binding as a plain `string`, presumably an
   * opaque or serialized form of `InternalDocument`; confirm the encoding
   * before parsing it.
   */
  readonly ocrInternalDocument?: string
}
/**
 * FictionBook (FB2) metadata.
 *
 * All fields are optional and may be absent.
 */
export interface FictionBookMetadata {
  /** Genres of the book (presumably taken from the FB2 genre declarations; confirm). */
  readonly genres?: Array<string>
  /** Sequences the book belongs to (presumably series names; confirm the string format). */
  readonly sequences?: Array<string>
  /** Annotation text of the book (presumably the FB2 annotation/summary; confirm). */
  readonly annotation?: string
}
/**
 * Per-file extraction configuration overrides for batch processing.
 *
 * All fields are `Option<T>`; `None` means "use the batch-level default."
 * In this binding that corresponds to leaving the optional property unset.
 * This type is used with `batch_extract_files` and
 * `batch_extract_bytes` to allow heterogeneous
 * extraction settings within a single batch.
 *
 * # Excluded Fields
 *
 * The following `ExtractionConfig` fields are batch-level only and
 * cannot be overridden per file:
 * - `max_concurrent_extractions`: controls batch parallelism
 * - `use_cache`: global caching policy
 * - `acceleration`: shared ONNX execution provider
 * - `security_limits`: global archive security policy
 */
export interface FileExtractionConfig {
  /** Override quality post-processing for this file. */
  readonly enableQualityProcessing?: boolean
  /** Override OCR configuration for this file (None in the Option = use batch default). */
  readonly ocr?: OcrConfig
  /** Override force OCR for this file. */
  readonly forceOcr?: boolean
  /** Override force OCR pages for this file (1-indexed page numbers). */
  readonly forceOcrPages?: Array<number>
  /** Override disable OCR for this file. */
  readonly disableOcr?: boolean
  /** Override chunking configuration for this file. */
  readonly chunking?: ChunkingConfig
  /** Override content filtering configuration for this file. */
  readonly contentFilter?: ContentFilterConfig
  /** Override image extraction configuration for this file. */
  readonly images?: ImageExtractionConfig
  /** Override PDF options for this file. */
  readonly pdfOptions?: PdfConfig
  /** Override token reduction for this file. */
  readonly tokenReduction?: TokenReductionOptions
  /** Override language detection for this file. */
  readonly languageDetection?: LanguageDetectionConfig
  /** Override page extraction for this file. */
  readonly pages?: PageConfig
  /** Override keyword extraction for this file. */
  readonly keywords?: KeywordConfig
  /** Override post-processor for this file. */
  readonly postprocessor?: PostProcessorConfig
  /**
   * Override HTML conversion options for this file.
   *
   * NOTE(review): typed as a plain `string` here rather than a structured
   * options object; presumably a serialized form of the options. Confirm the
   * expected encoding before setting it.
   */
  readonly htmlOptions?: string
  /** Override result format for this file. */
  readonly resultFormat?: ResultFormat
  /** Override output content format for this file. */
  readonly outputFormat?: OutputFormat
  /** Override document structure output for this file. */
  readonly includeDocumentStructure?: boolean
  /** Override layout detection for this file. */
  readonly layout?: LayoutDetectionConfig
  /**
   * Override per-file extraction timeout in seconds.
   *
   * When set, the extraction for this file will be canceled after the
   * specified duration. A timed-out file produces an error result without
   * affecting other files in the batch.
   */
  readonly timeoutSecs?: number
  /** Override tree-sitter configuration for this file. */
  readonly treeSitter?: TreeSitterConfig
  /**
   * Override structured extraction configuration for this file.
   *
   * When set, enables LLM-based structured extraction with a JSON schema
   * for this specific file. The extracted content is sent to a VLM/LLM
   * and the response is parsed according to the provided schema.
   */
  readonly structuredExtraction?: StructuredExtractionConfig
}
/** Footnote in a Djot document: a label together with its block-level content. */
export interface Footnote {
  /** Footnote label. */
  readonly label: string
  /** Footnote content, as a list of block-level elements. */
  readonly content: Array<FormattedBlock>
}
/**
 * Format-specific metadata (discriminated union).
 *
 * Only one format type can exist per extraction result. This provides
 * type-safe, clean metadata without nested optionals.
 *
 * Each variant is tagged by the string literal in `format_type`; narrow on
 * that property to get the matching payload type. In this declaration the
 * payload sits under the numeric key `0`. The `code` variant carries a
 * plain `string` payload rather than a dedicated metadata interface.
 *
 * NOTE(review): the key `0` looks like a positional (tuple-style) field
 * emitted by the generator; confirm that runtime objects really expose the
 * payload under `0` rather than inline next to `format_type`.
 */
export type FormatMetadata =
  | { format_type: 'pdf'; 0: PdfMetadata }
  | { format_type: 'docx'; 0: DocxMetadata }
  | { format_type: 'excel'; 0: ExcelMetadata }
  | { format_type: 'email'; 0: EmailMetadata }
  | { format_type: 'pptx'; 0: PptxMetadata }
  | { format_type: 'archive'; 0: ArchiveMetadata }
  | { format_type: 'image'; 0: ImageMetadata }
  | { format_type: 'xml'; 0: XmlMetadata }
  | { format_type: 'text'; 0: TextMetadata }
  | { format_type: 'html'; 0: HtmlMetadata }
  | { format_type: 'ocr'; 0: OcrMetadata }
  | { format_type: 'csv'; 0: CsvMetadata }
  | { format_type: 'bibtex'; 0: BibtexMetadata }
  | { format_type: 'citation'; 0: CitationMetadata }
  | { format_type: 'fiction_book'; 0: FictionBookMetadata }
  | { format_type: 'dbf'; 0: DbfMetadata }
  | { format_type: 'jats'; 0: JatsMetadata }
  | { format_type: 'epub'; 0: EpubMetadata }
  | { format_type: 'pst'; 0: PstMetadata }
  | { format_type: 'code'; 0: string }
/**
 * Block-level element in a Djot document.
 *
 * Represents structural elements like headings, paragraphs, lists, code blocks, etc.
 * Blocks form a tree: container blocks hold further blocks in `children`.
 */
export interface FormattedBlock {
  /** Type of block element. */
  readonly blockType: BlockType
  /** Heading level (1-6) for headings, or nesting level for lists. */
  readonly level?: number
  /** Inline content within the block. */
  readonly inlineContent: Array<InlineElement>
  /**
   * Element attributes (classes, IDs, key-value pairs).
   *
   * NOTE(review): typed as a plain `string` here, presumably a serialized
   * form of the attribute set; confirm the encoding before parsing it.
   */
  readonly attributes?: string
  /** Language identifier for code blocks. */
  readonly language?: string
  /** Raw code content for code blocks. */
  readonly code?: string
  /** Nested blocks for containers (blockquotes, list items, divs). */
  readonly children: Array<FormattedBlock>
}
/**
 * Individual grid cell with position and span metadata.
 *
 * `row`/`col` give the cell's zero-indexed position; `rowSpan`/`colSpan` give
 * how many rows and columns it spans.
 */
export interface GridCell {
  /** Cell text content. */
  readonly content: string
  /** Zero-indexed row position. */
  readonly row: number
  /** Zero-indexed column position. */
  readonly col: number
  /** Number of rows this cell spans. */
  readonly rowSpan: number
  /** Number of columns this cell spans. */
  readonly colSpan: number
  /** Whether this is a header cell. */
  readonly isHeader: boolean
  /** Bounding box for this cell; omitted when not available. */
  readonly bbox?: BoundingBox
}
/** Header/heading element metadata, as extracted from an HTML document. */
export interface HeaderMetadata {
  /** Header level: 1 (h1) through 6 (h6). */
  readonly level: number
  /** Normalized text content of the header. */
  readonly text: string
  /** HTML `id` attribute; omitted when the element has none. */
  readonly id?: string
  /** Document tree depth at the header element. */
  readonly depth: number
  /** Byte offset in the original HTML document. */
  readonly htmlOffset: number
}
/**
 * Heading context for a chunk within a Markdown document.
 *
 * Contains the heading hierarchy from document root to this chunk's section,
 * ordered from outermost to most specific.
 */
export interface HeadingContext {
  /**
   * The heading hierarchy from document root to this chunk's section.
   * Index 0 is the outermost (h1); the last element is the most specific.
   */
  readonly headings: Array<HeadingLevel>
}
/** A single heading in the hierarchy carried by `HeadingContext.headings`. */
export interface HeadingLevel {
  /** Heading depth (1 = h1, 2 = h2, etc.). */
  readonly level: number
  /** The text content of the heading. */
  readonly text: string
}
/**
 * A text block with hierarchy level assignment.
 *
 * Represents a block of text with semantic heading information extracted from
 * font size clustering and hierarchical analysis.
 */
export interface HierarchicalBlock {
  /** The text content of this block. */
  readonly text: string
  /** The font size of the text in this block. */
  readonly fontSize: number
  /**
   * The hierarchy level of this block (H1-H6 or Body).
   *
   * Levels correspond to HTML heading tags:
   * - "h1": Top-level heading
   * - "h2": Secondary heading
   * - "h3": Tertiary heading
   * - "h4": Quaternary heading
   * - "h5": Quinary heading
   * - "h6": Senary heading
   * - "body": Body text (no heading level)
   *
   * The type is a plain `string`, so the compiler does not restrict it to
   * the values listed above.
   */
  readonly level: string
  /**
   * Bounding box information for the block.
   *
   * Contains coordinates as (left, top, right, bottom) in PDF units. The
   * type is a plain number array, so the four-element layout is a
   * documented convention rather than something the compiler enforces.
   */
  readonly bbox?: Array<number>
}
/**
 * Hierarchy extraction configuration for PDF text structure analysis.
 *
 * Enables extraction of document hierarchy levels (H1-H6) based on font size
 * clustering and semantic analysis. When enabled, hierarchical blocks are
 * included in page content.
 */
export interface HierarchyConfig {
  /** Enable hierarchy extraction. */
  readonly enabled?: boolean
  /**
   * Number of font size clusters to use for hierarchy levels (1-7).
   *
   * Default: 6, which provides H1-H6 heading levels with body text.
   * Larger values create more fine-grained hierarchy levels.
   */
  readonly kClusters?: number
  /** Include bounding box information in hierarchy blocks. */
  readonly includeBbox?: boolean
  /**
   * OCR coverage threshold for smart OCR triggering (0.0-1.0).
   *
   * Determines when OCR should be triggered based on text block coverage.
   * OCR is triggered when text blocks cover less than this fraction of the page.
   * Default: 0.5 (trigger OCR if less than 50% of page has text).
   */
  readonly ocrCoverageThreshold?: number
}
/**
 * HTML metadata extracted from HTML documents.
 *
 * Includes document-level metadata, Open Graph data, Twitter Card metadata,
 * and extracted structural elements (headers, links, images, structured data).
 * Every field is optional.
 */
export interface HtmlMetadata {
  /** Document title from `<title>` tag. */
  readonly title?: string
  /** Document description from `<meta name="description">` tag. */
  readonly description?: string
  /** Document keywords from `<meta name="keywords">` tag, split on commas. */
  readonly keywords?: Array<string>
  /** Document author from `<meta name="author">` tag. */
  readonly author?: string
  /** Canonical URL from `<link rel="canonical">` tag. */
  readonly canonicalUrl?: string
  /** Base URL from `<base href="">` tag for resolving relative URLs. */
  readonly baseHref?: string
  /** Document language from `lang` attribute. */
  readonly language?: string
  /** Document text direction from `dir` attribute. */
  readonly textDirection?: TextDirection
  /**
   * Open Graph metadata (og:* properties) for social media.
   * Keys like "title", "description", "image", "url", etc.
   */
  readonly openGraph?: Record<string, string>
  /**
   * Twitter Card metadata (twitter:* properties).
   * Keys like "card", "site", "creator", "title", "description", "image", etc.
   */
  readonly twitterCard?: Record<string, string>
  /**
   * Additional meta tags not covered by specific fields.
   * Keys are meta name/property attributes, values are content.
   */
  readonly metaTags?: Record<string, string>
  /** Extracted header elements with hierarchy. */
  readonly headers?: Array<HeaderMetadata>
  /** Extracted hyperlinks with type classification. */
  readonly links?: Array<LinkMetadata>
  /** Extracted images with source and dimensions. */
  readonly images?: Array<ImageMetadataType>
  /** Extracted structured data blocks. */
  readonly structuredData?: Array<StructuredData>
}
/**
 * Configuration for styled HTML output.
 *
 * When set on [`ExtractionConfig::html_output`] alongside
 * `output_format = OutputFormat::Html`, the pipeline builds a
 * [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
 * the plain comrak-based renderer.
 *
 * When both `cssFile` and `css` are set, the file content comes first and
 * the inline `css` string is concatenated after it.
 */
export interface HtmlOutputConfig {
  /**
   * Inline CSS string injected into the output after the theme stylesheet.
   * Concatenated after `css_file` content when both are set.
   */
  readonly css?: string
  /**
   * Path to a CSS file loaded once at renderer construction time.
   * Concatenated before `css` when both are set.
   */
  readonly cssFile?: string
  /** Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`]. */
  readonly theme?: HtmlTheme
  /**
   * CSS class prefix applied to every emitted class name.
   *
   * Default: `"kb-"`. Change this if your host application already uses
   * classes that start with `kb-`.
   */
  readonly classPrefix?: string
  /**
   * When `true` (default), write the resolved CSS into a `<style>` block
   * immediately after the opening `<div class="{prefix}doc">`.
   *
   * Set to `false` to emit only the structural markup and wire up your
   * own stylesheet targeting the `kb-*` class names.
   */
  readonly embedCss?: boolean
}
/**
 * Built-in HTML theme selection.
 *
 * Used by `HtmlOutputConfig.theme`.
 */
export declare enum HtmlTheme {
  /**
   * Sensible defaults: system font stack, neutral colours, readable line
   * measure. CSS custom properties (`--kb-*`) are all defined so user CSS
   * can override individual values.
   *
   * Despite its name, this is not the theme documented as the default for
   * `HtmlOutputConfig.theme`; that field documents `HtmlTheme::Unstyled`
   * as its default.
   */
  Default = "default",
  /** GitHub Markdown-inspired palette and spacing. */
  GitHub = "github",
  /** Dark background, light text. */
  Dark = "dark",
  /** Minimal light theme with generous whitespace. */
  Light = "light",
  /**
   * No built-in stylesheet emitted. CSS custom properties are still defined
   * on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
   */
  Unstyled = "unstyled",
}
/**
 * Image extraction configuration.
 *
 * Controls whether images are extracted, how their DPI and dimensions are
 * normalized, and how image OCR output is rendered. Every field is optional.
 */
export interface ImageExtractionConfig {
  /** Extract images from documents. */
  readonly extractImages?: boolean
  /** Target DPI for image normalization. */
  readonly targetDpi?: number
  /** Maximum dimension for images (width or height). */
  readonly maxImageDimension?: number
  /**
   * Whether to inject image reference placeholders into markdown output.
   * When `true` (default), image references like `![Image 1](embedded:p1_i0)`
   * are appended to the markdown. Set to `false` to extract images as data
   * without polluting the markdown output.
   */
  readonly injectPlaceholders?: boolean
  /** Automatically adjust DPI based on image content. */
  readonly autoAdjustDpi?: boolean
  /** Minimum DPI threshold. */
  readonly minDpi?: number
  /** Maximum DPI threshold. */
  readonly maxDpi?: number
  /**
   * Maximum number of image objects to extract per PDF page.
   *
   * Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
   * can trigger extremely long or indefinite extraction times when every image
   * object on a dense page is decoded individually via the PDF extractor. Setting this
   * limit causes kreuzberg to stop collecting individual images once the count
   * per page reaches the cap and emit a warning instead.
   *
   * `None` (default) means no limit; all images are extracted.
   */
  readonly maxImagesPerPage?: number
  /**
   * When `true` (default), extracted images are classified by kind and grouped
   * into clusters where they appear to belong to one figure.
   */
  readonly classify?: boolean
  /**
   * When `true`, full-page renders produced during OCR preprocessing are captured
   * and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`.
   *
   * **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
   * document-level OCR bypass is active (whole-document backend). When OCR is
   * enabled and this flag is set but the active backend skips per-page rendering,
   * a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
   *
   * Defaults to `false`. Enable when downstream consumers need page thumbnails
   * (e.g. citation previews, visual grounding).
   */
  readonly includePageRasters?: boolean
  /**
   * Run OCR on extracted images and include the recognized text in the document content.
   *
   * When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
   * are processed with the configured OCR backend. Set to `false` to extract images
   * without OCR processing, even when OCR is enabled.
   */
  readonly runOcrOnImages?: boolean
  /**
   * When `true`, image OCR results are rendered as plain text without the
   * `![...](...)` markdown placeholder. Only takes effect when `run_ocr_on_images`
   * is also `true`.
   */
  readonly ocrTextOnly?: boolean
  /**
   * When `true` and `ocr_text_only` is `false`, append the OCR text after
   * the image placeholder in the rendered output.
   */
  readonly appendOcrText?: boolean
}
/**
 * Heuristic classification of what an image likely depicts.
 *
 * Being heuristic, the classification is a best guess; `Unknown` is used when
 * no kind can be assigned with reasonable confidence.
 */
export declare enum ImageKind {
  /** Photographic image (natural scene, photograph). */
  Photograph = "photograph",
  /** Technical or schematic diagram. */
  Diagram = "diagram",
  /** Chart, graph, or plot. */
  Chart = "chart",
  /** Freehand or technical drawing. */
  Drawing = "drawing",
  /** Text-heavy image (scanned text, document). */
  TextBlock = "text_block",
  /** Decorative element or border. */
  Decoration = "decoration",
  /** Logo or brand mark. */
  Logo = "logo",
  /** Small icon. */
  Icon = "icon",
  /** Fragment of a larger tiled image (tile of a technical drawing). */
  TileFragment = "tile_fragment",
  /** Mask or transparency map. */
  Mask = "mask",
  /** Full-page render produced during OCR preprocessing; used as a citation thumbnail. */
  PageRaster = "page_raster",
  /** Could not classify with reasonable confidence. */
  Unknown = "unknown",
}
/**
 * Image metadata extracted from image files.
 *
 * Includes dimensions, format, and EXIF data. Every field is optional.
 */
export interface ImageMetadata {
  /** Image width in pixels. */
  readonly width?: number
  /** Image height in pixels. */
  readonly height?: number
  /** Image format (e.g., "PNG", "JPEG", "TIFF"). */
  readonly format?: string
  /** EXIF metadata tags, as a string-to-string map. */
  readonly exif?: Record<string, string>
}
/**
 * Image element metadata.
 *
 * Used for the entries of `HtmlMetadata.images`.
 */
export interface ImageMetadataType {
  /** Image source (URL, data URI, or SVG content). */
  readonly src: string
  /** Alternative text from the `alt` attribute. */
  readonly alt?: string
  /** Title attribute. */
  readonly title?: string
  /**
   * Image dimensions as (width, height) if available.
   *
   * Typed as a plain number array; the two-element layout is a documented
   * convention rather than something the compiler enforces.
   */
  readonly dimensions?: Array<number>
  /** Image type classification. */
  readonly imageType: ImageType
  /**
   * Additional attributes as key-value pairs.
   *
   * NOTE(review): each inner array is presumably a `[key, value]` pair;
   * confirm before indexing into it.
   */
  readonly attributes: Array<Array<string>>
}
/**
 * Image preprocessing configuration for OCR.
 *
 * These settings control how images are preprocessed before OCR to improve
 * text recognition quality. Different preprocessing strategies work better
 * for different document types.
 */
export interface ImagePreprocessingConfig {
  /** Target DPI for the image (300 is standard, 600 for small text). */
  readonly targetDpi?: number
  /** Auto-detect and correct image rotation. */
  readonly autoRotate?: boolean
  /** Correct skew (tilted images). */
  readonly deskew?: boolean
  /** Remove noise from the image. */
  readonly denoise?: boolean
  /** Enhance contrast for better text visibility. */
  readonly contrastEnhance?: boolean
  /**
   * Binarization method: "otsu", "sauvola", "adaptive".
   *
   * Typed as a plain `string`, so the compiler does not restrict it to the
   * values listed above.
   */
  readonly binarizationMethod?: string
  /** Invert colors (white text on black → black on white). */
  readonly invertColors?: boolean
}
/**
 * Image preprocessing metadata.
 *
 * Tracks the transformations applied to an image during OCR preprocessing,
 * including DPI normalization, resizing, and resampling.
 *
 * The pair-valued fields (`originalDimensions`, `originalDpi`,
 * `newDimensions`) are typed as plain number arrays; their two-element
 * layout is a documented convention rather than something the compiler
 * enforces.
 */
export interface ImagePreprocessingMetadata {
  /** Original image dimensions (width, height) in pixels. */
  readonly originalDimensions: Array<number>
  /** Original image DPI (horizontal, vertical). */
  readonly originalDpi: Array<number>
  /** Target DPI from configuration. */
  readonly targetDpi: number
  /** Scaling factor applied to the image. */
  readonly scaleFactor: number
  /** Whether DPI was auto-adjusted based on content. */
  readonly autoAdjusted: boolean
  /** Final DPI after processing. */
  readonly finalDpi: number
  /** New dimensions after resizing (if resized). */
  readonly newDimensions?: Array<number>
  /** Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.). */
  readonly resampleMethod: string
  /** Whether dimensions were clamped to max_image_dimension. */
  readonly dimensionClamped: boolean
  /** Calculated optimal DPI (if auto_adjust_dpi enabled). */
  readonly calculatedDpi?: number
  /** Whether resize was skipped (dimensions already optimal). */
  readonly skippedResize: boolean
  /** Error message if resize failed. */
  readonly resizeError?: string
}
/**
* Image type classification.
*
* Member values are kebab-case strings (e.g. `"data-uri"`).
*/
export declare enum ImageType {
/** Data URI image. */
DataUri = "data-uri",
/** Inline SVG. */
InlineSvg = "inline-svg",
/** External image URL. */
External = "external",
/** Relative path image. */
Relative = "relative",
}
/**
* Inline element within a block.
*
* Represents text with formatting, links, images, etc.
*/
export interface InlineElement {
/** Type of inline element (see `InlineType`). */
readonly elementType: InlineType
/** Text content. */
readonly content: string
/**
* Element attributes.
*
* NOTE(review): declared as a single string; the encoding of the
* attributes inside that string is not visible here — confirm before parsing.
*/
readonly attributes?: string
/** Additional metadata (e.g., href for links, src/alt for images). */
readonly metadata?: Record<string, string>
}
/**
* Types of inline elements in Djot.
*
* Member values are lower snake_case strings (e.g. `"raw_inline"`,
* `"footnote_ref"`).
*/
export declare enum InlineType {
Text = "text",
Strong = "strong",
Emphasis = "emphasis",
Highlight = "highlight",
Subscript = "subscript",
Superscript = "superscript",
Insert = "insert",
Delete = "delete",
Code = "code",
Link = "link",
Image = "image",
Span = "span",
Math = "math",
RawInline = "raw_inline",
FootnoteRef = "footnote_ref",
Symbol = "symbol",
}
/**
* JATS (Journal Article Tag Suite) metadata.
*
* All fields are optional. The field comments below are inferred from the
* field names and types only — TODO confirm against the JATS extractor.
*/
export interface JatsMetadata {
/** Copyright text (presumably the article's copyright statement). */
readonly copyright?: string
/** License text or identifier (presumably from the article's license information). */
readonly license?: string
/** History dates as a string-to-string map (presumably date kind → date value). */
readonly historyDates?: Record<string, string>
/** Contributor roles (see `ContributorRole`). */
readonly contributorRoles?: Array<ContributorRole>
}
/**
* Extracted keyword with metadata.
*
* Pairs the keyword text with its score and the algorithm that produced it.
*/
export interface Keyword {
/** The keyword text. */
readonly text: string
/** Relevance score (higher is better, algorithm-specific range). */
readonly score: number
/** Algorithm that extracted this keyword (see `KeywordAlgorithm`). */
readonly algorithm: KeywordAlgorithm
/** Optional positions where the keyword appears in the text (character offsets). */
readonly positions?: Array<number>
}
/**
* Keyword algorithm selection.
*
* Member values are lowercase strings (`"yake"`, `"rake"`).
*/
export declare enum KeywordAlgorithm {
/** YAKE (Yet Another Keyword Extractor) - statistical approach. */
Yake = "yake",
/** RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based. */
Rake = "rake",
}
/**
* Keyword extraction configuration.
*
* All fields are optional in this declaration.
*/
export interface KeywordConfig {
/** Algorithm to use for extraction. */
readonly algorithm?: KeywordAlgorithm
/** Maximum number of keywords to extract (default: 10). */
readonly maxKeywords?: number
/**
* Minimum score threshold (0.0-1.0, default: 0.0).
*
* Keywords with scores below this threshold are filtered out.
* Note: Score ranges differ between algorithms.
*/
readonly minScore?: number
/**
* N-gram range for keyword extraction, given as `[min, max]`.
*
* [1, 1] = unigrams only
* [1, 2] = unigrams and bigrams
* [1, 3] = unigrams, bigrams, and trigrams (default)
*
* Declared as a plain number array; the two-element shape is not enforced
* by the type.
*/
readonly ngramRange?: Array<number>
/**
* Language code for stopword filtering (e.g., "en", "de", "fr").
*
* If omitted, no stopword filtering is applied.
*/
readonly language?: string
/** YAKE-specific tuning parameters. */
readonly yakeParams?: YakeParams
/** RAKE-specific tuning parameters. */
readonly rakeParams?: RakeParams
}
/**
* Language detection configuration.
*
* All fields are optional in this declaration.
*/
export interface LanguageDetectionConfig {
/** Enable language detection. */
readonly enabled?: boolean
/** Minimum confidence threshold (0.0-1.0). */
readonly minConfidence?: number
/** Detect multiple languages in the document. */
readonly detectMultiple?: boolean
}
/**
* The 17 canonical document layout classes.
*
* All model backends (RT-DETR, YOLO, etc.) map their native class IDs
* to this shared set. Models with fewer classes (DocLayNet: 11, PubLayNet: 5)
* map to the closest equivalent.
*
* Wire format is snake_case in all serializers (JSON, TOML, YAML); the enum
* member values below are those snake_case strings.
*/
export declare enum LayoutClass {
Caption = "caption",
Footnote = "footnote",
Formula = "formula",
ListItem = "list_item",
PageFooter = "page_footer",
PageHeader = "page_header",
Picture = "picture",
SectionHeader = "section_header",
Table = "table",
Text = "text",
Title = "title",
DocumentIndex = "document_index",
Code = "code",
CheckboxSelected = "checkbox_selected",
CheckboxUnselected = "checkbox_unselected",
Form = "form",
KeyValueRegion = "key_value_region",
}
/**
* A single layout detection result.
*
* All three fields are required.
*/
export interface LayoutDetection {
/** Layout class assigned to this detection (see `LayoutClass`). */
readonly className: LayoutClass
/** Confidence value reported for this detection (range not specified here). */
readonly confidence: number
/** Bounding box of the detection (see `BBox`). */
readonly bbox: BBox
}
/**
* Layout detection configuration.
*
* Controls layout detection behavior in the extraction pipeline.
* When set on `ExtractionConfig`, layout detection is enabled for PDF
* extraction.
*/
export interface LayoutDetectionConfig {
/** Confidence threshold override (omit to use the model default). */
readonly confidenceThreshold?: number
/** Whether to apply postprocessing heuristics (default: true). */
readonly applyHeuristics?: boolean
/**
* Table structure recognition model.
*
* Controls which model is used for table cell detection within layout-detected
* table regions. Defaults to the TATR model (`TableModel::Tatr` in the core).
*/
readonly tableModel?: TableModel
/**
* Hardware acceleration for ONNX models (layout detection + table structure).
*
* When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
* is used for inference. When omitted, the provider is auto-selected per
* platform.
*/
readonly acceleration?: AccelerationConfig
}
/**
* A detected layout region on a page.
*
* When layout detection is enabled, each page may have layout regions
* identifying different content types (text, pictures, tables, etc.)
* with confidence scores and spatial positions.
*
* All fields are optional in this declaration.
*/
export interface LayoutRegion {
/**
* Layout class name (e.g. "picture", "table", "text", "section_header").
*
* Typed as a plain string here, unlike `LayoutDetection.className`, which
* uses the `LayoutClass` enum.
*/
readonly className?: string
/** Confidence score from the layout detection model (0.0 to 1.0). */
readonly confidence?: number
/** Bounding box in document coordinate space. */
readonly boundingBox?: BoundingBox
/** Fraction of the page area covered by this region (0.0 to 1.0). */
readonly areaFraction?: number
}
/**
* Link element metadata.
*
* Describes a single link (target, text, classification and attributes).
*/
export interface LinkMetadata {
/** The href URL value. */
readonly href: string
/** Link text content (normalized). */
readonly text: string
/** Optional title attribute. */
readonly title?: string
/** Link type classification (see `LinkType`). */
readonly linkType: LinkType
/** Rel attribute values. */
readonly rel: Array<string>
/**
* Additional attributes as key-value pairs.
*
* Presumably each inner array is a `[key, value]` pair; the declared type
* does not enforce that shape, so validate before destructuring.
*/
readonly attributes: Array<Array<string>>
}
/**
* Link type classification.
*
* Member values are lowercase strings (e.g. `"anchor"`).
*/
export declare enum LinkType {
/** Anchor link (#section). */
Anchor = "anchor",
/** Internal link (same domain). */
Internal = "internal",
/** External link (different domain). */
External = "external",
/** Email link (mailto:). */
Email = "email",
/** Phone link (tel:). */
Phone = "phone",
/** Other link type. */
Other = "other",
}
/**
* Type of list detection.
*
* Member values are PascalCase strings identical to the member names
* (e.g. `"Bullet"`), unlike the snake_case or lowercase values used by
* enums such as `LayoutClass` and `LinkType`.
*/
export declare enum ListType {
/** Bullet points (-, *, •, etc.) */
Bullet = "Bullet",
/** Numbered lists (1., 2., etc.) */
Numbered = "Numbered",
/** Lettered lists (a., b., A., B., etc.) */
Lettered = "Lettered",
/** Indented items */
Indented = "Indented",
}
/**
* Configuration for an LLM provider/model via liter-llm.
*
* Each feature (VLM OCR, VLM embeddings, structured extraction) carries
* its own `LlmConfig`, allowing different providers per feature.
*
* @example
* Equivalent TOML config-file section (keys are snake_case in config files):
* ```toml
* [structured_extraction.llm]
* model = "openai/gpt-4o"
* api_key = "sk-..." # or use KREUZBERG_LLM_API_KEY env var
* ```
*/
export interface LlmConfig {
/**
* Provider/model string using liter-llm routing format.
*
* Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
* `"groq/llama-3.1-70b-versatile"`.
*/
readonly model?: string
/**
* API key for the provider. When omitted, liter-llm falls back to
* the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
*/
readonly apiKey?: string
/** Custom base URL override for the provider endpoint. */
readonly baseUrl?: string
/** Request timeout in seconds (default: 60). */
readonly timeoutSecs?: number
/** Maximum retry attempts (default: 3). */
readonly maxRetries?: number
/** Sampling temperature for generation tasks. */
readonly temperature?: number
/** Maximum tokens to generate. */
readonly maxTokens?: number
}
/**
* Token usage and cost data for a single LLM call made during extraction.
*
* Populated when VLM OCR, structured extraction, or LLM-based embeddings
* are used. Multiple entries may be present when multiple LLM calls occur
* within one extraction (e.g. VLM OCR + structured extraction).
*
* All fields are optional in this declaration.
*/
export interface LlmUsage {
/** The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514"). */
readonly model?: string
/**
* The pipeline stage that triggered this LLM call
* (e.g. "vlm_ocr", "structured_extraction", "embeddings").
*/
readonly source?: string
/** Number of input/prompt tokens consumed. */
readonly inputTokens?: number
/** Number of output/completion tokens generated. */
readonly outputTokens?: number
/** Total tokens (input + output). */
readonly totalTokens?: number
/** Estimated cost in USD based on the provider's published pricing. */
readonly estimatedCost?: number
/** Why the model stopped generating (e.g. "stop", "length", "content_filter"). */
readonly finishReason?: string
}
/**
* Extraction result metadata.
*
* Contains common fields applicable to all formats, format-specific metadata
* via a discriminated union, and additional custom fields from postprocessors.
*
* All fields are optional in this declaration.
*/
export interface Metadata {
/** Document title. */
readonly title?: string
/** Document subject or description. */
readonly subject?: string
/** Primary author(s) - always an array for consistency, even for a single author. */
readonly authors?: Array<string>
/** Keywords/tags - always an array for consistency. */
readonly keywords?: Array<string>
/** Primary language (ISO 639 code). */
readonly language?: string
/** Creation timestamp (ISO 8601 format). */
readonly createdAt?: string
/** Last modification timestamp (ISO 8601 format). */
readonly modifiedAt?: string
/** User who created the document. */
readonly createdBy?: string
/** User who last modified the document. */
readonly modifiedBy?: string
/** Page/slide/sheet structure with boundaries. */
readonly pages?: PageStructure
/**
* Format-specific metadata (discriminated union).
*
* Contains detailed metadata specific to the document format.
* Serialized as a nested `"format"` object with a `format_type` discriminator field.
*/
readonly format?: FormatMetadata
/** Image preprocessing metadata (when OCR preprocessing was applied). */
readonly imagePreprocessing?: ImagePreprocessingMetadata
/** JSON schema (for structured data extraction). */
readonly jsonSchema?: JsonValue
/** Error metadata (for batch operations). */
readonly error?: ErrorMetadata
/**
* Extraction duration in milliseconds (for benchmarking).
*
* This field is populated by batch extraction to provide per-file timing
* information. It is absent for single-file extraction (which uses external timing).
*/
readonly extractionDurationMs?: number
/** Document category (from frontmatter or classification). */
readonly category?: string
/** Document tags (from frontmatter). */
readonly tags?: Array<string>
/** Document version string (from frontmatter). */
readonly documentVersion?: string
/** Abstract or summary text (from frontmatter). */
readonly abstractText?: string
/**
* Output format identifier (e.g., "markdown", "html", "text").
*
* Set by the output format pipeline stage when format conversion is applied.
* Previously stored in `metadata.additional["output_format"]`.
*/
readonly outputFormat?: string
/**
* Whether OCR was used during extraction.
*
* Set to `true` whenever the extraction pipeline ran an OCR backend
* (Tesseract, PaddleOCR, VLM, etc.) and used that output as the primary
* or fallback text. `false` means native text extraction was used exclusively.
*/
readonly ocrUsed?: boolean
/**
* Additional custom fields from postprocessors.
*
* Serialized as a nested `"additional"` object (not flattened at root level).
* (Core-side detail: keys are `Cow<'static, str>` so static string keys avoid
* allocation; on this side they are ordinary string keys.)
*/
readonly additional?: Record<string, JsonValue>
}
/**
* Combined paths to all models needed for OCR (backward compatibility).
*
* All four paths are required strings.
*/
export interface ModelPaths {
/** Path to the detection model directory. */
readonly detModel: string
/** Path to the classification model directory. */
readonly clsModel: string
/** Path to the recognition model directory. */
readonly recModel: string
/** Path to the character dictionary file. */
readonly dictFile: string
}
/**
* Tagged union for node content. Each variant carries only type-specific data.
*
* The discriminant is the `node_type` property (the core uses
* `#[serde(tag = "node_type")]` to avoid a "type" keyword collision in
* Go/Java/TypeScript bindings). Narrow on `node_type` before reading
* variant-specific fields.
*
* Note the mixed casing: the discriminant key and its values are snake_case
* (`node_type`, `'list_item'`, `'page_break'`), while payload fields are
* camelCase (`imageIndex`, `headingLevel`, `headingText`).
*/
export type NodeContent =
| { node_type: 'title'; text: string }
| { node_type: 'heading'; level: number; text: string }
| { node_type: 'paragraph'; text: string }
| { node_type: 'list'; ordered: boolean }
| { node_type: 'list_item'; text: string }
| { node_type: 'table'; grid: TableGrid }
| { node_type: 'image'; description: string; imageIndex: number; src: string }
| { node_type: 'code'; text: string; language: string }
| { node_type: 'quote' }
| { node_type: 'formula'; text: string }
| { node_type: 'footnote'; text: string }
| { node_type: 'group'; label: string; headingLevel: number; headingText: string }
| { node_type: 'page_break' }
| { node_type: 'slide'; number: number; title: string }
| { node_type: 'definition_list' }
| { node_type: 'definition_item'; term: string; definition: string }
| { node_type: 'citation'; key: string; text: string }
| { node_type: 'admonition'; kind: string; title: string }
| { node_type: 'raw_block'; format: string; content: string }
| { node_type: 'metadata_block'; entries: Array<Array<string>> }
/**
* Contract for OCR backend plugins (generated from the core `OcrBackend` trait).
*
* Implement this interface to add custom OCR capabilities. OCR backends can be:
* - Native Rust implementations (like Tesseract)
* - FFI bridges to Python libraries (like EasyOCR, PaddleOCR)
* - Cloud-based OCR services (Google Vision, AWS Textract, etc.)
*
* Members marked with `?` are optional to implement.
*
* # Thread Safety
*
* OCR backends must be thread-safe (`Send + Sync` on the core side) to support
* concurrent processing.
*
* NOTE(review): several members are declared as returning `string` although
* their descriptions speak of booleans, lists, or an `ExtractionResult`.
* Presumably the bridge expects a serialized value — confirm the expected
* encoding against the native binding before implementing.
*/
export interface OcrBackend {
/**
* Name of this backend.
*
* Presumably the name matched against `OcrConfig.backend` and
* `OcrPipelineStage.backend` — TODO confirm.
*/
name(): string
/**
* Process an image and extract text via OCR.
* @param imageBytes - Raw image data (JPEG, PNG, TIFF, etc.)
*
* @param config - OCR configuration (language, PSM mode, etc.)
*
* @returns Described by the core as an `ExtractionResult` containing the
* extracted text and metadata. NOTE(review): the declared type here is
* `Promise<string>` — confirm what the string must contain.
*
* @throws The core documents these error kinds:
* - `KreuzbergError::Ocr` - OCR processing failed
* - `KreuzbergError::Validation` - Invalid image format or configuration
* - `KreuzbergError::Io` - I/O errors (these always bubble up)
*
* # Reading `backend_options`
*
* Backends that support runtime tuning can read `config.backend_options`
* (`backendOptions` on `OcrConfig` in this file) and
* deserialize only the keys they care about. Unknown keys are silently ignored,
* so multiple backends can coexist in a pipeline without key conflicts.
*
* The example below is the core (Rust) trait implementation, kept for reference:
*
* ```rust
* async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
* // Read backend-specific options; unknown keys are silently ignored.
* let fast_mode = config.backend_options
* .as_ref()
* .and_then(|v| v.get("mode"))
* .and_then(|v| v.as_str())
* .map(|s| s == "fast")
* .unwrap_or(false);
*
* if image_bytes.is_empty() {
* return Err(kreuzberg::KreuzbergError::Validation {
* message: "Empty image data".to_string(),
* source: None,
* });
* }
*
* let text = if fast_mode {
* "Fast OCR result".to_string()
* } else {
* format!("Extracted text in language: {}", config.language)
* };
*
* Ok(ExtractionResult {
* content: text,
* mime_type: Cow::Borrowed("text/plain"),
* ..Default::default()
* })
* }
* ```
*/
processImage(imageBytes: Uint8Array, config?: OcrConfig | undefined | null): Promise<string>
/**
* Process a file and extract text via OCR.
*
* Default implementation reads the file and calls `processImage`.
* Override for custom file handling or optimizations.
* @param path - Path to the image file
*
* @param config - OCR configuration
*
* @throws Same as `processImage`, plus file I/O errors.
*/
processImageFile?(path: string, config?: OcrConfig | undefined | null): Promise<string>
/**
* Check if this backend supports a given language code.
* @param lang - ISO 639-2/3 language code (e.g., "eng", "deu", "fra")
*
* @returns Described by the core as `true` if the language is supported,
* `false` otherwise. NOTE(review): the declared return type is `string`,
* not `boolean` — confirm the expected encoding.
*/
supportsLanguage(lang: string): string
/**
* Get the backend type identifier.
* @returns The backend type value. Declared as `string`; presumably one of
* the `OcrBackendType` values — TODO confirm.
*/
backendType(): string
/**
* Optional: Get a list of all supported languages.
*
* Defaults to empty list. Override to provide comprehensive language support info.
*
* NOTE(review): the declared return type is `string`, not an array —
* confirm how the list must be encoded.
*/
supportedLanguages?(): string
/**
* Optional: Check if the backend supports table detection.
*
* Defaults to `false`. Override if your backend can detect and extract tables.
*
* NOTE(review): the declared return type is `string`, not `boolean` —
* confirm the expected encoding.
*/
supportsTableDetection?(): string
/**
* Optional: Check if the backend supports direct document-level processing
* (e.g. for PDFs).
*
* Defaults to `false`. Override if the backend has optimized document processing.
*
* NOTE(review): the declared return type is `string`, not `boolean` —
* confirm the expected encoding.
*/
supportsDocumentProcessing?(): string
/**
* Optional: Process a document file directly via OCR.
*
* Only called if `supportsDocumentProcessing` reports support.
* @param path - Path to the document file (e.g. .pdf)
*
* @param config - OCR configuration
*/
processDocument?(path: string, config?: OcrConfig | undefined | null): Promise<string>
}
/**
* OCR backend types.
*
* Member values are strings identical to the member names (e.g.
* `"Tesseract"`), not the lowercase backend names used in
* `OcrConfig.backend`.
*/
export declare enum OcrBackendType {
/** Tesseract OCR (native Rust binding). */
Tesseract = "Tesseract",
/** EasyOCR (Python-based, via FFI). */
EasyOCR = "EasyOCR",
/** PaddleOCR (Python-based, via FFI). */
PaddleOCR = "PaddleOCR",
/** Custom/third-party OCR backend. */
Custom = "Custom",
}
/**
* Bounding geometry for an OCR element.
*
* Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
* (from PaddleOCR and rotated text detection). Narrow on the `type`
* discriminant before reading variant-specific fields.
*
* NOTE(review): the quadrilateral variant declares `points` as a `string`
* rather than an array of coordinates; the encoding is not visible here —
* confirm before parsing.
*/
export type OcrBoundingGeometry =
| { type: 'rectangle'; left: number; top: number; width: number; height: number }
| { type: 'quadrilateral'; points: string }
/**
* Confidence scores for an OCR element.
*
* Separates detection confidence (how confident that text exists at this location)
* from recognition confidence (how confident about the actual text content).
*/
export interface OcrConfidence {
/**
* Detection confidence: how confident the OCR engine is that text exists here.
*
* PaddleOCR provides this as `box_score`, Tesseract doesn't have a direct equivalent.
* Range: 0.0 to 1.0 (omitted if not available).
*/
readonly detection?: number
/**
* Recognition confidence: how confident about the text content.
*
* Range: 0.0 to 1.0. Declared optional, so it may be absent.
*/
readonly recognition?: number
}
/**
* OCR configuration.
*
* All fields are optional in this declaration.
*/
export interface OcrConfig {
/**
* Whether OCR is enabled.
*
* Setting `enabled: false` is a shorthand for `disable_ocr: true` on the parent
* `ExtractionConfig`. Images return
* metadata only; PDFs use native text extraction without OCR fallback.
*
* Defaults to `true`. When `false`, all other OCR settings are ignored.
*/
readonly enabled?: boolean
/** OCR backend: tesseract, easyocr, paddleocr, or "vlm" (see `vlmConfig`). */
readonly backend?: string
/** Language code (e.g., "eng", "deu"). */
readonly language?: string
/** Tesseract-specific configuration (optional). */
readonly tesseractConfig?: TesseractConfig
/** Output format for OCR results (optional, for format conversion). */
readonly outputFormat?: OutputFormat
/** PaddleOCR-specific configuration (optional, JSON passthrough). */
readonly paddleOcrConfig?: JsonValue
/**
* Arbitrary per-call options passed through to the backend unchanged.
*
* Custom OCR backends and built-in backends that support runtime tuning
* can read this value and deserialize the keys they care about. Keys
* unknown to the backend are silently ignored.
*
* This is the recommended extension point for per-call parameters that
* are not covered by the typed fields above (e.g. mode switching,
* preprocessing flags, inference batch size).
*
* **Scope:** when `pipeline` is omitted, this value is propagated to the
* primary stage of the auto-constructed pipeline. When `pipeline` is
* explicitly set, this field has **no effect** — the caller must set
* `OcrPipelineStage.backendOptions` directly on the relevant stage(s)
* instead.
*
* Example:
* ```json
* { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
* ```
*/
readonly backendOptions?: JsonValue
/** OCR element extraction configuration. */
readonly elementConfig?: OcrElementConfig
/**
* Quality thresholds for the native-text-to-OCR fallback decision.
* When omitted, uses compiled defaults (matching previous hardcoded behavior).
*/
readonly qualityThresholds?: OcrQualityThresholds
/**
* Multi-backend OCR pipeline configuration. When set, enables weighted
* fallback across multiple OCR backends based on output quality.
* When omitted, uses the single `backend` field (same as today).
*/
readonly pipeline?: OcrPipelineConfig
/**
* Enable automatic page rotation based on orientation detection.
*
* When enabled, uses Tesseract's `DetectOrientationScript()` to detect
* page orientation (0/90/180/270 degrees) before OCR. If the page is
* rotated with high confidence, the image is corrected before recognition.
* This is critical for handling rotated scanned documents.
*/
readonly autoRotate?: boolean
/**
* VLM (Vision Language Model) OCR configuration.
*
* Required when `backend` is `"vlm"`. Uses liter-llm to send page
* images to a vision model for text extraction.
*/
readonly vlmConfig?: LlmConfig
/**
* Custom Jinja2 prompt template for VLM OCR.
*
* When omitted, uses the default template. Available variables:
* - `{{ language }}` — The document language code (e.g., "eng", "deu").
*/
readonly vlmPrompt?: string
/**
* Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
*
* Not user-configurable via config files — injected at runtime from
* `ExtractionConfig::acceleration` before each `process_image` call.
*/
readonly acceleration?: AccelerationConfig
/**
* Caller-supplied Tesseract `traineddata` bytes per language code.
*
* Primary use case is the WASM build, which has no filesystem and cannot
* download tessdata at runtime. Native builds typically rely on
* `TessdataManager` and ignore this field. When present, the WASM
* Tesseract backend prefers these bytes over its compile-time-bundled
* English data.
*
* Skipped by serde to keep config files small — supply via the typed API
* at runtime.
*/
readonly tessdataBytes?: Record<string, Uint8Array>
}
/**
* A unified OCR element representing detected text with full metadata.
*
* This is the primary type for structured OCR output, preserving all information
* from both Tesseract and PaddleOCR backends.
*
* All fields are optional in this declaration.
*/
export interface OcrElement {
/** The recognized text content. */
readonly text?: string
/** Bounding geometry (rectangle or quadrilateral; see `OcrBoundingGeometry`). */
readonly geometry?: OcrBoundingGeometry
/** Confidence scores for detection and recognition. */
readonly confidence?: OcrConfidence
/** Hierarchical level (word, line, block, page). */
readonly level?: OcrElementLevel
/** Rotation information (if detected). */
readonly rotation?: OcrRotation
/** Page number (1-indexed). */
readonly pageNumber?: number
/**
* Parent element ID for hierarchical relationships.
*
* Only used for Tesseract output which has word -> line -> block hierarchy.
*/
readonly parentId?: string
/** Backend-specific metadata that doesn't fit the unified schema. */
readonly backendMetadata?: Record<string, JsonValue>
}
/**
* Configuration for OCR element extraction.
*
* Controls how OCR elements are extracted and filtered.
* All fields are optional in this declaration.
*/
export interface OcrElementConfig {
/**
* Whether to include OCR elements in the extraction result.
*
* When true, the `ocr_elements` field in `ExtractionResult` will be populated
* (core-side field name).
*/
readonly includeElements?: boolean
/**
* Minimum hierarchical level to include.
*
* Elements below this level (e.g., words when `minLevel` is `Line`) will be excluded.
*/
readonly minLevel?: OcrElementLevel
/**
* Minimum recognition confidence threshold (0.0-1.0).
*
* Elements with confidence below this threshold will be filtered out.
*/
readonly minConfidence?: number
/**
* Whether to build hierarchical relationships between elements.
*
* When true, `OcrElement.parentId` fields will be populated based on spatial
* containment. Only meaningful for Tesseract output.
*/
readonly buildHierarchy?: boolean
}
/**
* Hierarchical level of an OCR element.
*
* Maps to Tesseract's page segmentation hierarchy and provides
* equivalent semantics for PaddleOCR.
*
* Member values are lowercase strings (e.g. `"word"`).
*/
export declare enum OcrElementLevel {
/** Individual word. */
Word = "word",
/** Line of text (default for PaddleOCR). */
Line = "line",
/** Paragraph or text block. */
Block = "block",
/** Page-level element. */
Page = "page",
}
/**
* OCR extraction result.
*
* Result of performing OCR on an image or scanned document,
* including recognized text and detected tables.
*/
export interface OcrExtractionResult {
/** Recognized text content. */
readonly content: string
/** Original MIME type of the processed image. */
readonly mimeType: string
/** OCR processing metadata (confidence scores, language, etc.). */
readonly metadata: Record<string, JsonValue>
/** Tables detected and extracted via OCR. */
readonly tables: Array<OcrTable>
/**
* Structured OCR elements with bounding boxes and confidence scores.
* Available when TSV output is requested or table detection is enabled.
*/
readonly ocrElements?: Array<OcrElement>
/**
* Structured document produced from hOCR parsing.
* Carries paragraph structure, bounding boxes, and confidence scores
* that the flattened `content` string discards.
*
* NOTE(review): declared as a `string`; presumably a serialized form of
* the structured document — confirm the encoding before parsing.
*/
readonly internalDocument?: string
}
/**
* OCR processing metadata.
*
* Captures information about OCR processing configuration and results.
* All fields are optional in this declaration.
*/
export interface OcrMetadata {
/** OCR language code(s) used. */
readonly language?: string
/** Tesseract Page Segmentation Mode (PSM). */
readonly psm?: number
/** Output format (e.g., "text", "hocr"). */
readonly outputFormat?: string
/** Number of tables detected. */
readonly tableCount?: number
/** Table row count (presumably for the detected table(s); exact scope not visible here — TODO confirm). */
readonly tableRows?: number
/** Table column count (presumably for the detected table(s); exact scope not visible here — TODO confirm). */
readonly tableCols?: number
}
/**
* Multi-backend OCR pipeline with quality-based fallback.
*
* Backends are tried in priority order (highest first). After each backend
* produces output, quality is evaluated. If it meets
* `qualityThresholds.pipelineMinQuality`, the result is accepted. Otherwise
* the next backend is tried.
*
* Both fields are required.
*/
export interface OcrPipelineConfig {
/** List of backends to try. Sorted by priority (descending) at runtime. */
readonly stages: Array<OcrPipelineStage>
/** Quality thresholds for deciding whether to accept a result or try the next backend. */
readonly qualityThresholds: OcrQualityThresholds
}
/**
* A single backend stage in the OCR pipeline.
*
* `backend` and `priority` are required; the remaining fields are optional
* per-stage overrides.
*/
export interface OcrPipelineStage {
/** Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name. */
readonly backend: string
/** Priority weight (higher = tried first). Stages are sorted by priority descending. */
readonly priority: number
/** Language override for this stage (omit to use the parent `OcrConfig.language`). */
readonly language?: string
/** Tesseract-specific config override for this stage. */
readonly tesseractConfig?: TesseractConfig
/** PaddleOCR-specific config for this stage. */
readonly paddleOcrConfig?: JsonValue
/** VLM config override for this pipeline stage. */
readonly vlmConfig?: LlmConfig
/**
* Arbitrary per-call options passed through to the backend unchanged.
*
* Backends that support runtime tuning (mode switching, preprocessing
* flags, inference parameters, etc.) read this value and deserialize
* the keys they care about. Keys unknown to the backend are silently
* ignored, so options from different backends can coexist in the same
* config without conflict.
*
* Example (custom backend):
* ```json
* { "mode": "fast", "enable_layout": true }
* ```
*/
readonly backendOptions?: JsonValue
}
/**
* Quality thresholds for OCR fallback decisions and pipeline quality gating.
*
* All fields are optional and default to the values that match the previous
* hardcoded behavior, so leaving a field unset preserves existing semantics
* (equivalent to `OcrQualityThresholds::default()` in the core).
*/
export interface OcrQualityThresholds {
/** Minimum total non-whitespace characters to consider text substantive. */
readonly minTotalNonWhitespace?: number
/** Minimum non-whitespace characters per page on average. */
readonly minNonWhitespacePerPage?: number
/** Minimum character count for a word to be "meaningful". */
readonly minMeaningfulWordLen?: number
/** Minimum count of meaningful words before text is accepted. */
readonly minMeaningfulWords?: number
/** Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric). */
readonly minAlnumRatio?: number
/** Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback. */
readonly minGarbageChars?: number
/** Maximum fraction of short (1-2 char) words before text is considered fragmented. */
readonly maxFragmentedWordRatio?: number
/**
* Critical fragmentation threshold — triggers OCR regardless of meaningful words.
* Normal English text has ~20-30% short words. 80%+ is definitive garbage.
*/
readonly criticalFragmentedWordRatio?: number
/** Minimum average word length. Below this with enough words indicates garbled extraction. */
readonly minAvgWordLength?: number
/** Minimum word count before average word length check applies. */
readonly minWordsForAvgLengthCheck?: number
/** Minimum consecutive word repetition ratio to detect column scrambling. */
readonly minConsecutiveRepeatRatio?: number
/** Minimum word count before consecutive repetition check is applied. */
readonly minWordsForRepeatCheck?: number
/** Minimum character count for "substantive markdown" OCR skip gate. */
readonly substantiveMinChars?: number
/** Minimum character count for "non-text content" OCR skip gate. */
readonly nonTextMinChars?: number
/** Alphanumeric+whitespace ratio threshold for skip decisions. */
readonly alnumWsRatioThreshold?: number
/**
* Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
* If the result from a backend scores below this, the next backend is tried.
*/
readonly pipelineMinQuality?: number
}
/**
* Rotation information for an OCR element.
*
* The angle is required; the confidence is optional.
*/
export interface OcrRotation {
/** Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR). */
readonly angleDegrees: number
/** Confidence score for the rotation detection, when available. */
readonly confidence?: number
}
/**
* Table detected via OCR.
*
* Represents a table structure recognized during OCR processing.
*/
export interface OcrTable {
/** Table cells as a 2D array (rows × columns) of strings. */
readonly cells: Array<Array<string>>
/** Markdown representation of the table. */
readonly markdown: string
/** Page number where the table was found (1-indexed). */
readonly pageNumber: number
/** Bounding box of the table in pixel coordinates (from OCR word positions). */
readonly boundingBox?: OcrTableBoundingBox
}
/**
* Bounding box for an OCR-detected table in pixel coordinates.
*
* Expressed as four edges (left/top/right/bottom), unlike the rectangle
* variant of `OcrBoundingGeometry`, which uses left/top/width/height.
*/
export interface OcrTableBoundingBox {
/** Left x-coordinate (pixels). */
readonly left: number
/** Top y-coordinate (pixels). */
readonly top: number
/** Right x-coordinate (pixels). */
readonly right: number
/** Bottom y-coordinate (pixels). */
readonly bottom: number
}
/**
* Document orientation detection result.
*
* Both fields are required.
*/
export interface OrientationResult {
/** Detected orientation in degrees (0, 90, 180, or 270). */
readonly degrees: number
/** Confidence score (0.0-1.0). */
readonly confidence: number
}
/**
 * Format of the `content` field in `ExtractionResult`.
 *
 * - `Markdown`, `Djot` and `Html` render the content in that markup.
 * - `Plain` returns the raw extracted text.
 * - `Structured` returns JSON carrying full OCR element data, including
 *   bounding boxes and confidence scores.
 */
export declare enum OutputFormat {
  /** Plain text only (the default). */
  Plain = 'plain',
  /** Markdown. */
  Markdown = 'markdown',
  /** Djot markup. */
  Djot = 'djot',
  /** HTML. */
  Html = 'html',
  /** JSON tree with heading-driven sections. */
  Json = 'json',
  /** Structured JSON with full OCR element metadata. */
  Structured = 'structured',
  /**
   * Custom renderer registered via the RendererRegistry.
   * The upstream documentation describes the associated string as the
   * renderer name (e.g., "docx", "latex").
   */
  Custom = 'custom',
}
/**
 * Languages supported by PaddleOCR.
 *
 * Bridges user-friendly language codes and the language identifiers used by
 * paddle-ocr-rs.
 */
export declare enum PaddleLanguage {
  /** English. */
  English = 'English',
  /** Simplified Chinese. */
  Chinese = 'Chinese',
  /** Japanese. */
  Japanese = 'Japanese',
  /** Korean. */
  Korean = 'Korean',
  /** German. */
  German = 'German',
  /** French. */
  French = 'French',
  /** Latin script, which covers most European languages. */
  Latin = 'Latin',
  /** Cyrillic (Russian and related languages). */
  Cyrillic = 'Cyrillic',
  /** Traditional Chinese. */
  TraditionalChinese = 'TraditionalChinese',
  /** Thai. */
  Thai = 'Thai',
  /** Greek. */
  Greek = 'Greek',
  /** East Slavic: Russian, Ukrainian, Belarusian. */
  EastSlavic = 'EastSlavic',
  /** Arabic script: Arabic, Persian, Urdu. */
  Arabic = 'Arabic',
  /** Devanagari: Hindi, Marathi, Sanskrit, Nepali. */
  Devanagari = 'Devanagari',
  /** Tamil. */
  Tamil = 'Tamil',
  /** Telugu. */
  Telugu = 'Telugu',
}
/**
 * Settings for the PaddleOCR backend.
 *
 * Covers text detection and recognition, with multi-language support.
 * The upstream implementation offers a builder pattern for these options.
 * Defaults quoted below are taken from the upstream documentation.
 */
export interface PaddleOcrConfig {
  /** Language code, e.g. "en", "ch", "jpn", "kor", "deu", "fra". */
  readonly language?: string;
  /** Custom cache directory for model files. */
  readonly cacheDir?: string;
  /**
   * Turn on angle classification for rotated text (default: false).
   * It can misfire on short text regions and rotate crops incorrectly
   * before recognition.
   */
  readonly useAngleCls?: boolean;
  /** Turn on table structure detection (default: false). */
  readonly enableTableDetection?: boolean;
  /**
   * Database threshold for text detection (default: 0.3).
   * Range 0.0-1.0; higher values demand more confident detections.
   */
  readonly detDbThresh?: number;
  /**
   * Box threshold used when refining text bounding boxes (default: 0.5).
   * Range 0.0-1.0.
   */
  readonly detDbBoxThresh?: number;
  /**
   * Unclip ratio applied when expanding text bounding boxes (default: 1.6).
   * Governs how far detected text regions are grown.
   */
  readonly detDbUnclipRatio?: number;
  /**
   * Maximum side length of the detection image (default: 960).
   * Larger images may be resized down to this limit for faster inference.
   */
  readonly detLimitSideLen?: number;
  /**
   * Batch size for recognition inference (default: 6), i.e. how many text
   * regions are processed simultaneously.
   */
  readonly recBatchNum?: number;
  /**
   * Pixels of padding added around the image before detection (default: 10).
   * Large values can pull in surrounding content such as table gridlines.
   */
  readonly padding?: number;
  /**
   * Minimum recognition confidence for a text line (default: 0.5).
   * Regions scoring below this threshold are discarded. Corresponds to the
   * `drop_score` parameter of PaddleOCR Python. Range 0.0-1.0.
   */
  readonly dropScore?: number;
  /**
   * Model tier, trading model size against accuracy.
   * - `"mobile"` (default): lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
   * - `"server"`: large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
   */
  readonly modelTier?: string;
}
/**
 * Byte-offset range occupied by one page inside the main content string.
 *
 * Makes it possible to map a byte position back to a page number. Per the
 * upstream documentation, offsets fall on valid UTF-8 character boundaries
 * when the content is built with standard String methods (push_str, push, etc.).
 */
export interface PageBoundary {
  /** Inclusive byte offset at which the page begins (valid UTF-8 boundary). */
  readonly byteStart: number;
  /** Exclusive byte offset at which the page ends (valid UTF-8 boundary). */
  readonly byteEnd: number;
  /** 1-indexed page number. */
  readonly pageNumber: number;
}
/**
 * Options for page extraction and tracking.
 *
 * Determines how pages are extracted, tracked and represented in extraction
 * results. Page tracking is disabled when this configuration is absent (`None`).
 *
 * Chunk metadata page ranges (first_page/last_page) are enabled automatically
 * whenever page boundaries are available and chunking is configured.
 */
export interface PageConfig {
  /** Return pages as a separate array (`ExtractionResult.pages`). */
  readonly extractPages?: boolean;
  /** Insert page markers into the main content string. */
  readonly insertPageMarkers?: boolean;
  /**
   * Format of the page marker; use the `{page_num}` placeholder.
   * Default: "\n\n<!-- PAGE {page_num} -->\n\n"
   */
  readonly markerFormat?: string;
}
/**
 * Content of a single page, slide or sheet.
 *
 * With page extraction enabled, a document is split into per-page content,
 * and tables and images are mapped to the page they belong to.
 *
 * # Performance
 *
 * The upstream (Rust) representation wraps tables and images in `Arc`:
 * - `Vec<Arc<Table>>` shares table data without copying
 * - `Vec<Arc<ExtractedImage>>` shares image data without copying
 * - custom Serialize/Deserialize keeps the JSON shape unchanged
 *
 * This avoids redundant copies during serialization for documents with
 * shared tables/images.
 */
export interface PageContent {
  /** 1-indexed page number. */
  readonly pageNumber: number;
  /** Text content of the page. */
  readonly content: string;
  /**
   * Tables found on the page.
   *
   * Serialized upstream as `Vec<Table>` for JSON compatibility while kept
   * behind `Arc` in memory.
   */
  readonly tables: Table[];
  /**
   * Zero-based indices into the top-level `ExtractionResult.images`
   * collection for the images found on this page.
   *
   * Only populated when `extract_images = true` in the extraction config.
   */
  readonly imageIndices: number[];
  /**
   * Hierarchy information for the page (when hierarchy extraction is enabled).
   *
   * Holds the text hierarchy levels (H1-H6) extracted from the page content.
   */
  readonly hierarchy?: PageHierarchy;
  /**
   * Whether the page is blank, i.e. has no meaningful text content.
   *
   * Decided during extraction: a page is blank when it has fewer than
   * 3 non-whitespace characters and contains no tables or images.
   */
  readonly isBlank?: boolean;
  /**
   * Layout detection regions for the page.
   *
   * Each region carries class, confidence, bounding box and area fraction.
   * Only populated when layout detection is configured.
   */
  readonly layoutRegions?: LayoutRegion[];
  /**
   * Speaker notes of the slide (PPTX only).
   *
   * Text of the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
   * Only populated for PPTX sources that actually contain notes.
   */
  readonly speakerNotes?: string;
  /**
   * Name of the section the slide belongs to (PPTX only).
   *
   * PowerPoint sections group slides into logical chapters (`<p:sectionLst>`
   * in `ppt/presentation.xml`). Only populated for PPTX sources when the slide
   * is part of a named section.
   */
  readonly sectionName?: string;
  /**
   * Sheet name of the page (XLSX/ODS only).
   *
   * Every spreadsheet sheet maps to one `PageContent` entry, and this field
   * holds the sheet's display name as shown in the workbook. It is `None` for
   * non-spreadsheet formats and for sheets with an empty name.
   */
  readonly sheetName?: string;
}
/**
 * Heading hierarchy of a page.
 *
 * Produced when PDF text hierarchy extraction is enabled. Holds hierarchical
 * blocks tagged with heading levels (H1-H6) to expose the semantic structure
 * of the document.
 */
export interface PageHierarchy {
  /** How many hierarchy blocks the page has. */
  readonly blockCount: number;
  /** The hierarchical blocks, each with its heading level. */
  readonly blocks: HierarchicalBlock[];
}
/**
 * Metadata describing one page, slide or sheet.
 *
 * Records per-page details such as dimensions, content counts and, for
 * presentations, visibility.
 */
export interface PageInfo {
  /** 1-indexed page number. */
  readonly number: number;
  /** Title of the page (usually present for presentations). */
  readonly title?: string;
  /** Dimensions as (width, height), in points for PDFs or pixels for images. */
  readonly dimensions?: number[];
  /** Count of images on the page. */
  readonly imageCount?: number;
  /** Count of tables on the page. */
  readonly tableCount?: number;
  /** Whether the page is hidden (e.g., in presentations). */
  readonly hidden?: boolean;
  /**
   * Whether the page is blank (no meaningful text, no images, no tables).
   *
   * A page counts as blank when it has fewer than 3 non-whitespace characters
   * and contains no tables or images. Handy for filtering empty pages out of
   * scanned documents or PDFs that use blank separator pages.
   */
  readonly isBlank?: boolean;
  /**
   * Whether the page contains non-trivial vector graphics (paths, shapes, curves).
   *
   * Signals vector-drawn content such as charts, diagrams or geometric shapes
   * (e.g., from Adobe InDesign, LaTeX TikZ). Such content does not show up in
   * `ExtractionResult.images` because it is not embedded as raster XObjects.
   * The flag is `true` when the path count exceeds a heuristic threshold,
   * hinting that downstream consumers may want to rasterize the page to
   * capture the content.
   *
   * Only populated for PDFs; `None` for other document types.
   *
   * NOTE(review): the property is declared as required here although the
   * description above mentions `None` — confirm against the native binding.
   */
  readonly hasVectorGraphics: boolean;
}
/**
 * Unified description of a document's page structure.
 *
 * Works for different page kinds (PDF pages, PPTX slides, Excel sheets) and
 * carries character offset boundaries used to map chunks to pages.
 */
export interface PageStructure {
  /** Total count of pages/slides/sheets. */
  readonly totalCount: number;
  /** Kind of paginated unit. */
  readonly unitType: PageUnitType;
  /**
   * Character offset boundaries of each page.
   *
   * Maps character ranges of the extracted content to page numbers; used to
   * compute chunk page ranges.
   */
  readonly boundaries?: PageBoundary[];
  /** Detailed per-page metadata (optional, only when needed). */
  readonly pages?: PageInfo[];
}
/**
 * Kind of paginated unit a document is made of.
 *
 * Tells apart the different notions of a "page": PDF pages, presentation
 * slides and spreadsheet sheets.
 */
export declare enum PageUnitType {
  /** Standard document pages (PDF, DOCX, images). */
  Page = 'page',
  /** Presentation slides (PPTX, ODP). */
  Slide = 'slide',
  /** Spreadsheet sheets (XLSX, ODS). */
  Sheet = 'sheet',
}
/**
 * An annotation extracted from a page of a PDF document.
 */
export interface PdfAnnotation {
  /** Kind of annotation. */
  readonly annotationType: PdfAnnotationType;
  /** Text carried by the annotation (e.g., comment text, link URL). */
  readonly content?: string;
  /** 1-indexed number of the page on which the annotation appears. */
  readonly pageNumber: number;
  /** Bounding box of the annotation on its page. */
  readonly boundingBox?: BoundingBox;
}
/**
 * Kind of PDF annotation.
 */
export declare enum PdfAnnotationType {
  /** Sticky note / text annotation. */
  Text = 'text',
  /** Highlighted text region. */
  Highlight = 'highlight',
  /** Hyperlink annotation. */
  Link = 'link',
  /** Rubber stamp annotation. */
  Stamp = 'stamp',
  /** Underline text markup. */
  Underline = 'underline',
  /** Strikeout text markup. */
  StrikeOut = 'strike_out',
  /** Any annotation type not listed above. */
  Other = 'other',
}
/**
 * Configuration options that apply to PDF extraction only.
 */
export interface PdfConfig {
  /** Extract images from the PDF. */
  readonly extractImages?: boolean;
  /**
   * Extract tables from the PDF.
   *
   * When `true` (default), pdf_oxide's native grid detector runs first; if it
   * finds nothing, extraction falls back to the heuristic text-layer
   * reconstruction in `pdf::oxide::table::extract_tables_heuristic`. Set to
   * `false` to skip both passes — `tables` will then be empty in the result.
   */
  readonly extractTables?: boolean;
  /** Passwords to try when opening encrypted PDFs. */
  readonly passwords?: string[];
  /** Extract PDF metadata. */
  readonly extractMetadata?: boolean;
  /** Hierarchy extraction settings (None = hierarchy extraction disabled). */
  readonly hierarchy?: HierarchyConfig;
  /**
   * Extract PDF annotations (text notes, highlights, links, stamps).
   * Default: false
   */
  readonly extractAnnotations?: boolean;
  /**
   * Fraction (0.0–1.0) of the page height at the top that is excluded in
   * order to drop headers/running heads.
   * Default: 0.06 (6%)
   */
  readonly topMarginFraction?: number;
  /**
   * Fraction (0.0–1.0) of the page height at the bottom that is excluded in
   * order to drop footers/page numbers.
   * Default: 0.05 (5%)
   */
  readonly bottomMarginFraction?: number;
  /**
   * Permit single-column pseudo tables in extraction results.
   *
   * By default, tables with fewer than 2 columns (layout-guided) or 3 columns
   * (heuristic) are rejected. When `true`, the minimum column count drops to
   * 1, so single-column structured data (glossaries, itemized lists) can be
   * emitted as tables. The other quality filters (density, sparsity, prose
   * detection) still apply.
   */
  readonly allowSingleColumnTables?: boolean;
  /**
   * Run OCR on inline images extracted from PDF pages and attach the
   * recognized text to each `ExtractedImage.ocr_result`. Tesseract must be
   * available; if `ExtractionConfig.ocr` is `None` the extractor falls back to
   * `TesseractConfig::default()`. A failure on one image degrades gracefully:
   * that image is returned without OCR text instead of failing the whole
   * extraction. Default: `false`.
   */
  readonly ocrInlineImages?: boolean;
}
/**
 * Metadata that only PDF documents have.
 *
 * Holds the fields that are not part of the common `Metadata` structure.
 * Shared fields such as title, authors, keywords and dates live on `Metadata`.
 */
export interface PdfMetadata {
  /** PDF version, e.g. "1.7" or "2.0". */
  readonly pdfVersion?: string;
  /** Producer, i.e. the application that created the PDF. */
  readonly producer?: string;
  /** Whether the PDF is encrypted/password-protected. */
  readonly isEncrypted?: boolean;
  /** Width of the first page in points (1/72 inch). */
  readonly width?: number;
  /** Height of the first page in points (1/72 inch). */
  readonly height?: number;
  /** Total number of pages in the PDF document. */
  readonly pageCount?: number;
}
/**
 * Contract shared by every plugin.
 *
 * Provides the common surface for plugin lifecycle management,
 * identification and metadata.
 *
 * # Thread Safety
 *
 * Upstream, all plugins must be `Send + Sync` so they can be used
 * concurrently across threads.
 */
export interface Plugin {
  /**
   * Unique name/identifier of the plugin.
   *
   * The name should be:
   * - unique across all plugins
   * - lowercase with hyphens (e.g., "my-custom-plugin")
   * - made of URL-safe characters only
   */
  name(): string;
  /**
   * Semantic version of the plugin.
   *
   * Should follow the semver format `MAJOR.MINOR.PATCH`.
   */
  version?(): string;
  /**
   * Initialize the plugin.
   *
   * Invoked once, when the plugin is registered. Typical uses:
   * - loading configuration
   * - initializing resources (connections, caches, etc.)
   * - validating dependencies
   *
   * # Thread Safety
   *
   * The upstream method takes `&self` rather than `&mut self` so that it
   * works with `Arc<dyn Plugin>`. Plugins that need mutable state while
   * initializing should rely on interior mutability (Mutex, RwLock,
   * OnceCell, etc.).
   * @throws Should signal an error if initialization fails; the plugin is not
   * registered in that case.
   */
  initialize?(): void;
  /**
   * Shut the plugin down.
   *
   * Invoked when the plugin is unregistered or the application is shutting
   * down. Typical uses:
   * - closing connections
   * - flushing caches
   * - releasing resources
   *
   * # Thread Safety
   *
   * The upstream method takes `&self` rather than `&mut self` so that it
   * works with `Arc<dyn Plugin>`. Plugins that need mutable state while
   * shutting down should rely on interior mutability (Mutex, RwLock, etc.).
   * @throws Errors raised during shutdown are logged but do not stop the
   * shutdown process.
   */
  shutdown?(): void;
  /**
   * Optional description of the plugin, for debugging and logging.
   *
   * Defaults to an empty string when not overridden.
   */
  description?(): string;
  /**
   * Optional author information for the plugin.
   *
   * Defaults to an empty string when not overridden.
   */
  author?(): string;
}
/**
 * Contract for post-processor plugins.
 *
 * A post-processor transforms or enriches an extraction result once the
 * initial extraction has finished. It may:
 * - clean and normalize text
 * - add metadata (language, keywords, entities)
 * - split content into chunks
 * - score quality
 * - apply custom transformations
 *
 * # Processing Order
 *
 * Post-processors run in stage order:
 * 1. **Early** - Language detection, entity extraction
 * 2. **Middle** - Keyword extraction, token reduction
 * 3. **Late** - Custom hooks, final validation
 *
 * Inside a stage, processors run in registration order.
 *
 * # Error Handling
 *
 * Post-processor errors are non-fatal by default: they are captured in
 * metadata and execution continues. Return an error from `process()` to make
 * a failure fatal.
 *
 * # Thread Safety
 *
 * Post-processors must be thread-safe (`Send + Sync`).
 */
export interface PostProcessor {
  /** Name of the post-processor. */
  name(): string;
  /**
   * Process an extraction result.
   *
   * Transforms or enriches the result. The processor may modify:
   * - `content` - the extracted text
   * - `metadata` - add or update metadata fields
   * - `tables` - modify or enhance table data
   * @param result - Mutable reference to the extraction result to process
   *
   * @param config - Extraction configuration
   *
   * @returns `Ok(())` if processing succeeded, `Err(...)` for fatal failures.
   *
   * @throws Return errors for fatal processing failures. Non-fatal errors
   * should be recorded in metadata directly on the result.
   *
   * # Performance
   *
   * The upstream signature takes a mutable reference instead of ownership,
   * which avoids cloning large extraction results; processors modify the
   * result in place.
   *
   * # Example - Language Detection
   *
   * ```rust
   * async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
   * -> Result<()> {
   * // Detect language (simplified - use real detection library in practice)
   * let language = "en"; // Placeholder detection
   *
   * // Add to metadata
   * result.metadata.additional.insert("detected_language".to_string().into(), serde_json::json!(language));
   *
   * Ok(())
   * }
   * ```
   *
   * # Example - Text Cleaning
   *
   * ```rust
   * async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
   * -> Result<()> {
   * // Remove excessive whitespace
   * result.content = result
   * .content
   * .split_whitespace()
   * .collect::<Vec<_>>()
   * .join(" ");
   *
   * Ok(())
   * }
   * ```
   */
  process(result?: ExtractionResult | null | undefined, config?: ExtractionConfig | null | undefined): Promise<void>;
  /**
   * Processing stage of this post-processor.
   *
   * Decides when the processor runs in the pipeline.
   * @returns The `ProcessingStage` (Early, Middle, or Late).
   */
  processingStage(): string;
  /**
   * Optional: decide whether the processor should run for a given result.
   *
   * Enables conditional processing based on MIME type, metadata or content.
   * Defaults to `true` (always run).
   * @param result - The extraction result to check
   *
   * @param config - Extraction configuration
   *
   * @returns `true` if the processor should run, `false` to skip.
   *
   * NOTE(review): the declared return type is `string` although the
   * description speaks of `true`/`false` — confirm against the native binding.
   */
  shouldProcess?(result?: ExtractionResult | null | undefined, config?: ExtractionConfig | null | undefined): string;
  /**
   * Optional: estimate the processing time in milliseconds.
   *
   * Used for logging and debugging. Defaults to 0 (unknown).
   * @param result - The extraction result to estimate for
   *
   * @returns Estimated processing time in milliseconds.
   *
   * NOTE(review): the declared return type is `string` although the
   * description speaks of a millisecond count — confirm against the native binding.
   */
  estimatedDurationMs?(result?: ExtractionResult | null | undefined): string;
  /**
   * Execution priority inside the processing stage.
   *
   * Within one `ProcessingStage`, higher values run first. Defaults to 50.
   * Use 0-49 for fallback processors, 50 for normal processors, and 51-255
   * for high-priority processors that should run early in their stage.
   *
   * NOTE(review): the declared return type is `string` although the
   * description speaks of values 0-255 — confirm against the native binding.
   */
  priority?(): string;
}
/**
 * Configuration of the post-processing step.
 */
export interface PostProcessorConfig {
  /** Turn post-processors on. */
  readonly enabled?: boolean;
  /** Whitelist of processor names to run (None = all enabled). */
  readonly enabledProcessors?: string[];
  /** Blacklist of processor names to skip (None = none disabled). */
  readonly disabledProcessors?: string[];
  /** Pre-computed AHashSet for O(1) lookup of enabled processors. */
  readonly enabledSet?: string[];
  /** Pre-computed AHashSet for O(1) lookup of disabled processors. */
  readonly disabledSet?: string[];
}
/**
 * Application properties read from docProps/app.xml of a PPTX file.
 *
 * Holds PowerPoint-specific document metadata.
 */
export interface PptxAppProperties {
  /** Name of the application (e.g., "Microsoft Office PowerPoint"). */
  readonly application?: string;
  /** Version of the application. */
  readonly appVersion?: string;
  /** Total editing time, in minutes. */
  readonly totalTime?: number;
  /** Name of the company. */
  readonly company?: string;
  /** Security level of the document. */
  readonly docSecurity?: number;
  /** Scale crop flag. */
  readonly scaleCrop?: boolean;
  /** Links-up-to-date flag. */
  readonly linksUpToDate?: boolean;
  /** Shared document flag. */
  readonly sharedDoc?: boolean;
  /** Hyperlinks-changed flag. */
  readonly hyperlinksChanged?: boolean;
  /** Count of slides. */
  readonly slides?: number;
  /** Count of notes. */
  readonly notes?: number;
  /** Count of hidden slides. */
  readonly hiddenSlides?: number;
  /** Count of multimedia clips. */
  readonly multimediaClips?: number;
  /** Format of the presentation (e.g., "Widescreen", "Standard"). */
  readonly presentationFormat?: string;
  /** Titles of the slides. */
  readonly slideTitles?: string[];
}
/**
 * Result of extracting a PowerPoint (PPTX) file.
 *
 * Bundles the extracted slide content, metadata and embedded images/tables.
 */
export interface PptxExtractionResult {
  /** Text extracted from all slides. */
  readonly content: string;
  /** Metadata of the presentation. */
  readonly metadata: PptxMetadata;
  /** Total count of slides. */
  readonly slideCount: number;
  /** Total count of embedded images. */
  readonly imageCount: number;
  /** Total count of tables. */
  readonly tableCount: number;
  /** Images extracted from the presentation. */
  readonly images: ExtractedImage[];
  /** Slide structure with boundaries (when page tracking is enabled). */
  readonly pageStructure?: PageStructure;
  /** Per-slide content (when page tracking is enabled). */
  readonly pageContents?: PageContent[];
  /** Structured representation of the document. */
  readonly document?: DocumentStructure;
  /**
   * Hyperlinks discovered in slides as (url, optional_label) pairs.
   *
   * NOTE(review): declared as a flat array of strings; how the pairs are
   * encoded is not visible from this declaration — confirm against the
   * native binding.
   */
  readonly hyperlinks: string[];
  /**
   * Office metadata read from docProps/core.xml and docProps/app.xml.
   *
   * Keys include "title", "author", "created_by", "subject", "keywords",
   * "modified_by", "created_at", "modified_at", etc.
   */
  readonly officeMetadata: Record<string, string>;
  /**
   * Slide comments, expressed as revisions.
   *
   * Every `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
   * `DocumentRevision { kind: Comment }` with the author (resolved from
   * `ppt/commentAuthors.xml`), an ISO-8601 timestamp and
   * `RevisionAnchor::Slide { index }`. `None` when no comment XML parts exist.
   */
  readonly revisions?: DocumentRevision[];
}
/**
 * Metadata of a PowerPoint presentation.
 *
 * Extracted from PPTX files; holds slide counts and presentation details.
 */
export interface PptxMetadata {
  /** Total count of slides in the presentation. */
  readonly slideCount?: number;
  /** Slide names, if available. */
  readonly slideNames?: string[];
  /** Count of embedded images. */
  readonly imageCount?: number;
  /** Count of tables. */
  readonly tableCount?: number;
}
/**
 * Stages in which post-processors run.
 *
 * Post-processors execute in stage order (Early → Middle → Late); pick a
 * stage to control where an operation sits in the post-processing sequence.
 */
export declare enum ProcessingStage {
  /**
   * Early stage - foundational processing.
   *
   * Suitable for:
   * - Language detection
   * - Character encoding normalization
   * - Entity extraction (NER)
   * - Text quality scoring
   */
  Early = 'Early',
  /**
   * Middle stage - content transformation.
   *
   * Suitable for:
   * - Keyword extraction
   * - Token reduction
   * - Text summarization
   * - Semantic analysis
   */
  Middle = 'Middle',
  /**
   * Late stage - final enrichment.
   *
   * Suitable for:
   * - Custom user hooks
   * - Analytics/logging
   * - Final validation
   * - Output formatting
   */
  Late = 'Late',
}
/**
 * Non-fatal warning raised by a stage of the processing pipeline.
 *
 * Records errors from optional features that do not prevent extraction but
 * may mean the results are degraded.
 */
export interface ProcessingWarning {
  /**
   * Pipeline stage or feature the warning came from
   * (e.g., "embedding", "chunking", "language_detection", "output_format").
   */
  readonly source: string;
  /** Human-readable explanation of what went wrong. */
  readonly message: string;
}
/**
 * Page Segmentation Mode for Tesseract OCR.
 *
 * NOTE(review): the member names appear to mirror Tesseract's page
 * segmentation modes; consult the Tesseract documentation for the exact
 * semantics of each mode.
 */
export declare enum PSMMode {
  OsdOnly = 'OsdOnly',
  AutoOsd = 'AutoOsd',
  AutoOnly = 'AutoOnly',
  Auto = 'Auto',
  SingleColumn = 'SingleColumn',
  SingleBlockVertical = 'SingleBlockVertical',
  SingleBlock = 'SingleBlock',
  SingleLine = 'SingleLine',
  SingleWord = 'SingleWord',
  CircleWord = 'CircleWord',
  SingleChar = 'SingleChar',
}
/**
 * Metadata of an Outlook PST archive.
 */
export interface PstMetadata {
  /** Message count, when provided. */
  readonly messageCount?: number;
}
/**
 * Parameters specific to RAKE.
 */
export interface RakeParams {
  /** Shortest word length that is considered (default: 1). */
  readonly minWordLength?: number;
  /** Largest number of words allowed in a keyword phrase (default: 3). */
  readonly maxWordsPerPhrase?: number;
}
/**
 * Pre-computed Markdown for a table detection region.
 *
 * Emitted by the TATR-based table structure recognizer and surfaced with
 * layout-aware OCR results. Upstream, the struct is defined under
 * `layout-types` (pure Rust) so that consumers who do not enable
 * `layout-detection` (ORT) can still refer to the type in their own code.
 */
export interface RecognizedTable {
  /** Detection bbox the table corresponds to (used for matching). */
  readonly detectionBbox: BBox;
  /** Cell text as a two-dimensional array (rows × columns). */
  readonly cells: string[][];
  /** The table rendered as Markdown. */
  readonly markdown: string;
}
/**
 * Reduction level selector with five settings, declared from `Off` through
 * `Maximum`.
 *
 * NOTE(review): presumably the intensity of token reduction — confirm
 * against the upstream definition.
 */
export declare enum ReductionLevel {
  Off = 'Off',
  Light = 'Light',
  Moderate = 'Moderate',
  Aggressive = 'Aggressive',
  Maximum = 'Maximum',
}
/**
 * Semantic kind of a relationship between two document elements.
 */
export declare enum RelationshipKind {
  /** Footnote marker -> footnote definition. */
  FootnoteReference = 'footnote_reference',
  /** Citation marker -> bibliography entry. */
  CitationReference = 'citation_reference',
  /** Internal anchor link (`#id`) -> target heading/element. */
  InternalLink = 'internal_link',
  /** Caption paragraph -> the figure/table it describes. */
  Caption = 'caption',
  /** Label -> labeled element (HTML `<label for>`, LaTeX `\label{}`). */
  Label = 'label',
  /** TOC entry -> target section. */
  TocEntry = 'toc_entry',
  /** Cross-reference (LaTeX `\ref{}`, DOCX cross-reference field). */
  CrossReference = 'cross_reference',
}
/**
 * Contract for document renderers, which turn an [`InternalDocument`] into
 * an output string.
 *
 * A renderer is typically a stateless converter from the internal document
 * representation to one output format (Markdown, HTML, Djot, plain text,
 * etc.). Renderers take part in the standard [`Plugin`] lifecycle, so custom
 * renderers can be registered from any supported binding language.
 *
 * The format name is exposed through [`Plugin::name`]. For stateless
 * renderers the [`Plugin`] lifecycle methods (`version`, `initialize`,
 * `shutdown`) all have no-op defaults and need not be overridden.
 *
 * # Thread Safety
 *
 * Renderers must be `Send + Sync` (inherited from [`Plugin`]).
 */
export interface Renderer {
  /** Name of the renderer; per the description above, this is the format name. */
  name(): string;
  /**
   * Render an [`InternalDocument`] to the output format.
   * @param doc - The internal document to render
   *
   * @returns The rendered output as a string.
   *
   * @throws Returns an error if rendering fails.
   */
  render(doc: InternalDocument): string;
}
/**
 * Selects the shape of extraction results.
 *
 * Not to be confused with `OutputFormat`, which controls rendering — Plain,
 * Markdown, HTML, etc. `ResultFormat` controls the *shape* of the result:
 * one unified content blob versus an element-based decomposition.
 */
export declare enum ResultFormat {
  /** Unified format: all content lives in the `content` field. */
  Unified = 'unified',
  /** Element-based format with semantic element extraction. */
  ElementBased = 'element_based',
}
/**
 * Best-effort location of a revision inside a document.
 *
 * A union discriminated by `type`; each variant carries the coordinates
 * that make sense for its kind of location.
 */
export type RevisionAnchor =
  | { type: "paragraph"; index: number }
  | { type: "table_cell"; row: number; col: number; tableIndex: number }
  | { type: "page"; index: number }
  | { type: "slide"; index: number }
  | { type: "sheet"; index: number; name: string };
/**
 * Content changes that make up one revision.
 *
 * For insertions and deletions, `content` holds the added/removed lines as
 * `DiffLine::Added` / `DiffLine::Removed` entries. For format changes,
 * `content` is empty — the property diff is left as a TODO for a later
 * enrichment pass.
 */
export interface RevisionDelta {
  /** Line-level content changes of the revision. */
  readonly content?: DiffLine[];
  /** Cell-level table changes of the revision. */
  readonly tableChanges?: CellChange[];
}
/**
 * Semantic classification of a tracked change.
 */
export declare enum RevisionKind {
  /** Text or content was inserted. */
  Insertion = 'insertion',
  /** Text or content was deleted. */
  Deletion = 'deletion',
  /** Run-level formatting (font, size, colour, …) was changed. */
  FormatChange = 'format_change',
  /** A reviewer comment or annotation. */
  Comment = 'comment',
}
/**
 * Security limits applied across extractors.
 *
 * The limits are deliberately conservative: they are meant to prevent DoS
 * attacks while still supporting legitimate documents. Values in parentheses
 * are the ones quoted by the upstream documentation.
 */
export interface SecurityLimits {
  /** Maximum uncompressed size of an archive (500 MB). */
  readonly maxArchiveSize?: number;
  /** Maximum compression ratio before an archive is flagged as a potential bomb (100:1). */
  readonly maxCompressionRatio?: number;
  /** Maximum number of files in an archive (10,000). */
  readonly maxFilesInArchive?: number;
  /** Maximum nesting depth of structures (100). */
  readonly maxNestingDepth?: number;
  /**
   * Maximum length of any single XML entity / attribute / token (1 MiB).
   * This is a per-token cap, NOT a total cap — billion-laughs class attacks,
   * where a single entity expands to hundreds of MB, are caught here, while
   * ordinary long text content (a paragraph, a CDATA block) is caught by
   * `max_content_size` instead.
   */
  readonly maxEntityLength?: number;
  /** Maximum string growth per document (100 MB). */
  readonly maxContentSize?: number;
  /** Maximum iterations per operation. */
  readonly maxIterations?: number;
  /** Maximum XML depth (100 levels). */
  readonly maxXmlDepth?: number;
  /** Maximum cells per table (100,000). */
  readonly maxTableCells?: number;
}
/**
 * Configuration of the Kreuzberg API server.
 *
 * Gathers every server option: host/port settings, CORS configuration and
 * upload limits.
 *
 * # Defaults
 *
 * - `host`: "127.0.0.1" (localhost only)
 * - `port`: 8000
 * - `cors_origins`: empty vector (allows all origins)
 * - `max_request_body_bytes`: 104_857_600 (100 MB)
 * - `max_multipart_field_bytes`: 104_857_600 (100 MB)
 */
export interface ServerConfig {
  /** Host address of the server (e.g., "127.0.0.1", "0.0.0.0"). */
  readonly host?: string;
  /** Port number of the server. */
  readonly port?: number;
  /**
   * Origins allowed by CORS. An empty vector means all origins are allowed.
   *
   * With an empty vector the server accepts requests from any origin. When
   * specific origins are listed (e.g., `"https://example.com"`), only those
   * origins are allowed.
   */
  readonly corsOrigins?: string[];
  /** Maximum request body size in bytes (default: 100 MB). */
  readonly maxRequestBodyBytes?: number;
  /** Maximum multipart field size in bytes (default: 100 MB). */
  readonly maxMultipartFieldBytes?: number;
}
/**
 * A single structured data block (Schema.org, microdata, RDFa).
 *
 * `dataType` and `rawJson` are always present; `schemaType` is optional.
 */
export interface StructuredData {
/** Type of structured data (see `StructuredDataType`). */
readonly dataType: StructuredDataType
/** Raw JSON string representation of the block. */
readonly rawJson: string
/** Schema type if detectable (e.g., "Article", "Event", "Product"). */
readonly schemaType?: string
}
/**
 * Result record for structured data; all four fields are required.
 *
 * NOTE(review): this interface carries no documentation in the generated
 * source. The member descriptions below are inferred from names and types
 * only; confirm them against the upstream definition.
 */
export interface StructuredDataResult {
/** Presumably the extracted content as text (confirm). */
readonly content: string
/** Presumably an identifier of the source data format (confirm). */
readonly format: string
/** String-to-string metadata map. */
readonly metadata: Record<string, string>
/** List of strings; presumably the text-bearing fields that were found (confirm). */
readonly textFields: Array<string>
}
/**
 * Structured data type classification.
 *
 * Used as the type of `StructuredData.dataType`.
 */
export declare enum StructuredDataType {
/** JSON-LD structured data. */
JsonLd = "json-ld",
/** Microdata. */
Microdata = "microdata",
/** RDFa. */
RDFa = "rdfa",
}
/**
 * Configuration for LLM-based structured data extraction.
 *
 * Sends extracted document content to a VLM with a JSON schema,
 * returning structured data that conforms to the schema.
 *
 * @example
 * The same settings expressed as TOML configuration:
 * ```toml
 * [structured_extraction]
 * schema_name = "invoice_data"
 * strict = true
 *
 * [structured_extraction.schema]
 * type = "object"
 * properties.vendor = { type = "string" }
 * properties.total = { type = "number" }
 * required = ["vendor", "total"]
 *
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * ```
 */
export interface StructuredExtractionConfig {
/** JSON Schema defining the desired output structure. */
readonly schema: JsonValue
/** Schema name passed to the LLM's structured output mode. */
readonly schemaName: string
/** Optional schema description for the LLM. */
readonly schemaDescription?: string
/** Enable strict mode: the output must exactly match the schema. */
readonly strict: boolean
/**
 * Custom Jinja2 extraction prompt template. When omitted (`None` upstream),
 * a default template is used.
 *
 * Available template variables:
 * - `{{ content }}`: the extracted document text.
 * - `{{ schema }}`: the JSON schema as a formatted string.
 * - `{{ schema_name }}`: the schema name.
 * - `{{ schema_description }}`: the schema description (may be empty).
 */
readonly prompt?: string
/** LLM configuration for the extraction. */
readonly llm: LlmConfig
}
/**
 * A supported document format entry.
 *
 * Pairs a file extension with the corresponding MIME type that Kreuzberg
 * can process. Both fields are required.
 */
export interface SupportedFormat {
/** File extension without the leading dot, e.g., "pdf", "docx". */
readonly extension: string
/** MIME type string, e.g., "application/pdf". */
readonly mimeType: string
}
/**
 * Extracted table structure.
 *
 * Represents a table detected and extracted from a document (PDF, image, etc.).
 * Tables are converted to both structured cell data and Markdown format.
 * Every field is optional in this declaration.
 */
export interface Table {
/** Table cells as a 2D array (rows x columns); each cell is a string. */
readonly cells?: Array<Array<string>>
/** Markdown representation of the table. */
readonly markdown?: string
/** Page number where the table was found (1-indexed). */
readonly pageNumber?: number
/**
 * Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
 * Only populated for PDF-extracted tables when position data is available.
 */
readonly boundingBox?: BoundingBox
}
/**
 * Individual table cell with content and optional styling.
 *
 * Described upstream as a future extension point for rich table support
 * with cell-level metadata. Every field is optional.
 */
export interface TableCell {
/** Cell content as text. */
readonly content?: string
/** Row span (number of rows this cell spans). */
readonly rowSpan?: number
/** Column span (number of columns this cell spans). */
readonly colSpan?: number
/** Whether this is a header cell. */
readonly isHeader?: boolean
}
/**
 * Cell-level changes for a pair of tables that share the same index.
 *
 * `a` and `b` refer to the two sides being compared, as named in the
 * upstream documentation.
 */
export interface TableDiff {
/**
 * Zero-based index of the table in both `a.tables` and `b.tables`.
 *
 * NOTE(review): the description of `toIndex` implies the two indices can
 * differ, so this is presumably the index in `a.tables` only (confirm).
 */
readonly fromIndex: number
/** Zero-based index in `b.tables` (equal to `fromIndex` for same-dimension tables). */
readonly toIndex: number
/** Cell-level changes within the table. */
readonly cellChanges: Array<CellChange>
}
/**
 * Structured table grid with cell-level metadata.
 *
 * Stores the row/column dimensions and a flat list of cells with position
 * info. Every field is optional.
 */
export interface TableGrid {
/** Number of rows in the table. */
readonly rows?: number
/** Number of columns in the table. */
readonly cols?: number
/** All cells, flattened in row-major order. */
readonly cells?: Array<GridCell>
}
/**
 * Which table structure recognition model to use.
 *
 * Controls the model used for table cell detection within layout-detected
 * table regions. The wire format is snake_case in all serializers (JSON,
 * TOML, YAML), which is what the member values below spell out.
 */
export declare enum TableModel {
/** TATR (Table Transformer): the default, 30MB, DETR-based row/column detection. */
Tatr = "tatr",
/** SLANeXT wired variant: 365MB, optimized for bordered tables. */
SlanetWired = "slanet_wired",
/** SLANeXT wireless variant: 365MB, optimized for borderless tables. */
SlanetWireless = "slanet_wireless",
/** SLANet-plus: 7.78MB, lightweight general-purpose. */
SlanetPlus = "slanet_plus",
/**
 * Classifier-routed SLANeXT: auto-selects wired/wireless per table.
 * Uses the PP-LCNet classifier (6.78MB) plus both SLANeXT variants (730MB total).
 */
SlanetAuto = "slanet_auto",
/** Disable table structure model inference entirely; use the heuristic path only. */
Disabled = "disabled",
}
/**
 * Tesseract OCR configuration.
 *
 * Provides fine-grained control over Tesseract OCR engine parameters.
 * Most users can use the defaults, but these settings allow optimization
 * for specific document types (invoices, handwriting, etc.).
 * Every field is optional.
 */
export interface TesseractConfig {
/** Language code (e.g., "eng", "deu", "fra"). */
readonly language?: string
/**
 * Page Segmentation Mode (0-13).
 *
 * Common values:
 * - 3: Fully automatic page segmentation (native default)
 * - 6: Assume a single uniform block of text (WASM default; avoids a layout-analysis hang)
 * - 11: Sparse text with no particular order
 */
readonly psm?: number
/** Output format ("text" or "markdown"). */
readonly outputFormat?: string
/**
 * OCR Engine Mode (0-3).
 *
 * - 0: Legacy engine only
 * - 1: Neural nets (LSTM) only (usually best)
 * - 2: Legacy + LSTM
 * - 3: Default (based on what's available)
 */
readonly oem?: number
/**
 * Minimum confidence threshold (0.0-100.0).
 *
 * Words with confidence below this threshold may be rejected or flagged.
 */
readonly minConfidence?: number
/**
 * Image preprocessing configuration.
 *
 * Controls how images are preprocessed before OCR. Can significantly
 * improve quality for scanned documents or low-quality images.
 */
readonly preprocessing?: ImagePreprocessingConfig
/** Enable automatic table detection and reconstruction. */
readonly enableTableDetection?: boolean
/** Minimum confidence threshold for table detection (0.0-1.0). */
readonly tableMinConfidence?: number
/** Column threshold for table detection (pixels). */
readonly tableColumnThreshold?: number
/** Row threshold ratio for table detection (0.0-1.0). */
readonly tableRowThresholdRatio?: number
/** Enable OCR result caching. */
readonly useCache?: boolean
/** Use pre-adapted templates for character classification. */
readonly classifyUsePreAdaptedTemplates?: boolean
/** Enable the N-gram language model. */
readonly languageModelNgramOn?: boolean
/** Don't reject good words during block-level processing. */
readonly tesseditDontBlkrejGoodWds?: boolean
/** Don't reject good words during row-level processing. */
readonly tesseditDontRowrejGoodWds?: boolean
/** Enable dictionary correction. */
readonly tesseditEnableDictCorrection?: boolean
/** Whitelist of allowed characters (empty = all allowed). */
readonly tesseditCharWhitelist?: string
/** Blacklist of forbidden characters (empty = none forbidden). */
readonly tesseditCharBlacklist?: string
/** Use the primary language params model. */
readonly tesseditUsePrimaryParamsModel?: boolean
/** Variable-width space detection. */
readonly textordSpaceSizeIsVariable?: boolean
/** Use the adaptive thresholding method (a boolean switch in this declaration). */
readonly thresholdingMethod?: boolean
}
/**
 * Inline text annotation: byte-range based formatting and links.
 *
 * Annotations reference byte offsets into the node's text content,
 * enabling precise identification of formatted regions. The range is
 * half-open: `start` is inclusive and `end` is exclusive.
 */
export interface TextAnnotation {
/** Start byte offset in the node's text content (inclusive). */
readonly start: number
/** End byte offset in the node's text content (exclusive). */
readonly end: number
/** Annotation type. */
readonly kind: AnnotationKind
}
/**
 * Text direction enumeration for HTML documents.
 *
 * The member values are the strings "ltr", "rtl" and "auto".
 */
export declare enum TextDirection {
/** Left-to-right text direction. */
LeftToRight = "ltr",
/** Right-to-left text direction. */
RightToLeft = "rtl",
/** Automatic text direction detection. */
Auto = "auto",
}
/**
 * Plain text and Markdown extraction result.
 *
 * Contains the extracted text along with statistics and,
 * for Markdown files, structural elements like headers and links.
 * The content and the three counts are required; the Markdown-only
 * collections are optional.
 */
export interface TextExtractionResult {
/** Extracted text content. */
readonly content: string
/** Number of lines. */
readonly lineCount: number
/** Number of words. */
readonly wordCount: number
/** Number of characters. */
readonly characterCount: number
/** Markdown headers (text only, Markdown files only). */
readonly headers?: Array<string>
/**
 * Markdown links as (text, URL) tuples (Markdown files only).
 * Each tuple is declared as an array of strings.
 */
readonly links?: Array<Array<string>>
/**
 * Code blocks as (language, code) tuples (Markdown files only).
 * Each tuple is declared as an array of strings.
 */
readonly codeBlocks?: Array<Array<string>>
}
/**
 * Text/Markdown metadata.
 *
 * Extracted from plain text and Markdown files. Includes word counts and,
 * for Markdown, structural elements like headers and links.
 * Every field is optional.
 */
export interface TextMetadata {
/** Number of lines in the document. */
readonly lineCount?: number
/** Number of words. */
readonly wordCount?: number
/** Number of characters. */
readonly characterCount?: number
/** Markdown headers (heading text only, for Markdown files). */
readonly headers?: Array<string>
/**
 * Markdown links as (text, url) tuples (for Markdown files).
 * Each tuple is declared as an array of strings.
 */
readonly links?: Array<Array<string>>
/**
 * Code blocks as (language, code) tuples (for Markdown files).
 * Each tuple is declared as an array of strings.
 */
readonly codeBlocks?: Array<Array<string>>
}
/**
 * Settings for token reduction; every field is optional.
 *
 * NOTE(review): this interface carries no documentation in the generated
 * source. The member descriptions below are inferred from names and types
 * only; confirm them (including defaults and valid ranges) against the
 * upstream definition.
 */
export interface TokenReductionConfig {
/** Reduction level (see `ReductionLevel`). */
readonly level?: ReductionLevel
/** Presumably a hint naming the language of the text (confirm). */
readonly languageHint?: string
/** Presumably keeps Markdown markup intact when set (confirm). */
readonly preserveMarkdown?: boolean
/** Presumably keeps code intact when set (confirm). */
readonly preserveCode?: boolean
/** Numeric threshold; meaning and range are not documented here (confirm). */
readonly semanticThreshold?: number
/** Presumably toggles parallel processing (confirm). */
readonly enableParallel?: boolean
/** Presumably toggles SIMD-accelerated processing (confirm). */
readonly useSimd?: boolean
/** Map from a string key to a list of stopwords; presumably keyed by language (confirm). */
readonly customStopwords?: Record<string, Array<string>>
/** List of pattern strings; presumably matches are preserved. Pattern syntax is not documented here (confirm). */
readonly preservePatterns?: Array<string>
/** Numeric reduction target; unit and range are not documented here (confirm). */
readonly targetReduction?: number
/** Presumably toggles semantic clustering (confirm). */
readonly enableSemanticClustering?: boolean
}
/**
 * Token reduction configuration.
 *
 * Both fields are optional.
 */
export interface TokenReductionOptions {
/** Reduction mode: "off", "light", "moderate", "aggressive", or "maximum". */
readonly mode?: string
/** Preserve important words (capitalized words, technical terms). */
readonly preserveImportantWords?: boolean
}
/**
 * Configuration for tree-sitter language pack integration.
 *
 * Controls grammar download behavior and code analysis options.
 * Every field is optional.
 *
 * @example
 * The same settings expressed as TOML configuration:
 * ```toml
 * [tree_sitter]
 * languages = ["python", "rust"]
 * groups = ["web"]
 *
 * [tree_sitter.process]
 * structure = true
 * comments = true
 * docstrings = true
 * ```
 */
export interface TreeSitterConfig {
/**
 * Enable code intelligence processing (default: true).
 *
 * When `false`, tree-sitter analysis is completely skipped even if
 * the config section is present.
 */
readonly enabled?: boolean
/**
 * Custom cache directory for downloaded grammars.
 *
 * When omitted (`None` upstream), the default is used:
 * `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
 */
readonly cacheDir?: string
/** Languages to pre-download on init (e.g., `["python", "rust"]`). */
readonly languages?: Array<string>
/** Language groups to pre-download (e.g., `["web", "systems", "scripting"]`). */
readonly groups?: Array<string>
/** Processing options for code analysis. */
readonly process?: TreeSitterProcessConfig
}
/**
 * Processing options for tree-sitter code analysis.
 *
 * Controls which analysis features are enabled when extracting code files.
 * Every field is optional; the documented default is given per field.
 */
export interface TreeSitterProcessConfig {
/** Extract structural items (functions, classes, structs, etc.). Default: true. */
readonly structure?: boolean
/** Extract import statements. Default: true. */
readonly imports?: boolean
/** Extract export statements. Default: true. */
readonly exports?: boolean
/** Extract comments. Default: false. */
readonly comments?: boolean
/** Extract docstrings. Default: false. */
readonly docstrings?: boolean
/** Extract symbol definitions. Default: false. */
readonly symbols?: boolean
/** Include parse diagnostics. Default: false. */
readonly diagnostics?: boolean
/** Maximum chunk size in bytes. Leaving it unset (`None` upstream) disables chunking. */
readonly chunkMaxSize?: number
/** Content rendering mode for code extraction. */
readonly contentMode?: CodeContentMode
}
/**
 * Semantic classification of an extracted URI.
 *
 * Each member's value is the lowercase form of the member name.
 */
export declare enum UriKind {
/** A clickable hyperlink (web URL, file link). */
Hyperlink = "hyperlink",
/** An image or media resource reference. */
Image = "image",
/** An internal anchor or cross-reference target. */
Anchor = "anchor",
/** A citation or bibliographic reference (DOI, academic reference). */
Citation = "citation",
/** A general reference (e.g. `\ref{}` in LaTeX, `:ref:` in RST). */
Reference = "reference",
/** An email address (`mailto:` link or bare email). */
Email = "email",
}
/**
 * Trait for validator plugins.
 *
 * Validators check extraction results for quality, completeness, or correctness.
 * Unlike post-processors, validator errors **fail fast**: if a validator returns
 * an error, the extraction fails immediately.
 *
 * Use cases:
 *
 * - **Quality Gates**: Ensure extracted content meets minimum quality standards
 * - **Compliance**: Verify content meets regulatory requirements
 * - **Content Filtering**: Reject documents containing unwanted content
 * - **Format Validation**: Verify extracted content structure
 * - **Security Checks**: Scan for malicious content
 *
 * Error handling:
 *
 * Validator errors are **fatal**: they cause the extraction to fail and bubble up
 * to the caller. Use validators for hard requirements that must be met.
 * For non-fatal checks, use post-processors instead.
 *
 * Thread safety:
 *
 * Upstream requires validators to be thread-safe (`Send + Sync`).
 *
 * In this declaration `name` and `validate` are required, while
 * `shouldValidate` and `priority` are optional members.
 */
export interface Validator {
/**
 * NOTE(review): undocumented in the generated source; presumably returns the
 * name that identifies this validator in the registry (confirm).
 */
name(): string
/**
 * Validate an extraction result.
 *
 * Check the extraction result and signal success if it is valid, or an error
 * if validation fails. The code samples below are Rust implementations of the
 * upstream trait method, kept for reference.
 *
 * @param result - The extraction result to validate
 * @param config - Extraction configuration
 * @returns A promise with no value. Upstream describes the outcome as
 * `Ok(())` if validation passes and `Err(...)` if validation fails
 * (the extraction will then fail).
 * @throws `KreuzbergError::Validation` when validation failed, or any other
 * error type appropriate for the failure.
 *
 * @example Content length validation (Rust)
 * ```rust
 * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
 *     -> Result<()> {
 *     let length = result.content.len();
 *
 *     if length < self.min {
 *         return Err(KreuzbergError::validation(format!(
 *             "Content too short: {} < {} characters",
 *             length, self.min
 *         )));
 *     }
 *
 *     if length > self.max {
 *         return Err(KreuzbergError::validation(format!(
 *             "Content too long: {} > {} characters",
 *             length, self.max
 *         )));
 *     }
 *
 *     Ok(())
 * }
 * ```
 *
 * @example Quality score validation (Rust)
 * ```rust
 * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
 *     -> Result<()> {
 *     // Check if quality_score exists in metadata
 *     let score = result.metadata
 *         .additional
 *         .get("quality_score")
 *         .and_then(|v| v.as_f64())
 *         .unwrap_or(0.0);
 *
 *     if score < self.min_score {
 *         return Err(KreuzbergError::validation(format!(
 *             "Quality score too low: {} < {}",
 *             score, self.min_score
 *         )));
 *     }
 *
 *     Ok(())
 * }
 * ```
 *
 * @example Security validation (Rust)
 * ```rust
 * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
 *     -> Result<()> {
 *     // Check for blocked patterns
 *     for pattern in &self.blocked_patterns {
 *         if result.content.contains(pattern) {
 *             return Err(KreuzbergError::validation(format!(
 *                 "Content contains blocked pattern: {}",
 *                 pattern
 *             )));
 *         }
 *     }
 *
 *     Ok(())
 * }
 * ```
 */
validate(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): Promise<void>
/**
 * Optional: Check if this validator should run for a given result.
 *
 * Allows conditional validation based on MIME type, metadata, or content.
 * Defaults to `true` (always run).
 *
 * NOTE(review): the declared return type is `string`, which does not match
 * the boolean described here. Confirm what the native bridge expects before
 * relying on either form.
 *
 * @param result - The extraction result to check
 * @param config - Extraction configuration
 * @returns `true` if the validator should run, `false` to skip.
 */
shouldValidate?(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): string
/**
 * Optional: Get the validation priority.
 *
 * Higher priority validators run first. Useful for ordering validation checks
 * (e.g., run cheap validations before expensive ones).
 *
 * Default priority is 50.
 *
 * NOTE(review): the declared return type is `string`, although the
 * description speaks of a numeric priority. Confirm what the native bridge
 * expects before relying on either form.
 *
 * @returns Priority value (higher = runs earlier).
 */
priority?(): string
}
/**
 * Application properties from `docProps/app.xml` for XLSX.
 *
 * Contains Excel-specific document metadata. Every field is optional.
 */
export interface XlsxAppProperties {
/** Application name (e.g., "Microsoft Excel"). */
readonly application?: string
/** Application version. */
readonly appVersion?: string
/** Document security level. */
readonly docSecurity?: number
/** Scale crop flag. */
readonly scaleCrop?: boolean
/** Links-up-to-date flag. */
readonly linksUpToDate?: boolean
/** Shared document flag. */
readonly sharedDoc?: boolean
/** Hyperlinks-changed flag. */
readonly hyperlinksChanged?: boolean
/** Company name. */
readonly company?: string
/** Worksheet names. */
readonly worksheetNames?: Array<string>
}
/**
 * XML extraction result.
 *
 * Contains the text content extracted from an XML file along with
 * structural statistics about the XML document. All fields are required.
 */
export interface XmlExtractionResult {
/** Extracted text content (XML structure filtered out). */
readonly content: string
/** Total number of XML elements processed. */
readonly elementCount: number
/** List of unique element names found (sorted). */
readonly uniqueElements: Array<string>
}
/**
 * XML metadata extracted during XML parsing.
 *
 * Provides statistics about the XML document structure.
 * Both fields are optional.
 */
export interface XmlMetadata {
/** Total number of XML elements processed. */
readonly elementCount?: number
/** List of unique element tag names (sorted). */
readonly uniqueElements?: Array<string>
}
/**
 * YAKE-specific parameters.
 *
 * The single field is optional.
 */
export interface YakeParams {
/**
 * Window size for co-occurrence analysis (default: 2).
 *
 * Controls the context window used for computing co-occurrence statistics.
 */
readonly windowSize?: number
}
/**
 * Year range for bibliographic metadata.
 *
 * `min` and `max` are optional; `years` is required.
 *
 * NOTE(review): the fields are undocumented in the generated source; the
 * descriptions below are inferred from names and types (confirm).
 */
export interface YearRange {
/** Presumably the earliest year in the range (confirm). */
readonly min?: number
/** Presumably the latest year in the range (confirm). */
readonly max?: number
/** List of year values; presumably the individual years that were found (confirm). */
readonly years: Array<number>
}
/**
 * List names of all registered document extractors.
 *
 * @returns Array of document extractor names.
 */
export declare function listDocumentExtractors(): Array<string>;
/**
 * List the names of all registered embedding backends.
 *
 * Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
 * bindings.
 *
 * @returns Array of embedding backend names.
 */
export declare function listEmbeddingBackends(): Array<string>;
/**
 * List the names of all available embedding presets.
 *
 * Upstream returns owned `String`s so the values are safe to pass across
 * FFI boundaries.
 *
 * @returns Array of embedding preset names.
 */
export declare function listEmbeddingPresets(): Array<string>;
/**
 * List all registered OCR backends.
 *
 * Returns the names of all OCR backends currently registered in the global
 * registry.
 *
 * @returns Array of OCR backend names.
 */
export declare function listOcrBackends(): Array<string>;
/**
 * List all registered post-processor names.
 *
 * Returns all post-processor names currently registered in the global
 * registry.
 *
 * @returns Array of post-processor names.
 * @throws If the registry lock is poisoned. Upstream reports this as an
 * `Err(...)` result; presumably it surfaces as a thrown error here (confirm).
 */
export declare function listPostProcessors(): Array<string>;
/**
 * List names of all registered renderers.
 *
 * @returns Array of renderer names.
 * @throws If the registry lock is poisoned.
 */
export declare function listRenderers(): Array<string>;
/**
 * List names of all registered validators.
 *
 * @returns Array of validator names.
 */
export declare function listValidators(): Array<string>;
/**
 * Register a document extractor implementation.
 *
 * NOTE(review): undocumented in the generated source; presumably adds `impl`
 * to the registry reported by `listDocumentExtractors()` (confirm, including
 * the behavior on duplicate names).
 *
 * @param impl - Object implementing the `DocumentExtractor` interface
 */
export declare function registerDocumentExtractor(impl: DocumentExtractor): void;
/**
 * Register an embedding backend implementation.
 *
 * NOTE(review): undocumented in the generated source; presumably adds `impl`
 * to the registry reported by `listEmbeddingBackends()` (confirm, including
 * the behavior on duplicate names).
 *
 * @param impl - Object implementing the `EmbeddingBackend` interface
 */
export declare function registerEmbeddingBackend(impl: EmbeddingBackend): void;
/**
 * Register an OCR backend implementation.
 *
 * NOTE(review): undocumented in the generated source; presumably adds `impl`
 * to the global registry reported by `listOcrBackends()` (confirm, including
 * the behavior on duplicate names).
 *
 * @param impl - Object implementing the `OcrBackend` interface
 */
export declare function registerOcrBackend(impl: OcrBackend): void;
/**
 * Register a post-processor implementation.
 *
 * NOTE(review): undocumented in the generated source; presumably adds `impl`
 * to the global registry reported by `listPostProcessors()` (confirm,
 * including the behavior on duplicate names).
 *
 * @param impl - Object implementing the `PostProcessor` interface
 */
export declare function registerPostProcessor(impl: PostProcessor): void;
/**
 * Register a renderer implementation.
 *
 * NOTE(review): undocumented in the generated source; presumably adds `impl`
 * to the registry reported by `listRenderers()` (confirm, including the
 * behavior on duplicate names).
 *
 * @param impl - Object implementing the `Renderer` interface
 */
export declare function registerRenderer(impl: Renderer): void;
/**
 * Register a validator implementation.
 *
 * NOTE(review): undocumented in the generated source; presumably adds `impl`
 * to the registry reported by `listValidators()` (confirm, including the
 * behavior on duplicate names).
 *
 * @param impl - Object implementing the `Validator` interface
 */
export declare function registerValidator(impl: Validator): void;
/**
 * Render a single PDF page to PNG bytes.
 *
 * Returns raw PNG-encoded bytes for the specified page at the given DPI.
 * Uses pdf_oxide with tiny-skia for pure-Rust rendering.
 *
 * @param pdfBytes - Raw PDF file bytes
 * @param pageIndex - Zero-based page index
 * @param dpi - Resolution in dots per inch (default: 150)
 * @param password - Optional password for encrypted PDFs
 * @returns The PNG-encoded bytes of the rendered page.
 * @throws `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
 * or rendered, or if `pageIndex` is out of range.
 */
export declare function renderPdfPageToPng(pdfBytes: Uint8Array, pageIndex: number, dpi?: number | undefined | null, password?: string | undefined | null): Uint8Array;
/**
 * Unregister a document extractor by name.
 *
 * NOTE(review): undocumented in the generated source; presumably removes the
 * entry that `listDocumentExtractors()` reports under `name` (confirm,
 * including the behavior when no such entry exists).
 *
 * @param name - Name of the document extractor to remove
 */
export declare function unregisterDocumentExtractor(name: string): void;
/**
 * Unregister an embedding backend by name.
 *
 * NOTE(review): undocumented in the generated source; presumably removes the
 * entry that `listEmbeddingBackends()` reports under `name` (confirm,
 * including the behavior when no such entry exists).
 *
 * @param name - Name of the embedding backend to remove
 */
export declare function unregisterEmbeddingBackend(name: string): void;
/**
 * Unregister an OCR backend by name.
 *
 * NOTE(review): undocumented in the generated source; presumably removes the
 * entry that `listOcrBackends()` reports under `name` (confirm, including
 * the behavior when no such entry exists).
 *
 * @param name - Name of the OCR backend to remove
 */
export declare function unregisterOcrBackend(name: string): void;
/**
 * Unregister a post-processor by name.
 *
 * NOTE(review): undocumented in the generated source; presumably removes the
 * entry that `listPostProcessors()` reports under `name` (confirm, including
 * the behavior when no such entry exists).
 *
 * @param name - Name of the post-processor to remove
 */
export declare function unregisterPostProcessor(name: string): void;
/**
 * Unregister a renderer by name.
 *
 * NOTE(review): undocumented in the generated source; presumably removes the
 * entry that `listRenderers()` reports under `name` (confirm, including the
 * behavior when no such entry exists).
 *
 * @param name - Name of the renderer to remove
 */
export declare function unregisterRenderer(name: string): void;
/**
 * Unregister a validator by name.
 *
 * NOTE(review): undocumented in the generated source; presumably removes the
 * entry that `listValidators()` reports under `name` (confirm, including the
 * behavior when no such entry exists).
 *
 * @param name - Name of the validator to remove
 */
export declare function unregisterValidator(name: string): void;