// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
/* eslint-disable */

/**
 * Any value representable in JSON: a primitive, `null`, an array of JSON
 * values, or a string-keyed object of JSON values (recursively).
 */
export type JsonValue = string | number | boolean | null | JsonValue[] | { [key: string]: JsonValue };

/**
 * Extract content from multiple byte arrays concurrently.
 *
 * This function processes multiple byte arrays in parallel, automatically managing
 * concurrency to prevent resource exhaustion. The concurrency limit can be
 * configured via `ExtractionConfig::max_concurrent_extractions` or defaults
 * to `(num_cpus * 1.5).ceil()`.
 *
 * Each item can optionally specify a `FileExtractionConfig` that overrides specific
 * fields from the batch-level `config`. Omit an item's `config` to use
 * the batch-level defaults for that item.
 *
 * @param items - Array of `BatchBytesItem` objects, each containing content bytes, MIME type, and optional per-item configuration overrides.
 * @param config - Batch-level extraction configuration.
 * @returns An array of `ExtractionResult` in the same order as the input items.
 */
export declare function batchExtractBytes(items: Array<BatchBytesItem>, config?: ExtractionConfig | undefined | null): Promise<Array<ExtractionResult>>;

/**
 * Synchronous wrapper for `batchExtractBytes`.
 *
 * Uses the global Tokio runtime for optimal performance.
 * With the `tokio-runtime` feature, this blocks the current thread using the global
 * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
 * that iterates through items and calls the synchronous single-item extraction
 * (`extract_bytes_sync()` in the Rust core, exposed here as `extractBytesSync`).
 */
export declare function batchExtractBytesSync(items: Array<BatchBytesItem>, config?: ExtractionConfig | undefined | null): Array<ExtractionResult>;

/**
 * Extract content from multiple files concurrently.
 *
 * This function processes multiple files in parallel, automatically managing
 * concurrency to prevent resource exhaustion. The concurrency limit can be
 * configured via `ExtractionConfig::max_concurrent_extractions` or defaults
 * to `(num_cpus * 1.5).ceil()`.
 *
 * Each file can optionally specify a `FileExtractionConfig` that overrides specific
 * fields from the batch-level `config`. Omit a file's `config` to use the batch defaults.
 * Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
 * taken from the batch-level `config`.
 *
 * @param items - Array of `BatchFileItem` objects, each containing a path and optional per-file configuration overrides.
 * @param config - Batch-level extraction configuration (provides defaults and batch settings).
 * @returns An array of `ExtractionResult` in the same order as the input items.
 * @throws System errors (IO, RuntimeError equivalents) bubble up and fail the entire
 * batch. Individual file errors do not throw; they are captured in the result metadata.
 */
export declare function batchExtractFiles(items: Array<BatchFileItem>, config?: ExtractionConfig | undefined | null): Promise<Array<ExtractionResult>>;

/**
 * Synchronous wrapper for `batchExtractFiles`.
 *
 * Uses the global Tokio runtime for optimal performance.
 * Only available with `tokio-runtime` (WASM has no filesystem).
 */
export declare function batchExtractFilesSync(items: Array<BatchFileItem>, config?: ExtractionConfig | undefined | null): Array<ExtractionResult>;

/**
 * Clear all document extractors from the global registry.
 *
 * Calls `shutdown()` on every registered extractor, then empties the registry.
 *
 * @throws Any error returned by an extractor's `shutdown()` method. The first error
 * encountered stops processing of remaining extractors.
 */
export declare function clearDocumentExtractors(): void;

/**
 * Clear all embedding backends from the global registry.
 *
 * Calls `shutdown()` on every registered backend, then empties the registry.
 *
 * @throws Any error returned by a backend's `shutdown()` method. The first error
 * encountered stops processing of remaining backends.
 */
export declare function clearEmbeddingBackends(): void;

/**
 * Clear all OCR backends from the global registry.
 *
 * Removes all OCR backends and calls their `shutdown()` methods.
 * Returns nothing when all backends were cleared successfully (the Rust `Ok(())` case).
 *
 * @throws If any backend's `shutdown()` method failed (the Rust `Err(...)` case).
 * NOTE(review): the declared return type is `void`, so the failure is presumably
 * surfaced as a thrown exception, as documented for the sibling `clear*` functions.
 */
export declare function clearOcrBackends(): void;

/** Remove all registered post-processors. */
export declare function clearPostProcessors(): void;

/**
 * Clear all renderers from the global registry.
 *
 * Removes every renderer, including the built-in defaults (markdown, html,
 * djot, plain). After calling this no renderers are registered; re-register
 * as needed.
 *
 * @throws If the registry lock is poisoned.
 */
export declare function clearRenderers(): void;

/** Remove all registered validators. */
export declare function clearValidators(): void;

/**
 * Compare two extraction results and return a structured diff.
 *
 * The comparison is purely structural — no I/O, no side effects. All fields
 * of `ExtractionDiff` are populated according to the provided `DiffOptions`.
 *
 * NOTE(review): `a` and `b` are declared optional/nullable in this binding even
 * though the comparison needs both sides — confirm what happens when one is omitted.
 *
 * @param a - The "before" extraction result.
 * @param b - The "after" extraction result.
 * @param opts - Controls which sections are compared and optional truncation.
 */
export declare function compare(a?: ExtractionResult | undefined | null, b?: ExtractionResult | undefined | null, opts?: DiffOptions | undefined | null): ExtractionDiff;

/**
 * Detect the MIME type of a file at the given path.
 *
 * Uses the file extension and optionally the file content to determine the MIME type.
 *
 * @param path - Path of the file whose MIME type should be detected.
 * @param checkExists - Set to `true` to verify the file exists before detection.
 * @returns The detected MIME type string.
 */
export declare function detectMimeType(path: string, checkExists: boolean): string;

/**
 * Detect MIME type from raw file bytes.
 *
 * Uses magic byte signatures to detect file type from content.
 * Falls back to `infer` crate for comprehensive detection.
 *
 * For ZIP-based files, inspects contents to distinguish Office Open XML
 * formats (DOCX, XLSX, PPTX) from plain ZIP archives.
 *
 * @param content - Raw file bytes.
 * @returns The detected MIME type string.
 * @throws `KreuzbergError::UnsupportedFormat` if the MIME type cannot be determined.
 */
export declare function detectMimeTypeFromBytes(content: Uint8Array): string;

/**
 * Embed a list of texts using the configured embedding model.
 *
 * @param texts - Strings to embed.
 * @param config - Embedding configuration.
 * @returns A 2D array where each inner array is the embedding for the corresponding text.
 */
export declare function embedTexts(texts: Array<string>, config?: EmbeddingConfig | undefined | null): Array<Array<number>>;

/**
 * Generate embeddings asynchronously for a list of text strings.
 *
 * This is the async counterpart to `embedTexts`. It offloads the blocking
 * ONNX inference work to a dedicated blocking thread pool via Tokio's
 * `spawn_blocking`, keeping the async executor free.
 *
 * @param texts - Strings to embed (owned, sent to the blocking thread).
 * @param config - Embedding configuration specifying model, batch size, and normalization.
 * @returns One embedding vector per input text, in the same order.
 * @throws `KreuzbergError::MissingDependency` if ONNX Runtime is not installed.
 * @throws `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
 * or the blocking inference task panics.
 */
export declare function embedTextsAsync(texts: Array<string>, config?: EmbeddingConfig | undefined | null): Promise<Array<Array<number>>>;

/**
 * Extract content from a byte array.
 *
 * This is the main entry point for in-memory extraction. It performs the following steps:
 * 1. Validate MIME type
 * 2. Handle legacy format conversion if needed
 * 3. Select appropriate extractor from registry
 * 4. Extract content
 * 5. Run post-processing pipeline
 *
 * @param content - The byte array to extract.
 * @param mimeType - MIME type of the content.
 * @param config - Extraction configuration.
 * @returns An `ExtractionResult` containing the extracted content and metadata.
 * @throws `KreuzbergError::Validation` if the MIME type is invalid.
 * @throws `KreuzbergError::UnsupportedFormat` if the MIME type is not supported.
 */
export declare function extractBytes(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): Promise<ExtractionResult>;

/**
 * Synchronous wrapper for `extractBytes`.
 *
 * Uses the global Tokio runtime for 100x+ performance improvement over creating
 * a new runtime per call.
 *
 * With the `tokio-runtime` feature, this blocks the current thread using the global
 * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
 */
export declare function extractBytesSync(content: Uint8Array, mimeType: string, config?: ExtractionConfig | undefined | null): ExtractionResult;

/**
 * Extract content from a file.
 *
 * This is the main entry point for file-based extraction. It performs the following steps:
 * 1. Check cache for existing result (if caching enabled)
 * 2. Detect or validate MIME type
 * 3. Select appropriate extractor from registry
 * 4. Extract content
 * 5. Run post-processing pipeline
 * 6. Store result in cache (if caching enabled)
 *
 * @param path - Path to the file to extract.
 * @param mimeType - Optional MIME type override. If omitted (or `null`), the type is auto-detected.
 * @param config - Extraction configuration.
 * @returns An `ExtractionResult` containing the extracted content and metadata.
 * @throws `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
 * @throws `KreuzbergError::UnsupportedFormat` if the MIME type is not supported.
 */
export declare function extractFile(path: string, mimeType?: string | undefined | null, config?: ExtractionConfig | undefined | null): Promise<ExtractionResult>;

/**
 * Synchronous wrapper for `extractFile`.
 *
 * This is a convenience function that blocks the current thread until extraction completes.
 * For async code, use `extractFile` directly.
 *
 * Uses the global Tokio runtime for 100x+ performance improvement over creating
 * a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
 *
 * This function is only available with the `tokio-runtime` feature. For WASM targets,
 * use a truly synchronous extraction approach instead.
 */
export declare function extractFileSync(path: string, mimeType?: string | undefined | null, config?: ExtractionConfig | undefined | null): ExtractionResult;

/**
 * Get an embedding preset by name.
 *
 * The returned value is an owned clone, so it is safe to pass across FFI boundaries.
 *
 * @param name - Name of the preset to look up.
 * @returns The matching preset, or `null` if no preset with the given name exists.
 */
export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;

/**
 * Get file extensions for a given MIME type.
 *
 * Returns all known file extensions that map to the specified MIME type.
 *
 * @param mimeType - The MIME type to look up.
 * @returns An array of file extensions (without leading dot) for the MIME type.
 */
export declare function getExtensionsForMime(mimeType: string): Array<string>;

/**
 * Hardware acceleration configuration for ONNX Runtime models.
 *
 * Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
 * for inference in layout detection and embedding generation.
 */
export interface AccelerationConfig {
  /** Execution provider to use for ONNX inference. */
  readonly provider?: ExecutionProviderType;
  /** GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto. */
  readonly deviceId?: number;
}

/**
 * Types of inline text annotations.
 *
 * Discriminated union keyed on `annotation_type`; the `link`, `color`,
 * `font_size` and `custom` variants carry extra payload fields.
 */
export type AnnotationKind =
  | { annotation_type: 'bold' }
  | { annotation_type: 'italic' }
  | { annotation_type: 'underline' }
  | { annotation_type: 'strikethrough' }
  | { annotation_type: 'code' }
  | { annotation_type: 'subscript' }
  | { annotation_type: 'superscript' }
  | { annotation_type: 'link'; url: string; title: string }
  | { annotation_type: 'highlight' }
  | { annotation_type: 'color'; value: string }
  | { annotation_type: 'font_size'; value: string }
  | { annotation_type: 'custom'; name: string; value: string };

/**
 * A single file extracted from an archive.
 *
 * When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
 * enabled, each processable file produces its own full `ExtractionResult`.
 */
export interface ArchiveEntry {
  /** Archive-relative file path (e.g. "folder/document.pdf"). */
  readonly path: string;
  /** Detected MIME type of the file. */
  readonly mimeType: string;
  /** Full extraction result for this file. */
  readonly result: ExtractionResult;
}

/**
 * Archive (ZIP/TAR/7Z) metadata.
 *
 * Extracted from compressed archive files containing file lists and size information.
 */
export interface ArchiveMetadata {
  /** Archive format ("ZIP", "TAR", "7Z", etc.) */
  readonly format?: string;
  /** Total number of files in the archive */
  readonly fileCount?: number;
  /** List of file paths within the archive */
  readonly fileList?: Array<string>;
  /** Total uncompressed size in bytes */
  readonly totalSize?: number;
  /** Compressed size in bytes (if available) */
  readonly compressedSize?: number;
}

/**
 * Batch item for byte array extraction.
 *
 * Used with `batchExtractBytes` and `batchExtractBytesSync`
 * to represent a single item in a batch extraction job.
 */
export interface BatchBytesItem {
  /** The content bytes to extract from */
  readonly content: Uint8Array;
  /** MIME type of the content (e.g., "application/pdf", "text/html") */
  readonly mimeType: string;
  /** Per-item configuration overrides (omit to use batch-level defaults) */
  readonly config?: FileExtractionConfig;
}

/**
 * Batch item for file extraction.
 *
 * Used with `batchExtractFiles` and `batchExtractFilesSync`
 * to represent a single file in a batch extraction job.
 */
export interface BatchFileItem {
  /** Path to the file to extract from */
  readonly path: string;
  /** Per-file configuration overrides (omit to use batch-level defaults) */
  readonly config?: FileExtractionConfig;
}

/** Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right. */
export interface BBox {
  /** X-coordinate of the top-left corner. */
  readonly x1: number;
  /** Y-coordinate of the top-left corner. */
  readonly y1: number;
  /** X-coordinate of the bottom-right corner. */
  readonly x2: number;
  /** Y-coordinate of the bottom-right corner. */
  readonly y2: number;
}

/**
 * BibTeX bibliography metadata.
 *
 * NOTE(review): only `entryCount` is documented upstream; the meaning of the
 * remaining fields is implied by their names — confirm against the Rust source.
 */
export interface BibtexMetadata {
  /** Number of entries in the bibliography. */
  readonly entryCount?: number;
  readonly citationKeys?: Array<string>;
  readonly authors?: Array<string>;
  readonly yearRange?: YearRange;
  /** String-keyed numeric map — presumably entry type to occurrence count; TODO confirm. */
  readonly entryTypes?: Record<string, number>;
}

/** Types of block-level elements in Djot. */
export declare enum BlockType {
  Paragraph = "paragraph",
  Heading = "heading",
  Blockquote = "blockquote",
  CodeBlock = "code_block",
  ListItem = "list_item",
  OrderedList = "ordered_list",
  BulletList = "bullet_list",
  TaskList = "task_list",
  DefinitionList = "definition_list",
  DefinitionTerm = "definition_term",
  DefinitionDescription = "definition_description",
  Div = "div",
  Section = "section",
  ThematicBreak = "thematic_break",
  RawBlock = "raw_block",
  MathDisplay = "math_display",
}

/** Bounding box coordinates for element positioning. */
export interface BoundingBox {
  /** Left x-coordinate */
  readonly x0?: number;
  /** Bottom y-coordinate */
  readonly y0?: number;
  /** Right x-coordinate */
  readonly x1?: number;
  /** Top y-coordinate */
  readonly y1?: number;
}

/**
 * Cache statistics.
 *
 * NOTE(review): this type is undocumented upstream. Units are implied only by the
 * field-name suffixes (`Mb`, `Days`) — confirm against the Rust source.
 */
export interface CacheStats {
  readonly totalFiles: number;
  readonly totalSizeMb: number;
  readonly availableSpaceMb: number;
  readonly oldestFileAgeDays: number;
  readonly newestFileAgeDays: number;
}

/**
 * A single changed cell within a table.
 *
 * Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate::diff` re-exports this type verbatim.
 */
export interface CellChange {
  /** Zero-based row index. */
  readonly row: number;
  /** Zero-based column index. */
  readonly col: number;
  /** Value before the change. */
  readonly from: string;
  /** Value after the change. */
  readonly to: string;
}

/**
 * A text chunk with optional embedding and metadata.
 *
 * Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
 * contains the text content, optional embedding vector (if embedding generation
 * is configured), and metadata about its position in the document.
 */
export interface Chunk {
  /** The text content of this chunk. */
  readonly content: string;
  /**
   * Semantic structural classification of this chunk.
   *
   * Assigned by the heuristic classifier based on content patterns and
   * heading context. Defaults to `ChunkType.Unknown` when no rule matches.
   */
  readonly chunkType: ChunkType;
  /**
   * Optional embedding vector for this chunk.
   *
   * Only populated when `EmbeddingConfig` is provided in chunking configuration.
   * The dimensionality depends on the chosen embedding model.
   */
  readonly embedding?: Array<number>;
  /** Metadata about this chunk's position and properties. */
  readonly metadata: ChunkMetadata;
}

/** Type of text chunker to use. */
export declare enum ChunkerType {
  /** Generic text splitter, splits on whitespace and punctuation. */
  Text = "text",
  /** Markdown-aware splitter, preserves formatting and structure. */
  Markdown = "markdown",
  /** YAML-aware splitter, creates one chunk per top-level key. */
  Yaml = "yaml",
  /**
   * Topic-aware chunker. With an `EmbeddingConfig`, splits at
   * embedding-based topic shifts tuned by `topicThreshold` (default 0.75,
   * lower = more splits). Without an embedding, falls back to a
   * structural-boundary heuristic (ALL-CAPS headers, numbered sections,
   * blank-line paragraphs) and merges groups into chunks capped at
   * `maxCharacters` (default 1000). `topicThreshold` has no effect in the
   * fallback path. For best results, pair with an embedding model.
   */
  Semantic = "semantic",
}

/**
 * Chunking configuration.
 *
 * Configures text chunking for document content, including chunk size,
 * overlap, trimming behavior, and optional embeddings.
 *
 * Every field is optional, so set only the fields you need; this also stays
 * valid when fields are added later (the equivalent of Rust's
 * `..Default::default()`):
 * ```ts
 * const config: ChunkingConfig = {
 *   maxCharacters: 500,
 * };
 * ```
 */
export interface ChunkingConfig {
  /**
   * Maximum size per chunk (in units determined by `sizing`).
   *
   * When `sizing` is `Characters` (default), this is the max character count.
   * When using token-based sizing, this is the max token count.
   *
   * Default: 1000
   */
  readonly maxCharacters?: number;
  /**
   * Overlap between chunks (in units determined by `sizing`).
   *
   * Default: 200
   */
  readonly overlap?: number;
  /**
   * Whether to trim whitespace from chunk boundaries.
   *
   * Default: true
   */
  readonly trim?: boolean;
  /**
   * Type of chunker to use (`Text`, `Markdown`, `Yaml`, or `Semantic`).
   *
   * Default: Text
   */
  readonly chunkerType?: ChunkerType;
  /** Optional embedding configuration for chunk embeddings. */
  readonly embedding?: EmbeddingConfig;
  /** Use a preset configuration (overrides individual settings if provided). */
  readonly preset?: string;
  /**
   * How to measure chunk size.
   *
   * Default: `Characters` (Unicode character count).
   * Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
   */
  readonly sizing?: ChunkSizing;
  /**
   * When `true` and `chunkerType` is `Markdown`, prepend the heading hierarchy
   * path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string.
   *
   * This is useful for RAG pipelines where each chunk needs self-contained
   * context about its position in the document structure.
   *
   * Default: `false`
   */
  readonly prependHeadingContext?: boolean;
  /**
   * Optional cosine similarity threshold for semantic topic boundary detection.
   *
   * Only used when `chunkerType` is `Semantic` and an `EmbeddingConfig` is
   * provided. You almost never need to set this. When omitted, defaults to
   * `0.75` which works well for most documents. Lower values detect more
   * topic boundaries (more, smaller chunks); higher values detect fewer.
   * Range: `0.0..=1.0`.
   */
  readonly topicThreshold?: number;
}

/** Metadata about a chunk's position in the original document. */
export interface ChunkMetadata {
  /** Byte offset where this chunk starts in the original text (UTF-8 valid boundary). */
  readonly byteStart: number;
  /** Byte offset where this chunk ends in the original text (UTF-8 valid boundary). */
  readonly byteEnd: number;
  /**
   * Number of tokens in this chunk (if available).
   *
   * This is calculated by the embedding model's tokenizer if embeddings are enabled.
   */
  readonly tokenCount?: number;
  /** Zero-based index of this chunk in the document. */
  readonly chunkIndex: number;
  /** Total number of chunks in the document. */
  readonly totalChunks: number;
  /**
   * First page number this chunk spans (1-indexed).
   *
   * Only populated when page tracking is enabled in extraction configuration.
   */
  readonly firstPage?: number;
  /**
   * Last page number this chunk spans (1-indexed, equal to `firstPage` for single-page chunks).
   *
   * Only populated when page tracking is enabled in extraction configuration.
   */
  readonly lastPage?: number;
  /**
   * Heading context when using Markdown chunker.
   *
   * Contains the heading hierarchy this chunk falls under.
   * Only populated when `ChunkerType.Markdown` is used.
   */
  readonly headingContext?: HeadingContext;
  /**
   * Indices into `ExtractionResult.images` for images on pages covered by this chunk.
   *
   * Contains zero-based indices into the top-level `images` collection for every
   * image whose `page_number` falls within `[firstPage, lastPage]`.
   * Empty when image extraction is disabled or the chunk spans no pages with images.
   */
  readonly imageIndices: Array<number>;
}

/**
 * How chunk size is measured.
 *
 * Defaults to `Characters` (Unicode character count). When using token-based sizing,
 * chunks are sized by token count according to the specified tokenizer.
 *
 * Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
 * available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
 * (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 */
export type ChunkSizing =
  | { type: 'characters' }
  | { type: 'tokenizer'; model: string; cacheDir: string };

/**
 * Semantic structural classification of a text chunk.
 *
 * Assigned by the heuristic classifier in `chunking::classifier`.
 * Defaults to `Unknown` when no rule matches.
 * Designed to be extended in future versions without breaking changes.
 */
export declare enum ChunkType {
  /** Section heading or document title. */
  Heading = "heading",
  /** Party list: names, addresses, and signatories. */
  PartyList = "party_list",
  /** Definition clause ("X means…", "X shall mean…"). */
  Definitions = "definitions",
  /** Operative clause containing legal/contractual action verbs. */
  OperativeClause = "operative_clause",
  /** Signature block with signatures, names, and dates. */
  SignatureBlock = "signature_block",
  /** Schedule, annex, appendix, or exhibit section. */
  Schedule = "schedule",
  /** Table-like content with aligned columns or repeated patterns. */
  TableLike = "table_like",
  /** Mathematical formula or equation. */
  Formula = "formula",
  /** Code block or preformatted content. */
  CodeBlock = "code_block",
  /** Embedded or referenced image content. */
  Image = "image",
  /** Organizational chart or hierarchy diagram. */
  OrgChart = "org_chart",
  /** Diagram, figure, or visual illustration. */
  Diagram = "diagram",
  /** Unclassified or mixed content. */
  Unknown = "unknown",
}

/**
 * Citation file metadata (RIS, PubMed, EndNote).
 *
 * NOTE(review): the individual fields are undocumented upstream; their meaning
 * is implied by their names only — confirm against the Rust source.
 */
export interface CitationMetadata {
  readonly citationCount?: number;
  readonly format?: string;
  readonly authors?: Array<string>;
  readonly yearRange?: YearRange;
  readonly dois?: Array<string>;
  readonly keywords?: Array<string>;
}

/**
 * Content rendering mode for code extraction.
 *
 * Controls how extracted code content is represented in the `content` field
 * of `ExtractionResult`.
 */
export declare enum CodeContentMode {
  /** Use TSLP semantic chunks as content (default). */
  Chunks = "chunks",
  /** Use raw source code as content. */
  Raw = "raw",
  /** Emit function/class headings + docstrings (no code bodies). */
  Structure = "structure",
}

/**
 * Cross-extractor content filtering configuration.
 *
 * Controls whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is included in or stripped from extraction
 * results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
 * with format-specific implementation.
 *
 * When not set on `ExtractionConfig`, each extractor uses its current
 * default behavior unchanged.
 */
export interface ContentFilterConfig {
  /**
   * Include running headers in extraction output.
   *
   * - PDF: Disables top-margin furniture stripping and prevents the layout
   *   model from treating `PageHeader`-classified regions as furniture.
   * - DOCX: Includes document headers in text output.
   * - RTF/ODT: Headers already included; this is a no-op when true.
   * - HTML/EPUB: Keeps `<header>` element content.
   *
   * Default: `false` (headers are stripped or excluded).
   */
  readonly includeHeaders?: boolean;
  /**
   * Include running footers in extraction output.
   *
   * - PDF: Disables bottom-margin furniture stripping and prevents the layout
   *   model from treating `PageFooter`-classified regions as furniture.
   * - DOCX: Includes document footers in text output.
   * - RTF/ODT: Footers already included; this is a no-op when true.
   * - HTML/EPUB: Keeps `<footer>` element content.
   *
   * Default: `false` (footers are stripped or excluded).
   */
  readonly includeFooters?: boolean;
  /**
   * Enable the heuristic cross-page repeating text detector.
   *
   * When `true` (default), text that repeats verbatim across a supermajority
   * of pages is classified as furniture and stripped. Disable this if brand
   * names or repeated headings are being incorrectly removed by the heuristic.
   *
   * Note: when a layout-detection model is active, the model may independently
   * classify page-header / page-footer regions as furniture on a per-page basis.
   * To preserve those regions, set `includeHeaders: true`, `includeFooters: true`,
   * or both, in addition to disabling this flag.
   *
   * Primarily affects PDF extraction.
   *
   * Default: `true`.
   */
  readonly stripRepeatingText?: boolean;
  /**
   * Include watermark text in extraction output.
   *
   * - PDF: Keeps watermark artifacts and arXiv identifiers.
   * - Other formats: No effect currently.
   *
   * Default: `false` (watermarks are stripped).
   */
  readonly includeWatermarks?: boolean;
}

/**
 * Content layer classification for document nodes.
 *
 * Replaces separate body/furniture arrays with per-node granularity.
 */
export declare enum ContentLayer {
  /** Main document body content. */
  Body = "body",
  /** Page/section header (running header). */
  Header = "header",
  /** Page/section footer (running footer). */
  Footer = "footer",
  /** Footnote content. */
  Footnote = "footnote",
}

/** JATS contributor with role. */
export interface ContributorRole {
  /** Contributor name. */
  readonly name: string;
  /** Contributor role; may be absent. */
  readonly role?: string;
}

/**
 * Dublin Core metadata from docProps/core.xml
 *
 * Contains standard metadata fields defined by the Dublin Core standard
 * and Office-specific extensions.
 */
export interface CoreProperties {
  /** Document title */
  readonly title?: string;
  /** Document subject/topic */
  readonly subject?: string;
  /** Document creator/author */
  readonly creator?: string;
  /** Keywords or tags */
  readonly keywords?: string;
  /** Document description/abstract */
  readonly description?: string;
  /** User who last modified the document */
  readonly lastModifiedBy?: string;
  /** Revision number */
  readonly revision?: string;
  /** Creation timestamp (ISO 8601) */
  readonly created?: string;
  /** Last modification timestamp (ISO 8601) */
  readonly modified?: string;
  /** Document category */
  readonly category?: string;
  /** Content status (Draft, Final, etc.) */
  readonly contentStatus?: string;
  /** Document language */
  readonly language?: string;
  /** Unique identifier */
  readonly identifier?: string;
  /** Document version */
  readonly version?: string;
  /** Last print timestamp (ISO 8601) */
  readonly lastPrinted?: string;
}

/**
 * CSV/TSV file metadata.
 *
 * NOTE(review): the individual fields are undocumented upstream; their meaning
 * is implied by their names only — confirm against the Rust source.
 */
export interface CsvMetadata {
  readonly rowCount?: number;
  readonly columnCount?: number;
  readonly delimiter?: string;
  readonly hasHeader?: boolean;
  readonly columnTypes?: Array<string>;
}

/** Describes a single dBASE field. */
export interface DbfFieldInfo {
  readonly name: string;
  readonly fieldType: string;
}
|
||
|
||
/** Metadata describing a dBASE (DBF) file. */
export interface DbfMetadata {
  readonly recordCount?: number;
  readonly fieldCount?: number;
  readonly fields?: DbfFieldInfo[];
}
|
||
|
||
/** Detection result for one page: every detection plus the page metadata. */
export interface DetectionResult {
  readonly pageWidth: number;
  readonly pageHeight: number;
  readonly detections: LayoutDetection[];
}
|
||
|
||
/** Response returned by MIME type detection. */
export interface DetectResponse {
  /** The MIME type that was detected. */
  readonly mimeType: string;
  /** The original filename, when one was provided. */
  readonly filename?: string;
}
|
||
|
||
/** One contiguous hunk of a unified diff. */
export interface DiffHunk {
  /** First line of the hunk in the old content (0-indexed). */
  readonly fromLine: number;
  /** How many lines of the old content this hunk covers. */
  readonly fromCount: number;
  /** First line of the hunk in the new content (0-indexed). */
  readonly toLine: number;
  /** How many lines of the new content this hunk covers. */
  readonly toCount: number;
  /** The lines this hunk consists of. */
  readonly lines: DiffLine[];
}
|
||
|
||
/**
 * One line inside a unified-diff hunk.
 *
 * On the Rust side the type lives outside `crate::diff` so that
 * `RevisionDelta` can always reference it, even when the `diff` Cargo feature
 * is disabled; `crate::diff` re-exports it unchanged.
 */
export type DiffLine =
  | { kind: 'context'; 0: string }
  | { kind: 'added'; 0: string }
  | { kind: 'removed'; 0: string };
|
||
|
||
/** Options that control how two `ExtractionResult` values are compared. */
export interface DiffOptions {
  /** Whether metadata changes are part of the diff. Default: `true`. */
  readonly includeMetadata?: boolean;
  /** Whether embedded-children changes are part of the diff. Default: `true`. */
  readonly includeEmbedded?: boolean;
  /**
   * Number of characters the content is truncated to before diffing.
   *
   * Handy for very large documents where only the first N characters matter.
   * `None` (on the Rust side) means the content is not truncated.
   */
  readonly maxContentChars?: number;
}
|
||
|
||
/**
 * Full Djot document structure that keeps the markup's semantics.
 *
 * It captures the richness of Djot markup, including:
 * - block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - attributes (classes, IDs, key-value pairs)
 * - links, images and footnotes
 * - math expressions (inline and display)
 * - tables with their full structure
 *
 * Only available when the `djot` feature is enabled.
 */
export interface DjotContent {
  /** Plain-text rendering, kept for backwards compatibility. */
  readonly plainText: string;
  /** Structured block-level content. */
  readonly blocks: FormattedBlock[];
  /** Metadata taken from the YAML frontmatter. */
  readonly metadata: Metadata;
  /** Tables extracted as structured data. */
  readonly tables: Table[];
  /** Images extracted together with their metadata. */
  readonly images: DjotImage[];
  /** Links extracted together with their URLs. */
  readonly links: DjotLink[];
  /** Footnote definitions. */
  readonly footnotes: Footnote[];
  /**
   * Attributes mapped by element identifier (if present).
   *
   * NOTE(review): declared as a plain string array although described as a
   * mapping; the entry format is not visible from this declaration, so confirm
   * it against the binding.
   */
  readonly attributes: string[];
}
|
||
|
||
/** An image element of a Djot document. */
export interface DjotImage {
  /** URL or path of the image source. */
  readonly src: string;
  /** Alternative text. */
  readonly alt: string;
  /** Title, if any. */
  readonly title?: string;
  /** Attributes of the element. */
  readonly attributes?: string;
}
|
||
|
||
/** A link element of a Djot document. */
export interface DjotLink {
  /** URL the link points to. */
  readonly url: string;
  /** Text content of the link. */
  readonly text: string;
  /** Title, if any. */
  readonly title?: string;
  /** Attributes of the element. */
  readonly attributes?: string;
}
|
||
|
||
/**
 * Plugin contract for document extractors.
 *
 * Implement it to add support for a new document format, or to replace the
 * built-in extraction behavior with custom logic.
 *
 * ## Return type
 *
 * Extractors return an `InternalDocument`, a flat intermediate representation.
 * The pipeline's derivation step converts it into the public
 * `ExtractionResult`.
 *
 * ## Priority system
 *
 * When more than one extractor supports the same MIME type, the registry picks
 * the one with the highest priority value. The default priority is 50, so:
 * - a priority above 50 overrides the built-in extractors
 * - a priority below 50 provides a fallback extractor
 * - specialized extractors can be implemented for specific use cases
 *
 * ## Thread safety
 *
 * Extractors must be thread-safe (`Send + Sync`) so that extraction can run
 * concurrently.
 */
export interface DocumentExtractor {
  name(): string;

  /**
   * Core extraction method: extracts content from in-memory document bytes.
   *
   * NOTE(review): the declared result is `Promise<string>` while the contract
   * below speaks of an `InternalDocument`; presumably the string is a
   * serialized form - confirm against the binding.
   *
   * @param content - Raw document bytes
   * @param mimeType - MIME type of the document (already validated)
   * @param config - Extraction configuration
   * @returns An `InternalDocument` containing the extracted elements, metadata,
   * and tables. The pipeline converts it into the public `ExtractionResult`.
   * @throws `KreuzbergError::Parsing` - Document parsing failed
   * @throws `KreuzbergError::Validation` - Invalid document structure
   * @throws `KreuzbergError::Io` - I/O errors (these always bubble up)
   * @throws `KreuzbergError::MissingDependency` - Required dependency not available
   */
  extractBytes(content: Uint8Array, mimeType: string, config?: ExtractionConfig | null | undefined): Promise<string>;

  /**
   * Extracts content from a file.
   *
   * The default implementation reads the file and delegates to
   * `extract_bytes`. Override it for custom file handling, streaming, or
   * memory optimizations.
   *
   * @param path - Path to the document file
   * @param mimeType - MIME type of the document (already validated)
   * @param config - Extraction configuration
   * @returns An `InternalDocument` containing the extracted elements, metadata,
   * and tables.
   * @throws Same as `extract_bytes`, plus file I/O errors.
   */
  extractFile?(path: string, mimeType: string, config?: ExtractionConfig | null | undefined): Promise<string>;

  /**
   * Lists the MIME types this extractor supports.
   *
   * Both exact MIME types and prefix patterns are allowed:
   * - Exact: `"application/pdf"`, `"text/plain"`
   * - Prefix: `"image/*"` (matches any image type)
   *
   * NOTE(review): described as a collection of MIME type strings but declared
   * as a single `string`; the expected encoding is not visible here - confirm
   * against the binding.
   *
   * @returns A slice of MIME type strings.
   */
  supportedMimeTypes(): string;

  /**
   * Reports the priority of this extractor.
   *
   * Among extractors supporting the same MIME type, those with a higher
   * priority are preferred.
   *
   * Priority guidelines:
   * - **0-25**: Fallback/low-quality extractors
   * - **26-49**: Alternative extractors
   * - **50**: Default priority (built-in extractors)
   * - **51-75**: Premium/enhanced extractors
   * - **76-100**: Specialized/high-priority extractors
   *
   * NOTE(review): described as a numeric value but declared as `string` -
   * confirm the expected encoding against the binding.
   *
   * @returns Priority value (default: 50)
   */
  priority?(): string;

  /**
   * Optional check for whether this extractor can handle a specific file.
   *
   * Enables detection that is more sophisticated than MIME type matching.
   * Defaults to `true` (i.e. MIME type matching alone decides).
   *
   * NOTE(review): described as returning a boolean but declared as `string` -
   * confirm the expected encoding against the binding.
   *
   * @param path - Path to the file to check
   * @param mimeType - Detected MIME type
   * @returns `true` if the extractor can handle this file, `false` otherwise.
   */
  canHandle?(path: string, mimeType: string): string;
}
|
||
|
||
/**
 * One node of the document tree.
 *
 * Every node carries a deterministic `id` and typed `content`, optional
 * `parent`/`children` links that form the tree, and metadata such as page
 * number, bounding box, and content layer.
 */
export interface DocumentNode {
  /** Deterministic identifier (a hash of content + position). */
  readonly id: string;
  /** The node's content: a tagged enum holding type-specific data only. */
  readonly content: NodeContent;
  /** Index of the parent node (`None` = root-level node). */
  readonly parent?: number;
  /** Indices of the child nodes, in reading order. */
  readonly children: number[];
  /** Content layer this node is classified into. */
  readonly contentLayer: ContentLayer;
  /** Page on which this node starts (1-indexed). */
  readonly page?: number;
  /** Page on which this node ends (for multi-page tables/sections). */
  readonly pageEnd?: number;
  /** Bounding box, in document coordinates. */
  readonly bbox?: BoundingBox;
  /**
   * Inline annotations (formatting, links) applying to this node's text content.
   *
   * Meaningful only for nodes that carry text; empty for containers.
   */
  readonly annotations: TextAnnotation[];
  /**
   * Format-specific key-value attributes.
   *
   * An extensible bag for miscellaneous data that has no dedicated typed
   * field: CSS classes, LaTeX environment names, Excel cell formulas, slide
   * layout names, etc.
   */
  readonly attributes?: Record<string, string>;
}
|
||
|
||
/** A resolved relationship linking two nodes of the document tree. */
export interface DocumentRelationship {
  /** Index of the source node (the one that references). */
  readonly source: number;
  /** Index of the target node (the one being referenced). */
  readonly target: number;
  /** Semantic kind of this relationship. */
  readonly kind: RelationshipKind;
}
|
||
|
||
/**
 * One tracked change embedded in a document.
 *
 * Filled in by per-format extractors that understand change-tracking metadata
 * (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Until a
 * format-specific implementation exists, an extractor defaults to
 * `ExtractionResult.revisions = None`.
 */
export interface DocumentRevision {
  /**
   * Revision identifier, specific to the source format.
   *
   * For DOCX it is the value of the `w:id` attribute on the change element
   * (e.g. `"42"`). If that attribute is missing, a synthetic fallback is
   * generated (`"docx-ins-0"`, `"docx-del-3"`, …).
   */
  readonly revisionId: string;
  /** Display name of the author of this change, when available. */
  readonly author?: string;
  /**
   * ISO-8601 timestamp of the change, when available.
   *
   * Kept as a plain string so the type stays FFI-friendly and is always
   * available without the optional `chrono` dependency. DOCX fills it from the
   * `w:date` attribute (e.g. `"2024-03-15T10:30:00Z"`).
   */
  readonly timestamp?: string;
  /** Semantic kind of this revision. */
  readonly kind: RevisionKind;
  /**
   * Best-effort location of this revision within the document.
   *
   * How it is resolved depends on the format; it may be `None` when the
   * location cannot be determined (e.g. changes inside table cells before
   * table-cell anchor support is added).
   */
  readonly anchor?: RevisionAnchor;
  /** The content changes this revision consists of. */
  readonly delta: RevisionDelta;
}
|
||
|
||
/**
 * Top-level structured representation of a document.
 *
 * The tree is stored as a flat array of nodes that reference their parent and
 * children by index. Root-level nodes have `parent: None`. `body_roots()` and
 * `furniture_roots()` iterate over the top-level content by layer.
 *
 * ## Validation
 *
 * Calling `validate()` after construction verifies that all node indices are
 * in bounds and that parent-child relationships are bidirectionally
 * consistent.
 */
export interface DocumentStructure {
  /** Every node, in document/reading order. */
  readonly nodes?: DocumentNode[];
  /**
   * Identifier of the origin format (e.g. "docx", "pptx", "html", "pdf").
   *
   * Lets renderers apply format-aware heuristics when converting the document
   * tree to output formats.
   */
  readonly sourceFormat?: string;
  /**
   * Resolved relationships between nodes (footnote refs, citations, anchor
   * links, etc.).
   *
   * Filled in during derivation from the internal document representation;
   * empty when no relationships are detected.
   */
  readonly relationships?: DocumentRelationship[];
  /**
   * Sorted, deduplicated names of the node types present in this document.
   *
   * Each value is the snake_case `node_type` tag of the corresponding
   * [`NodeContent`] variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
   *
   * Computed from [`nodes`] via [`DocumentStructure::finalize_node_types`].
   * Stays empty until that method is called (internal construction paths call
   * it at the end of derivation).
   */
  readonly nodeTypes?: string[];
}
|
||
|
||
/**
 * Application properties of a DOCX file, read from `docProps/app.xml`.
 *
 * Holds Word-specific document statistics and metadata.
 */
export interface DocxAppProperties {
  /** Name of the application (e.g., "Microsoft Office Word"). */
  readonly application?: string;
  /** Version of the application. */
  readonly appVersion?: string;
  /** Filename of the template. */
  readonly template?: string;
  /** Total editing time, in minutes. */
  readonly totalTime?: number;
  /** Page count. */
  readonly pages?: number;
  /** Word count. */
  readonly words?: number;
  /** Character count, excluding spaces. */
  readonly characters?: number;
  /** Character count, including spaces. */
  readonly charactersWithSpaces?: number;
  /** Line count. */
  readonly lines?: number;
  /** Paragraph count. */
  readonly paragraphs?: number;
  /** Name of the company. */
  readonly company?: string;
  /** Security level of the document. */
  readonly docSecurity?: number;
  /** Scale crop flag. */
  readonly scaleCrop?: boolean;
  /** Links up to date flag. */
  readonly linksUpToDate?: boolean;
  /** Shared document flag. */
  readonly sharedDoc?: boolean;
  /** Hyperlinks changed flag. */
  readonly hyperlinksChanged?: boolean;
}
|
||
|
||
/**
 * Metadata of a Word document.
 *
 * Extracted from DOCX files through the shared Office Open XML metadata
 * extraction; integrates with the `office_metadata` module for
 * core/app/custom properties.
 */
export interface DocxMetadata {
  /**
   * Core properties from `docProps/core.xml` (Dublin Core metadata).
   *
   * Holds title, creator, subject, keywords, dates, etc. The format is shared
   * across DOCX/PPTX/XLSX documents.
   */
  readonly coreProperties?: CoreProperties;
  /**
   * Application properties from `docProps/app.xml` (Word-specific statistics).
   *
   * Holds word count, page count, paragraph count, editing time, etc. This is
   * the DOCX-specific variant of the Office application properties.
   */
  readonly appProperties?: DocxAppProperties;
  /**
   * Custom properties from `docProps/custom.xml` (user-defined properties).
   *
   * Holds key-value pairs defined by users or applications. Values can be
   * strings, numbers, booleans, or dates.
   */
  readonly customProperties?: Record<string, JsonValue>;
}
|
||
|
||
/**
 * A semantic element extracted from a document.
 *
 * Stands for one logical unit of content, with a semantic classification, a
 * unique identifier, and metadata for tracking its origin and position.
 */
export interface Element {
  /** Unique identifier of the element. */
  readonly elementId: string;
  /** Semantic type of the element. */
  readonly elementType: ElementType;
  /** Text content of the element. */
  readonly text: string;
  /** Metadata about the element. */
  readonly metadata: ElementMetadata;
}
|
||
|
||
/** Metadata attached to a semantic element. */
export interface ElementMetadata {
  /** Page number (1-indexed). */
  readonly pageNumber?: number;
  /** Source filename or document name. */
  readonly filename?: string;
  /** Bounding box coordinates, if available. */
  readonly coordinates?: BoundingBox;
  /** Position index within the element sequence. */
  readonly elementIndex?: number;
  /** Additional custom metadata. */
  readonly additional: Record<string, string>;
}
|
||
|
||
/**
 * Semantic classification of an element.
 *
 * Sorts text content into semantic units for downstream processing, covering
 * the element types commonly found in Unstructured documents.
 */
export declare enum ElementType {
  /** Title of the document. */
  Title = 'title',
  /** Body of main narrative text. */
  NarrativeText = 'narrative_text',
  /** Heading of a section. */
  Heading = 'heading',
  /** Item of a list (bullet, numbered, etc.). */
  ListItem = 'list_item',
  /** A table. */
  Table = 'table',
  /** An image. */
  Image = 'image',
  /** Marker for a page break. */
  PageBreak = 'page_break',
  /** A block of code. */
  CodeBlock = 'code_block',
  /** A block quote. */
  BlockQuote = 'block_quote',
  /** Footer text. */
  Footer = 'footer',
  /** Header text. */
  Header = 'header',
}
|
||
|
||
/**
 * Representation of an email attachment.
 *
 * Holds the attachment's metadata and, optionally, its content.
 */
export interface EmailAttachment {
  /** Name of the attachment (from the Content-Disposition header). */
  readonly name?: string;
  /** Filename of the attachment. */
  readonly filename?: string;
  /** MIME type of the attachment. */
  readonly mimeType?: string;
  /** Size, in bytes. */
  readonly size?: number;
  /** Whether the attachment is an image. */
  readonly isImage: boolean;
  /**
   * The attachment's data (if extracted).
   * Uses `bytes::Bytes` on the Rust side for cheap cloning of large buffers.
   */
  readonly data?: Uint8Array;
}
|
||
|
||
/** Settings for email extraction. */
export interface EmailConfig {
  /**
   * Windows codepage number applied when an MSG file carries no codepage
   * property. Defaults to `None`, which falls back to windows-1252.
   *
   * Supplying an unrecognized or invalid codepage number (including 0)
   * silently falls back to windows-1252 as well - the same behavior as when
   * the MSG file itself contains an unrecognized codepage. No error or warning
   * is emitted, so verify the output when supplying unusual values.
   *
   * Common values:
   * - 1250: Central European (Polish, Czech, Hungarian, etc.)
   * - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
   * - 1252: Western European (default)
   * - 1253: Greek
   * - 1254: Turkish
   * - 1255: Hebrew
   * - 1256: Arabic
   * - 932: Japanese (Shift-JIS)
   * - 936: Simplified Chinese (GBK)
   */
  readonly msgFallbackCodepage?: number;
}
|
||
|
||
/**
 * Result of extracting an email.
 *
 * A complete representation of an extracted email message (.eml or .msg),
 * including headers, body content, and attachments.
 */
export interface EmailExtractionResult {
  /** Subject line of the email. */
  readonly subject?: string;
  /** Email address of the sender. */
  readonly fromEmail?: string;
  /** Email addresses of the primary recipients. */
  readonly toEmails: string[];
  /** Email addresses of the CC recipients. */
  readonly ccEmails: string[];
  /** Email addresses of the BCC recipients. */
  readonly bccEmails: string[];
  /** Date/timestamp of the email. */
  readonly date?: string;
  /** Value of the Message-ID header. */
  readonly messageId?: string;
  /** Plain-text version of the email body. */
  readonly plainText?: string;
  /** HTML version of the email body. */
  readonly htmlContent?: string;
  /** Cleaned/processed text content. Aliased as `cleaned_text` for back-compat. */
  readonly content: string;
  /** The email's attachments. */
  readonly attachments: EmailAttachment[];
  /** Additional email headers and metadata. */
  readonly metadata: Record<string, string>;
}
|
||
|
||
/**
 * Email metadata extracted from .eml and .msg files.
 *
 * Covers sender/recipient information, the message ID, and the attachment
 * list.
 */
export interface EmailMetadata {
  /** Email address of the sender. */
  readonly fromEmail?: string;
  /** Display name of the sender. */
  readonly fromName?: string;
  /** Primary recipients. */
  readonly toEmails?: string[];
  /** CC recipients. */
  readonly ccEmails?: string[];
  /** BCC recipients. */
  readonly bccEmails?: string[];
  /** Value of the Message-ID header. */
  readonly messageId?: string;
  /** Filenames of the attachments. */
  readonly attachments?: string[];
}
|
||
|
||
/** Differences in the embedded archive children of two results. */
export interface EmbeddedChanges {
  /** Children found in `b` but missing from `a` (matched by `path`). */
  readonly added: ArchiveEntry[];
  /** Children found in `a` but missing from `b` (matched by `path`). */
  readonly removed: ArchiveEntry[];
  /**
   * Children found in both whose content differs (matched by `path`).
   *
   * Every entry carries the diff of the nested `ExtractionResult`.
   */
  readonly changed: EmbeddedDiff[];
}
|
||
|
||
/** Diff of one embedded archive entry that is present in both results. */
export interface EmbeddedDiff {
  /** Archive-relative path that identifies the entry. */
  readonly path: string;
  /** Recursive diff of the entry's extraction result. */
  readonly diff: ExtractionDiff;
}
|
||
|
||
/** Descriptor of an embedded file taken from the PDF name tree. */
export interface EmbeddedFile {
  /** Filename, as stored in the PDF name tree. */
  readonly name: string;
  /** Raw bytes of the file from the embedded stream (already decompressed by lopdf). */
  readonly data: Uint8Array;
  /**
   * Byte count of the original stream while still compressed (before
   * decompression).
   *
   * Callers use it to compute the decompression ratio and to detect
   * zip-bomb-style attacks that embed a tiny compressed stream expanding to
   * gigabytes of data.
   */
  readonly compressedSize: number;
  /** MIME type when the filespec specifies one, otherwise `None`. */
  readonly mimeType?: string;
}
|
||
|
||
/**
 * Plugin contract for in-process embedding backends.
 *
 * The contract is async, matching the convention of `OcrBackend`,
 * `DocumentExtractor`, and `PostProcessor`. Host-language bridges (PyO3,
 * napi-rs, Rustler, extendr, magnus, ext-php-rs, C FFI, etc.) wrap their
 * synchronous host callables in `spawn_blocking` or an equivalent to satisfy
 * the async signature.
 *
 * ## Thread safety
 *
 * Backends must be `Send + Sync + 'static`. They are held in
 * `Arc<dyn EmbeddingBackend>` and invoked concurrently from kreuzberg's
 * chunking pipeline. A backend whose underlying model is not thread-safe has
 * to serialize access internally (e.g. via `Mutex<Inner>`).
 *
 * ## Contract
 *
 * - `embed(texts)` MUST return exactly `texts.len()` vectors, each of length
 *   `self.dimensions()`. The dispatcher in `embed_texts` validates this before
 *   handing results to downstream consumers; a non-conforming backend surfaces
 *   as a `KreuzbergError::Validation`, not a panic.
 * - `embed` may be called from any thread. Its future must be `Send`
 *   (enforced by `async_trait` when `#[async_trait]` is used on non-WASM
 *   targets).
 * - `dimensions()` is called exactly once at registration, right after
 *   `initialize()` succeeds. The registry caches the returned value and uses
 *   it for all later shape validation. Lazy-loading implementations can defer
 *   model loading into `initialize()` and report the real dimension
 *   afterwards. Later changes to the reported dimension are not observed by
 *   kreuzberg; implementations that need to change it must unregister and
 *   re-register.
 * - `shutdown()` (inherited from `Plugin`) may run concurrently with an
 *   in-flight `embed()` call. Implementations must tolerate that, e.g. by
 *   letting in-flight calls finish with the resources held through the
 *   `Arc<dyn EmbeddingBackend>` reference and releasing only shared state that
 *   `embed` does not need.
 *
 * ## Runtime
 *
 * The synchronous `embed_texts` entry awaits the async `embed` through
 * [`tokio::task::block_in_place`], which requires a multi-thread tokio
 * runtime. Callers inside a `current_thread` runtime (e.g. `#[tokio::test]`
 * without `flavor = "multi_thread"`, or
 * `tokio::runtime::Builder::new_current_thread()`) must use
 * `embed_texts_async` instead, which awaits directly without
 * `block_in_place`.
 */
export interface EmbeddingBackend {
  name(): string;

  /**
   * Dimension of the embedding vectors. Must be `> 0` and must equal the
   * length of every vector returned by `embed`.
   *
   * NOTE(review): described as a numeric dimension but declared as `string` -
   * confirm the expected encoding against the binding.
   */
  dimensions(): string;

  /**
   * Embeds a batch of texts, producing one vector per input, in input order.
   *
   * NOTE(review): declared to resolve to `string` although the contract
   * describes a list of vectors; presumably a serialized form - confirm
   * against the binding.
   *
   * @throws Implementations should return `Plugin` for backend-specific
   * failures. The dispatcher adds its own validation (length, per-vector
   * dimension) on top.
   */
  embed(texts: string[]): Promise<string>;
}
|
||
|
||
/**
 * Settings for generating embeddings of text chunks.
 *
 * Embeddings are generated with ONNX models through the vendored embedding
 * engine. Requires the `embeddings` feature to be enabled.
 */
export interface EmbeddingConfig {
  /** Embedding model to use (the "balanced" preset when not specified). */
  readonly model?: EmbeddingModelType;
  /** Whether embedding vectors are normalized (recommended for cosine similarity). */
  readonly normalize?: boolean;
  /** Batch size used for embedding generation. */
  readonly batchSize?: number;
  /** Whether model download progress is shown. */
  readonly showDownloadProgress?: boolean;
  /**
   * Custom cache directory for model files.
   *
   * Falls back to `~/.cache/kreuzberg/embeddings/` when not specified; allows
   * full customization of the model download location.
   */
  readonly cacheDir?: string;
  /**
   * Hardware acceleration for the embedding ONNX model.
   *
   * When set, it controls which execution provider (CPU, CUDA, CoreML,
   * TensorRT) runs inference. Defaults to `None` (auto-select per platform).
   */
  readonly acceleration?: AccelerationConfig;
  /**
   * Maximum wall-clock duration (in seconds) of a single `embed()` call when
   * using [`EmbeddingModelType::Plugin`].
   *
   * Only the in-process plugin path is affected. It guards against hung
   * host-language backends (e.g. a Python callback deadlocked on the GIL, a
   * model stuck on CUDA OOM retries, etc.). On timeout the dispatcher returns
   * `Plugin` instead of blocking forever.
   *
   * `None` disables the timeout. The default (60 seconds) is conservative for
   * common in-process inference; raise it for large batches on slow hardware.
   */
  readonly maxEmbedDurationSecs?: number;
}
|
||
|
||
/** The kinds of embedding model Kreuzberg supports. */
export type EmbeddingModelType =
  | { type: 'preset'; name: string }
  | { type: 'custom'; modelId: string; dimensions: number }
  | { type: 'llm'; llm: LlmConfig }
  | { type: 'plugin'; name: string };
|
||
|
||
/**
 * Preset configuration for a common RAG use case.
 *
 * A preset bundles chunk size, overlap, and embedding model into a
 * configuration optimized for a specific scenario.
 *
 * On the Rust side every string field is an owned `String` for FFI
 * compatibility, so instances are safe to clone and pass across language
 * boundaries.
 */
export interface EmbeddingPreset {
  readonly name: string;
  readonly chunkSize: number;
  readonly overlap: number;
  /** Name of the HuggingFace repository hosting the model. */
  readonly modelRepo: string;
  /** Pooling strategy: "cls" or "mean". */
  readonly pooling: string;
  /** Path of the ONNX model file inside the repo. */
  readonly modelFile: string;
  readonly dimensions: number;
  readonly description: string;
}
|
||
|
||
/** Metadata of an EPUB file (Dublin Core extensions). */
export interface EpubMetadata {
  readonly coverage?: string;
  readonly dcFormat?: string;
  readonly relation?: string;
  readonly source?: string;
  readonly dcType?: string;
  readonly coverImage?: string;
}
|
||
|
||
/** Metadata describing an error (used for batch operations). */
export interface ErrorMetadata {
  readonly errorType: string;
  readonly message: string;
}
|
||
|
||
/**
 * Format metadata for Excel/spreadsheet documents.
 *
 * The `FormatMetadata::Excel` discriminant marks the document as coming from a
 * spreadsheet source; sheet count and sheet names are stored in this struct.
 */
export interface ExcelMetadata {
  /** How many sheets the workbook contains. */
  readonly sheetCount?: number;
  /** Names of every sheet in the workbook. */
  readonly sheetNames?: string[];
}
|
||
|
||
/**
 * A single Excel worksheet.
 *
 * Stands for one sheet of an Excel workbook, with its content converted to
 * Markdown format plus dimensional statistics.
 */
export interface ExcelSheet {
  /** Name of the sheet as shown in Excel. */
  readonly name: string;
  /** Content of the sheet, converted to Markdown tables. */
  readonly markdown: string;
  /** Row count. */
  readonly rowCount: number;
  /** Column count. */
  readonly colCount: number;
  /** Total count of non-empty cells. */
  readonly cellCount: number;
  /**
   * Pre-extracted table cells (a 2D vector of cell values).
   * Filled in during markdown generation so the markdown need not be
   * re-parsed. `None` for empty sheets.
   */
  readonly tableCells?: string[][];
}
|
||
|
||
/**
 * Representation of an Excel workbook.
 *
 * Holds every sheet of an Excel file (.xlsx, .xls, etc.) together with the
 * extracted content and metadata.
 */
export interface ExcelWorkbook {
  /** Every sheet of the workbook. */
  readonly sheets: ExcelSheet[];
  /** Workbook-level metadata (author, creation date, etc.). */
  readonly metadata: Record<string, string>;
  /**
   * Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
   *
   * Filled in for legacy shared-workbook `.xlsx` files that contain the
   * `xl/revisions/` directory. Every `<header>` element maps to one
   * `DocumentRevision { kind: FormatChange }` that carries the header's `guid`
   * (as `revision_id`), `userName` (as `author`), and `dateTime` (as
   * `timestamp`). `anchor` and `delta` are `None`/empty for v1 (per-cell log
   * parsing is a follow-up). `None` when `xl/revisions/revisionHeaders.xml` is
   * absent.
   */
  readonly revisions?: DocumentRevision[];
}
|
||
|
||
/**
 * Type of ONNX Runtime execution provider.
 *
 * Decides which hardware backend runs model inference. `Auto` (the default)
 * picks the best provider available on the platform.
 */
export declare enum ExecutionProviderType {
  /** Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere. */
  Auto = 'auto',
  /** CPU execution provider (always available). */
  Cpu = 'cpu',
  /** Apple CoreML (macOS/iOS Neural Engine + GPU). */
  CoreMl = 'coreml',
  /** NVIDIA CUDA GPU acceleration. */
  Cuda = 'cuda',
  /** NVIDIA TensorRT (optimized CUDA inference). */
  TensorRt = 'tensorrt',
}
|
||
|
||
/**
 * An image pulled out of a document.
 *
 * Carries the raw image data, its metadata, and optionally a nested OCR
 * result. Keeping raw bytes makes the type usable across languages: callers
 * can convert to PIL.Image (Python), Sharp (Node.js), or any other format.
 */
export interface ExtractedImage {
  /**
   * Raw bytes of the image (PNG, JPEG, WebP, etc.).
   * On the Rust side this is a `bytes::Bytes`, so large buffers clone cheaply.
   */
  readonly data: Uint8Array;

  /**
   * Format of the image (e.g., "jpeg", "png", "webp").
   * On the Rust side this is a `Cow<'static, str>`, avoiding allocation for
   * static literals.
   */
  readonly format: string;

  /** Position of this image within the document/page, counted from zero. */
  readonly imageIndex: number;

  /** Page or slide number on which the image was found (1-indexed). */
  readonly pageNumber?: number;

  /** Width of the image in pixels. */
  readonly width?: number;

  /** Height of the image in pixels. */
  readonly height?: number;

  /** Colorspace information (e.g., "RGB", "CMYK", "Gray"). */
  readonly colorspace?: string;

  /** Number of bits per color component (e.g., 8, 16). */
  readonly bitsPerComponent?: number;

  /** True when this image is a mask image. */
  readonly isMask: boolean;

  /** Description of the image, if one is available. */
  readonly description?: string;

  /**
   * Nested OCR extraction result, present if the image was OCRed.
   *
   * The OCR result is embedded directly on the image instead of living in a
   * separate collection, which makes the relationship explicit.
   */
  readonly ocrResult?: ExtractionResult;

  /**
   * Where the image sits on the page, as a bounding box in PDF coordinates
   * (x0=left, y0=bottom, x1=right, y1=top).
   * Populated only for PDF-extracted images, and only when the PDF extractor
   * provides position data.
   */
  readonly boundingBox?: BoundingBox;

  /**
   * Path of the original image inside the document archive
   * (e.g., "media/image1.png" in DOCX).
   * Used to render image references when the binary data is not extracted.
   */
  readonly sourcePath?: string;

  /**
   * Heuristic guess at what the image depicts.
   * Absent when classification was disabled or inconclusive.
   */
  readonly imageKind?: ImageKind;

  /** Confidence of `image_kind`, between 0.0 and 1.0. */
  readonly kindConfidence?: number;

  /**
   * Identifier shared by all images that make up one logical figure
   * (e.g. every raster tile of a single technical drawing).
   * Absent for singletons.
   */
  readonly clusterId?: number;
}
|
||
|
||
/**
 * A URI found in a document.
 *
 * Covers any link, reference, or resource pointer encountered during
 * extraction. `kind` gives the semantic classification of the URI; `label`
 * holds optional human-readable display text.
 */
export interface ExtractedUri {
  /** The URL or path, as a string. */
  readonly url: string;

  /** Display text / label of the link, if any. */
  readonly label?: string;

  /** Page number on which the URI was found (1-indexed), if known. */
  readonly page?: number;

  /** How the URI is classified semantically. */
  readonly kind: UriKind;
}
|
||
|
||
/**
 * Top-level extraction configuration.
 *
 * Gathers every option that controls the extraction process. It can be
 * loaded from TOML, YAML, or JSON files, or built programmatically.
 *
 * The underlying docs are generated from Rust, so `None` below means the
 * property is left unset.
 */
export interface ExtractionConfig {
  /** Turn on caching of extraction results. */
  readonly useCache?: boolean;

  /** Turn on quality post-processing. */
  readonly enableQualityProcessing?: boolean;

  /** OCR configuration (None = OCR disabled). */
  readonly ocr?: OcrConfig;

  /** Run OCR even when a PDF is already searchable. */
  readonly forceOcr?: boolean;

  /**
   * Run OCR on selected pages only (1-indexed page numbers, each >= 1).
   *
   * When provided, exactly the listed pages are OCR'd no matter how good
   * their text layer is; all other pages use native text extraction.
   * Ignored when `force_ocr` is `true`. Applies to PDF documents only.
   * Duplicate entries are deduplicated automatically. Supplying an `ocr`
   * config is recommended for backend/language selection; defaults are used
   * if it is absent.
   */
  readonly forceOcrPages?: number[];

  /**
   * Switch OCR off completely, including for images.
   *
   * When `true`, OCR is skipped for every document type. Images yield
   * metadata only (dimensions, format, EXIF) with no text extraction, and
   * PDFs rely solely on native text extraction with no OCR fallback.
   *
   * Must not be `true` at the same time as `force_ocr`.
   *
   * *Added in v4.7.0.*
   */
  readonly disableOcr?: boolean;

  /** Text chunking configuration (None = chunking disabled). */
  readonly chunking?: ChunkingConfig;

  /**
   * Content filtering configuration (None = use extractor defaults).
   *
   * Decides whether document "furniture" (headers, footers, watermarks,
   * repeating text) is kept in or stripped from extraction results.
   * See [`ContentFilterConfig`] for per-field documentation.
   */
  readonly contentFilter?: ContentFilterConfig;

  /** Image extraction configuration (None = no image extraction). */
  readonly images?: ImageExtractionConfig;

  /** Options specific to PDFs (None = use defaults). */
  readonly pdfOptions?: PdfConfig;

  /** Token reduction configuration (None = no token reduction). */
  readonly tokenReduction?: TokenReductionOptions;

  /** Language detection configuration (None = no language detection). */
  readonly languageDetection?: LanguageDetectionConfig;

  /** Page extraction configuration (None = no page tracking). */
  readonly pages?: PageConfig;

  /** Keyword extraction configuration (None = no keyword extraction). */
  readonly keywords?: KeywordConfig;

  /** Post-processor configuration (None = use defaults). */
  readonly postprocessor?: PostProcessorConfig;

  /**
   * Options for HTML to Markdown conversion (None = use defaults).
   *
   * Configures how HTML documents are turned into Markdown, including
   * heading styles, list formatting, code block styles, and preprocessing
   * options.
   *
   * NOTE(review): this binding exposes the options as a `string`; presumably
   * a serialized form of the options — confirm the expected encoding.
   */
  readonly htmlOptions?: string;

  /**
   * Configuration for styled HTML output.
   *
   * When set together with `output_format = OutputFormat::Html`, the
   * extraction pipeline uses
   * [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer), which emits
   * stable `kb-*` CSS class hooks on every structural element and can
   * optionally embed theme CSS or user-supplied CSS in a `<style>` block.
   *
   * When `None`, the existing plain comrak-based HTML renderer is used.
   */
  readonly htmlOutput?: HtmlOutputConfig;

  /**
   * Default timeout per file, in seconds, for batch extraction.
   *
   * When set, every file in a batch is canceled once this duration elapses,
   * unless [`FileExtractionConfig::timeout_secs`] overrides it.
   *
   * The default is `Some(60)`, which keeps pathological files (e.g. deeply
   * nested archives, documents with millions of cells) from running
   * indefinitely and exhausting caller resources. Use `None` to disable the
   * timeout for trusted input or long-running workloads.
   */
  readonly extractionTimeoutSecs?: number;

  /**
   * Upper bound on concurrent extractions in batch operations
   * (None = (num_cpus × 1.5).ceil()).
   *
   * Caps parallelism so that large batches do not exhaust resources.
   * Falls back to (num_cpus × 1.5).ceil() when not set.
   */
  readonly maxConcurrentExtractions?: number;

  /**
   * Structure of the returned result.
   *
   * Chooses between the unified format (default), where all content lives in
   * the `content` field, and the element-based format with semantic elements
   * (for Unstructured-compatible output).
   */
  readonly resultFormat?: ResultFormat;

  /**
   * Security limits applied to archive extraction.
   *
   * Sets the maximum archive size, compression ratio, file count, and other
   * security thresholds that guard against decompression bomb attacks. It
   * also caps nesting depth, iteration count, entity / token length, total
   * content size, and table cell count for every extraction path that
   * ingests user-controlled bytes.
   * When `None`, default limits are used.
   */
  readonly securityLimits?: SecurityLimits;

  /**
   * Largest uncompressed size, in bytes, that a single embedded file may
   * have before recursive extraction is attempted (default: 50 MiB).
   *
   * Covers embedded objects inside OOXML containers (DOCX, PPTX) and email
   * attachments handled via recursive extraction. Files above the limit are
   * skipped with a `ProcessingWarning` instead of being handed to the
   * extraction pipeline, so one oversized embedded object cannot consume
   * unbounded memory or time.
   *
   * Use `None` to disable the per-embedded-file cap (leaving
   * `security_limits.max_archive_size` as the only guard).
   */
  readonly maxEmbeddedFileBytes?: number;

  /**
   * Text format of the content (default: Plain).
   *
   * Selects the format of the extracted content:
   * - `Plain`: Raw extracted text (default)
   * - `Markdown`: Markdown formatted output
   * - `Djot`: Djot markup format (requires djot feature)
   * - `Html`: HTML formatted output
   *
   * With a structured format selected, extraction results will include
   * formatted output. The `formatted_content` field may be populated when
   * format conversion is applied.
   */
  readonly outputFormat?: OutputFormat;

  /**
   * Layout detection configuration (None = layout detection disabled).
   *
   * When set, PDF pages and images are analyzed for document structure
   * (headings, code, formulas, tables, figures, etc.) using RT-DETR models
   * via ONNX Runtime. For PDFs, layout hints override paragraph
   * classification in the markdown pipeline. For images, per-region OCR is
   * performed with markdown formatting based on the detected layout classes.
   * Running inference requires the `layout-detection` feature; the field
   * itself is present whenever the `layout-types` feature is active (which
   * includes `layout-detection` as well as the no-ORT target groups).
   */
  readonly layout?: LayoutDetectionConfig;

  /**
   * Apply layout detection on the non-OCR PDF markdown path.
   *
   * When `true` and `layout` is `Some(_)`, layout regions feed heading,
   * table, list, and figure detection in the structure pipeline, which would
   * otherwise depend on font-clustering heuristics alone. This significantly
   * improves SF1 (structural F1) at the cost of inference latency
   * (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
   * Requires the `layout-detection` feature.
   */
  readonly useLayoutForMarkdown?: boolean;

  /**
   * Turn on structured document tree output.
   *
   * When true, the `document` field on `ExtractionResult` is populated with
   * a hierarchical `DocumentStructure` containing heading-driven section
   * nesting, table grids, content layer classification, and inline
   * annotations.
   *
   * Independent of `result_format` — can be combined with Unified or
   * ElementBased.
   */
  readonly includeDocumentStructure?: boolean;

  /**
   * Hardware acceleration settings for ONNX Runtime models.
   *
   * Governs execution provider selection for layout detection and embedding
   * models. When `None`, platform defaults apply (CoreML on macOS, CUDA on
   * Linux, CPU on Windows).
   */
  readonly acceleration?: AccelerationConfig;

  /**
   * Cache namespace used for tenant isolation.
   *
   * When set, cache entries are stored under `{cache_dir}/{namespace}/`.
   * Only alphanumeric characters, hyphens, and underscores are allowed
   * (max 64 chars). Different namespaces get isolated cache spaces on the
   * same filesystem.
   */
  readonly cacheNamespace?: string;

  /**
   * Cache TTL for this request, in seconds.
   *
   * Takes precedence over the global `max_age_days` for this specific
   * extraction. When `0`, caching is completely skipped (no read or write).
   * When `None`, the global TTL applies.
   */
  readonly cacheTtlSecs?: number;

  /**
   * Email extraction configuration (None = use defaults).
   *
   * At present this supports configuring the fallback codepage for MSG
   * files that do not specify one. See `EmailConfig` for details.
   */
  readonly email?: EmailConfig;

  /**
   * Concurrency limits for constrained environments (None = use defaults).
   *
   * Governs the Rayon thread pool size, ONNX Runtime intra-op threads, and
   * (when `max_concurrent_extractions` is unset) the batch concurrency
   * semaphore. See `ConcurrencyConfig` for details.
   *
   * NOTE(review): this binding exposes the value as a `string`; presumably a
   * serialized `ConcurrencyConfig` — confirm the expected encoding.
   */
  readonly concurrency?: string;

  /**
   * Deepest recursion level allowed for archive extraction (default: 3).
   * Use 0 to disable recursive extraction (legacy behavior).
   */
  readonly maxArchiveDepth?: number;

  /**
   * Tree-sitter language pack configuration (None = tree-sitter disabled).
   *
   * When set, code file extraction using tree-sitter parsers is enabled.
   * Also controls grammar download behavior and code analysis options.
   */
  readonly treeSitter?: TreeSitterConfig;

  /**
   * LLM-driven structured extraction (None = disabled).
   *
   * When set, the extracted document content is sent to an LLM along with
   * the provided JSON schema. The structured response is stored in
   * `ExtractionResult::structured_output`.
   */
  readonly structuredExtraction?: StructuredExtractionConfig;

  /**
   * Cancellation token for this extraction (None = no external cancellation).
   *
   * Pass a [`CancellationToken`] clone here and call
   * [`CancellationToken::cancel`] from another thread / task to abort the
   * extraction in progress. The extractor checks the token at safe
   * checkpoints (before lock acquisition, between pages, between batch
   * items) and returns [`KreuzbergError::Cancelled`] when set.
   *
   * The field is excluded from serialization because `CancellationToken` is
   * a runtime handle, not a configuration value.
   *
   * NOTE(review): this binding types the token as a `string`, which does not
   * obviously match a runtime handle — confirm how it is meant to be passed.
   */
  readonly cancelToken?: string;
}
|
||
|
||
/** Full set of differences between two `ExtractionResult` values. */
export interface ExtractionDiff {
  /**
   * Hunks, in unified-diff form, for the `content` field.
   *
   * The list is empty when the content is identical.
   */
  readonly contentDiff: DiffHunk[];

  /** Tables that exist in `b` but not in `a` (by index position, excess right-side tables). */
  readonly tablesAdded: Table[];

  /** Tables that exist in `a` but not in `b` (by index position, excess left-side tables). */
  readonly tablesRemoved: Table[];

  /** Cell-level changes for pairs of tables sharing the same index and dimensions. */
  readonly tablesChanged: TableDiff[];

  /**
   * Difference in metadata, encoded as a JSON object with three top-level
   * keys: `added` (keys present in `b` but not `a`), `removed` (keys present
   * in `a` but not `b`), and `changed` (keys whose values differ — each
   * entry is `{ "from": <value-in-a>, "to": <value-in-b> }`).
   *
   * This is NOT RFC 6902 JSON Patch — a flatter shape was chosen
   * deliberately to avoid pulling in a json-patch crate. If you need
   * RFC 6902 semantics (with JSON Pointer paths), feed `a.metadata` and
   * `b.metadata` to your preferred json-patch impl directly.
   */
  readonly metadataChanged: JsonValue;

  /** Changes affecting embedded archive children. */
  readonly embeddedChanges: EmbeddedChanges;
}
|
||
|
||
/** Describes the way the extracted text was produced. */
export declare enum ExtractionMethod {
  /** Native text extraction. */
  Native = "native",

  /** OCR-only extraction. */
  Ocr = "ocr",

  /** Mixed native/OCR output. */
  Mixed = "mixed",
}
|
||
|
||
/**
 * The general extraction result of the core extraction API.
 *
 * Every extraction function returns this type as its main result.
 *
 * The underlying docs are generated from Rust, so `None` below means the
 * property is absent.
 */
export interface ExtractionResult {
  readonly content?: string;

  readonly mimeType?: string;

  readonly metadata?: Metadata;

  /**
   * Strategy that produced the returned text.
   *
   * Set when the extractor can reliably tell apart native text extraction,
   * OCR-only extraction, and mixed native/OCR output.
   */
  readonly extractionMethod?: ExtractionMethod;

  readonly tables?: Table[];

  readonly detectedLanguages?: string[];

  /**
   * Chunks of text, available when chunking is enabled.
   *
   * When a chunking configuration is provided, the content is split into
   * overlapping chunks for efficient processing. Every chunk holds its text,
   * optional embeddings (if enabled), and metadata about its position.
   */
  readonly chunks?: Chunk[];

  /**
   * Images extracted from the document.
   *
   * When image extraction is enabled via `ImageExtractionConfig`, this field
   * lists all images found in the document with their raw data and
   * metadata. An image may optionally carry a nested `ocr_result` if OCR was
   * performed on it.
   */
  readonly images?: ExtractedImage[];

  /**
   * Content per page, available when page extraction is enabled.
   *
   * When page extraction is configured, the document is split into per-page
   * content, with tables and images mapped to their respective pages.
   */
  readonly pages?: PageContent[];

  /**
   * Semantic elements, available when the element-based result format is
   * enabled.
   *
   * When result_format is set to ElementBased, this field holds semantic
   * elements with type classification, unique identifiers, and metadata for
   * Unstructured-compatible element-based processing.
   */
  readonly elements?: Element[];

  /**
   * Rich Djot content structure (when extracting Djot documents).
   *
   * When Djot documents are extracted with structured extraction enabled,
   * this field holds the full semantic structure including:
   * - Block-level elements with nesting
   * - Inline formatting with attributes
   * - Links, images, footnotes
   * - Math expressions
   * - Complete attribute information
   *
   * For backward compatibility, the `content` field still contains plain
   * text.
   *
   * Always `None` for non-Djot documents.
   */
  readonly djotContent?: DjotContent;

  /**
   * OCR elements, including full spatial and confidence metadata.
   *
   * When OCR runs with element extraction enabled, this field holds the
   * structured representation of the detected text including:
   * - Bounding geometry (rectangles or quadrilaterals)
   * - Confidence scores (detection and recognition)
   * - Rotation information
   * - Hierarchical relationships (Tesseract only)
   *
   * The field keeps all metadata that would otherwise be lost when
   * converting to plain text or markdown output formats.
   *
   * Only populated when `OcrElementConfig.include_elements` is true.
   */
  readonly ocrElements?: OcrElement[];

  /**
   * Structured document tree (when document structure extraction is enabled).
   *
   * When `include_document_structure` is true in `ExtractionConfig`, this
   * field holds the full hierarchical representation of the document
   * including:
   * - Heading-driven section nesting
   * - Table grids with cell-level metadata
   * - Content layer classification (body, header, footer, footnote)
   * - Inline text annotations (formatting, links)
   * - Bounding boxes and page numbers
   *
   * Independent of `result_format` — can be combined with Unified or
   * ElementBased.
   */
  readonly document?: DocumentStructure;

  /**
   * Keywords extracted when keyword extraction is enabled.
   *
   * When keyword extraction (RAKE or YAKE) is configured, this field holds
   * the extracted keywords with scores, algorithm info, and position data.
   * Previously stored in `metadata.additional["keywords"]`.
   */
  readonly extractedKeywords?: Keyword[];

  /**
   * Quality score of the document, from quality analysis.
   *
   * A value between 0.0 and 1.0 that indicates the overall text quality.
   * Previously stored in `metadata.additional["quality_score"]`.
   */
  readonly qualityScore?: number;

  /**
   * Non-fatal warnings gathered during the processing pipeline stages.
   *
   * Records errors from optional pipeline features (embedding, chunking,
   * language detection, output formatting) that do not prevent extraction
   * but may indicate degraded results.
   * Previously stored as individual keys in `metadata.additional`.
   */
  readonly processingWarnings?: ProcessingWarning[];

  /**
   * Annotations extracted from a PDF document.
   *
   * When annotation extraction is enabled via
   * `PdfConfig::extract_annotations`, this field holds text notes,
   * highlights, links, stamps, and other annotations found in PDF documents.
   */
  readonly annotations?: PdfAnnotation[];

  /**
   * Nested extraction results for the contents of an archive.
   *
   * When archives are extracted, each processable file inside yields its own
   * full extraction result. `None` for non-archive formats.
   * Use `max_archive_depth` in config to control recursion depth.
   */
  readonly children?: ArchiveEntry[];

  /**
   * URIs/links found during document extraction.
   *
   * Holds hyperlinks, image references, citations, email addresses, and
   * other URI-like references found in the document. Always extracted when
   * present in the source document.
   */
  readonly uris?: ExtractedUri[];

  /**
   * Tracked changes embedded in the source document.
   *
   * Filled in by per-format extractors that understand change-tracking
   * metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …).
   * Every extractor defaults to `None` until its format-specific
   * implementation is added. Extractors that do populate this field follow
   * the "accepted-changes" convention: inserted text is present in
   * `content`, deleted text is absent — the revision list is the separate
   * audit trail.
   */
  readonly revisions?: DocumentRevision[];

  /**
   * Output of LLM-based JSON schema extraction.
   *
   * When `structured_extraction` is configured in `ExtractionConfig`, the
   * extracted document content is sent to a VLM with the provided JSON
   * schema. The response is parsed and stored here as a JSON value matching
   * the schema.
   */
  readonly structuredOutput?: JsonValue;

  /**
   * Code intelligence results produced by tree-sitter analysis.
   *
   * Populated when source code files are extracted with the `tree-sitter`
   * feature. Holds metrics, structural analysis, imports/exports, comments,
   * docstrings, symbols, diagnostics, and optionally chunked code segments.
   *
   * Stored as an opaque JSON value so that all language bindings (Go, Java,
   * C#, …) can deserialize it as a raw JSON object rather than a typed
   * struct. The underlying type is
   * `tree_sitter_language_pack::ProcessResult`.
   */
  readonly codeIntelligence?: JsonValue;

  /**
   * Token usage and cost data for every LLM call made during this
   * extraction.
   *
   * Holds one entry per LLM call. Multiple entries are produced when VLM
   * OCR, structured extraction, or LLM embeddings run during the same
   * extraction.
   *
   * `None` when no LLM was used.
   */
  readonly llmUsage?: LlmUsage[];

  /**
   * Content pre-rendered in the requested output format.
   *
   * Populated during `derive_extraction_result`, before tree derivation
   * consumes element data. `apply_output_format` swaps this into `content`
   * at the end of the pipeline, after post-processors have operated on plain
   * text.
   */
  readonly formattedContent?: string;

  /**
   * Structured hOCR document for the OCR+layout pipeline.
   *
   * When tesseract produces hOCR output, the parsed `InternalDocument`
   * carries paragraph structure with bounding boxes and confidence scores.
   * The layout classification step enriches these elements before final
   * rendering.
   */
  readonly ocrInternalDocument?: string;
}
|
||
|
||
/** Metadata for FictionBook (FB2) documents. */
export interface FictionBookMetadata {
  readonly genres?: string[];

  readonly sequences?: string[];

  readonly annotation?: string;
}
|
||
|
||
/**
 * Overrides of the extraction configuration for a single file in a batch.
 *
 * Every field is an `Option<T>` — `None` means "use the batch-level
 * default." The type is used with `batch_extract_files` and
 * `batch_extract_bytes` so that one batch can mix heterogeneous extraction
 * settings.
 *
 * # Excluded Fields
 *
 * These `ExtractionConfig` fields exist at batch level only and cannot be
 * overridden per file:
 * - `max_concurrent_extractions` — controls batch parallelism
 * - `use_cache` — global caching policy
 * - `acceleration` — shared ONNX execution provider
 * - `security_limits` — global archive security policy
 */
export interface FileExtractionConfig {
  /** Per-file override of quality post-processing. */
  readonly enableQualityProcessing?: boolean;

  /** Per-file override of the OCR configuration (None in the Option = use batch default). */
  readonly ocr?: OcrConfig;

  /** Per-file override of force OCR. */
  readonly forceOcr?: boolean;

  /** Per-file override of the force OCR pages (1-indexed page numbers). */
  readonly forceOcrPages?: number[];

  /** Per-file override of disable OCR. */
  readonly disableOcr?: boolean;

  /** Per-file override of the chunking configuration. */
  readonly chunking?: ChunkingConfig;

  /** Per-file override of the content filtering configuration. */
  readonly contentFilter?: ContentFilterConfig;

  /** Per-file override of the image extraction configuration. */
  readonly images?: ImageExtractionConfig;

  /** Per-file override of the PDF options. */
  readonly pdfOptions?: PdfConfig;

  /** Per-file override of token reduction. */
  readonly tokenReduction?: TokenReductionOptions;

  /** Per-file override of language detection. */
  readonly languageDetection?: LanguageDetectionConfig;

  /** Per-file override of page extraction. */
  readonly pages?: PageConfig;

  /** Per-file override of keyword extraction. */
  readonly keywords?: KeywordConfig;

  /** Per-file override of the post-processor. */
  readonly postprocessor?: PostProcessorConfig;

  /** Per-file override of the HTML conversion options. */
  readonly htmlOptions?: string;

  /** Per-file override of the result format. */
  readonly resultFormat?: ResultFormat;

  /** Per-file override of the output content format. */
  readonly outputFormat?: OutputFormat;

  /** Per-file override of document structure output. */
  readonly includeDocumentStructure?: boolean;

  /** Per-file override of layout detection. */
  readonly layout?: LayoutDetectionConfig;

  /**
   * Per-file override of the extraction timeout, in seconds.
   *
   * When set, extraction of this file is canceled after the specified
   * duration. A file that times out produces an error result and does not
   * affect the other files in the batch.
   */
  readonly timeoutSecs?: number;

  /** Per-file override of the tree-sitter configuration. */
  readonly treeSitter?: TreeSitterConfig;

  /**
   * Per-file override of the structured extraction configuration.
   *
   * When set, LLM-based structured extraction with a JSON schema is enabled
   * for this specific file. The extracted content is sent to a VLM/LLM and
   * the response is parsed according to the provided schema.
   */
  readonly structuredExtraction?: StructuredExtractionConfig;
}
|
||
|
||
/** A Djot footnote. */
export interface Footnote {
  /** Label of the footnote. */
  readonly label: string;

  /** Blocks that make up the footnote's content. */
  readonly content: FormattedBlock[];
}
|
||
|
||
/**
 * Metadata specific to one document format (discriminated union).
 *
 * An extraction result can carry only a single format type, which yields
 * type-safe, clean metadata without nested optionals. The `format_type`
 * property is the discriminant; the payload is stored under the key `0`.
 */
export type FormatMetadata =
  | { format_type: 'pdf'; 0: PdfMetadata }
  | { format_type: 'docx'; 0: DocxMetadata }
  | { format_type: 'excel'; 0: ExcelMetadata }
  | { format_type: 'email'; 0: EmailMetadata }
  | { format_type: 'pptx'; 0: PptxMetadata }
  | { format_type: 'archive'; 0: ArchiveMetadata }
  | { format_type: 'image'; 0: ImageMetadata }
  | { format_type: 'xml'; 0: XmlMetadata }
  | { format_type: 'text'; 0: TextMetadata }
  | { format_type: 'html'; 0: HtmlMetadata }
  | { format_type: 'ocr'; 0: OcrMetadata }
  | { format_type: 'csv'; 0: CsvMetadata }
  | { format_type: 'bibtex'; 0: BibtexMetadata }
  | { format_type: 'citation'; 0: CitationMetadata }
  | { format_type: 'fiction_book'; 0: FictionBookMetadata }
  | { format_type: 'dbf'; 0: DbfMetadata }
  | { format_type: 'jats'; 0: JatsMetadata }
  | { format_type: 'epub'; 0: EpubMetadata }
  | { format_type: 'pst'; 0: PstMetadata }
  | { format_type: 'code'; 0: string };
|
||
|
||
/**
 * A block-level element of a Djot document.
 *
 * Models structural elements such as headings, paragraphs, lists, code
 * blocks, etc.
 */
export interface FormattedBlock {
  /** Which kind of block element this is. */
  readonly blockType: BlockType;

  /** For headings, the heading level (1-6); for lists, the nesting level. */
  readonly level?: number;

  /** The inline content contained in the block. */
  readonly inlineContent: InlineElement[];

  /** Attributes of the element (classes, IDs, key-value pairs). */
  readonly attributes?: string;

  /** For code blocks, the language identifier. */
  readonly language?: string;

  /** For code blocks, the raw code content. */
  readonly code?: string;

  /** Blocks nested inside containers (blockquotes, list items, divs). */
  readonly children: FormattedBlock[];
}
|
||
|
||
/** A single cell of a grid, with its position and span metadata. */
export interface GridCell {
  /** Text content of the cell. */
  readonly content: string;

  /** Row position, counted from zero. */
  readonly row: number;

  /** Column position, counted from zero. */
  readonly col: number;

  /** How many rows the cell spans. */
  readonly rowSpan: number;

  /** How many columns the cell spans. */
  readonly colSpan: number;

  /** True when the cell is a header cell. */
  readonly isHeader: boolean;

  /** Bounding box of the cell, if available. */
  readonly bbox?: BoundingBox;
}
|
||
|
||
/** Metadata describing a header/heading element. */
export interface HeaderMetadata {
  /** Level of the header, from 1 (h1) to 6 (h6). */
  readonly level: number;

  /** The header's text content, normalized. */
  readonly text: string;

  /** The HTML id attribute, when present. */
  readonly id?: string;

  /** Depth in the document tree at the header element. */
  readonly depth: number;

  /** Offset in bytes within the original HTML document. */
  readonly htmlOffset: number;
}
|
||
|
||
/**
 * Heading context for a chunk within a Markdown document.
 *
 * Contains the heading hierarchy from document root to this chunk's section.
 */
export interface HeadingContext {
  /**
   * The heading hierarchy from document root to this chunk's section.
   * Index 0 is the outermost (h1), last element is the most specific.
   */
  readonly headings: Array<HeadingLevel>
}

/** A single heading in the hierarchy. */
export interface HeadingLevel {
  /** Heading depth (1 = h1, 2 = h2, etc.) */
  readonly level: number
  /** The text content of the heading. */
  readonly text: string
}

/**
 * A text block with hierarchy level assignment.
 *
 * Represents a block of text with semantic heading information extracted from
 * font size clustering and hierarchical analysis.
 */
export interface HierarchicalBlock {
  /** The text content of this block */
  readonly text: string
  /** The font size of the text in this block */
  readonly fontSize: number
  /**
   * The hierarchy level of this block (H1-H6 or Body)
   *
   * Levels correspond to HTML heading tags:
   * - "h1": Top-level heading
   * - "h2": Secondary heading
   * - "h3": Tertiary heading
   * - "h4": Quaternary heading
   * - "h5": Quinary heading
   * - "h6": Senary heading
   * - "body": Body text (no heading level)
   */
  readonly level: string
  /**
   * Bounding box information for the block
   *
   * Contains coordinates as (left, top, right, bottom) in PDF units.
   */
  readonly bbox?: Array<number>
}

/**
 * Hierarchy extraction configuration for PDF text structure analysis.
 *
 * Enables extraction of document hierarchy levels (H1-H6) based on font size
 * clustering and semantic analysis. When enabled, hierarchical blocks are
 * included in page content.
 */
export interface HierarchyConfig {
  /** Enable hierarchy extraction */
  readonly enabled?: boolean
  /**
   * Number of font size clusters to use for hierarchy levels (1-7)
   *
   * Default: 6, which provides H1-H6 heading levels with body text.
   * Larger values create more fine-grained hierarchy levels.
   */
  readonly kClusters?: number
  /** Include bounding box information in hierarchy blocks */
  readonly includeBbox?: boolean
  /**
   * OCR coverage threshold for smart OCR triggering (0.0-1.0)
   *
   * Determines when OCR should be triggered based on text block coverage.
   * OCR is triggered when text blocks cover less than this fraction of the page.
   * Default: 0.5 (trigger OCR if less than 50% of page has text)
   */
  readonly ocrCoverageThreshold?: number
}

/**
 * HTML metadata extracted from HTML documents.
 *
 * Includes document-level metadata, Open Graph data, Twitter Card metadata,
 * and extracted structural elements (headers, links, images, structured data).
 */
export interface HtmlMetadata {
  /** Document title from `<title>` tag */
  readonly title?: string
  /** Document description from `<meta name="description">` tag */
  readonly description?: string
  /** Document keywords from `<meta name="keywords">` tag, split on commas */
  readonly keywords?: Array<string>
  /** Document author from `<meta name="author">` tag */
  readonly author?: string
  /** Canonical URL from `<link rel="canonical">` tag */
  readonly canonicalUrl?: string
  /** Base URL from `<base href="">` tag for resolving relative URLs */
  readonly baseHref?: string
  /** Document language from `lang` attribute */
  readonly language?: string
  /** Document text direction from `dir` attribute */
  readonly textDirection?: TextDirection
  /**
   * Open Graph metadata (og:* properties) for social media
   * Keys like "title", "description", "image", "url", etc.
   */
  readonly openGraph?: Record<string, string>
  /**
   * Twitter Card metadata (twitter:* properties)
   * Keys like "card", "site", "creator", "title", "description", "image", etc.
   */
  readonly twitterCard?: Record<string, string>
  /**
   * Additional meta tags not covered by specific fields
   * Keys are meta name/property attributes, values are content
   */
  readonly metaTags?: Record<string, string>
  /** Extracted header elements with hierarchy */
  readonly headers?: Array<HeaderMetadata>
  /** Extracted hyperlinks with type classification */
  readonly links?: Array<LinkMetadata>
  /** Extracted images with source and dimensions */
  readonly images?: Array<ImageMetadataType>
  /** Extracted structured data blocks */
  readonly structuredData?: Array<StructuredData>
}

/**
 * Configuration for styled HTML output.
 *
 * When set on [`ExtractionConfig::html_output`] alongside
 * `output_format = OutputFormat::Html`, the pipeline builds a
 * [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
 * the plain comrak-based renderer.
 */
export interface HtmlOutputConfig {
  /**
   * Inline CSS string injected into the output after the theme stylesheet.
   * Concatenated after `cssFile` content when both are set.
   */
  readonly css?: string
  /**
   * Path to a CSS file loaded once at renderer construction time.
   * Concatenated before `css` when both are set.
   */
  readonly cssFile?: string
  /** Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`]. */
  readonly theme?: HtmlTheme
  /**
   * CSS class prefix applied to every emitted class name.
   *
   * Default: `"kb-"`. Change this if your host application already uses
   * classes that start with `kb-`.
   */
  readonly classPrefix?: string
  /**
   * When `true` (default), write the resolved CSS into a `<style>` block
   * immediately after the opening `<div class="{prefix}doc">`.
   *
   * Set to `false` to emit only the structural markup and wire up your
   * own stylesheet targeting the `kb-*` class names.
   */
  readonly embedCss?: boolean
}

/** Built-in HTML theme selection. */
export declare enum HtmlTheme {
  /**
   * Sensible defaults: system font stack, neutral colours, readable line
   * measure. CSS custom properties (`--kb-*`) are all defined so user CSS
   * can override individual values.
   */
  Default = "default",
  /** GitHub Markdown-inspired palette and spacing. */
  GitHub = "github",
  /** Dark background, light text. */
  Dark = "dark",
  /** Minimal light theme with generous whitespace. */
  Light = "light",
  /**
   * No built-in stylesheet emitted. CSS custom properties are still defined
   * on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
   */
  Unstyled = "unstyled",
}

/** Image extraction configuration. */
export interface ImageExtractionConfig {
  /** Extract images from documents */
  readonly extractImages?: boolean
  /** Target DPI for image normalization */
  readonly targetDpi?: number
  /** Maximum dimension for images (width or height) */
  readonly maxImageDimension?: number
  /**
   * Whether to inject image reference placeholders into markdown output.
   * When `true` (default), image reference placeholders are appended to the
   * markdown. Set to `false` to extract images as data without polluting the
   * markdown output.
   */
  readonly injectPlaceholders?: boolean
  /** Automatically adjust DPI based on image content */
  readonly autoAdjustDpi?: boolean
  /** Minimum DPI threshold */
  readonly minDpi?: number
  /** Maximum DPI threshold */
  readonly maxDpi?: number
  /**
   * Maximum number of image objects to extract per PDF page.
   *
   * Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
   * can trigger extremely long or indefinite extraction times when every image
   * object on a dense page is decoded individually via the PDF extractor. Setting this
   * limit causes kreuzberg to stop collecting individual images once the count
   * per page reaches the cap and emit a warning instead.
   *
   * `None` (default) means no limit — all images are extracted.
   */
  readonly maxImagesPerPage?: number
  /**
   * When `true` (default), extracted images are classified by kind and grouped
   * into clusters where they appear to belong to one figure.
   */
  readonly classify?: boolean
  /**
   * When `true`, full-page renders produced during OCR preprocessing are captured
   * and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`.
   *
   * **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
   * document-level OCR bypass is active (whole-document backend). When OCR is
   * enabled and this flag is set but the active backend skips per-page rendering,
   * a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
   *
   * Defaults to `false`. Enable when downstream consumers need page thumbnails
   * (e.g. citation previews, visual grounding).
   */
  readonly includePageRasters?: boolean
  /**
   * Run OCR on extracted images and include the recognized text in the document content.
   *
   * When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
   * are processed with the configured OCR backend. Set to `false` to extract images
   * without OCR processing, even when OCR is enabled.
   */
  readonly runOcrOnImages?: boolean
  /**
   * When `true`, image OCR results are rendered as plain text without the
   * image markdown placeholder. Only takes effect when `runOcrOnImages`
   * is also `true`.
   */
  readonly ocrTextOnly?: boolean
  /**
   * When `true` and `ocrTextOnly` is `false`, append the OCR text after
   * the image placeholder in the rendered output.
   */
  readonly appendOcrText?: boolean
}

/** Heuristic classification of what an image likely depicts. */
export declare enum ImageKind {
  /** Photographic image (natural scene, photograph) */
  Photograph = "photograph",
  /** Technical or schematic diagram */
  Diagram = "diagram",
  /** Chart, graph, or plot */
  Chart = "chart",
  /** Freehand or technical drawing */
  Drawing = "drawing",
  /** Text-heavy image (scanned text, document) */
  TextBlock = "text_block",
  /** Decorative element or border */
  Decoration = "decoration",
  /** Logo or brand mark */
  Logo = "logo",
  /** Small icon */
  Icon = "icon",
  /** Fragment of a larger tiled image (tile of a technical drawing) */
  TileFragment = "tile_fragment",
  /** Mask or transparency map */
  Mask = "mask",
  /** Full-page render produced during OCR preprocessing; used as a citation thumbnail. */
  PageRaster = "page_raster",
  /** Could not classify with reasonable confidence */
  Unknown = "unknown",
}

/**
 * Image metadata extracted from image files.
 *
 * Includes dimensions, format, and EXIF data.
 */
export interface ImageMetadata {
  /** Image width in pixels */
  readonly width?: number
  /** Image height in pixels */
  readonly height?: number
  /** Image format (e.g., "PNG", "JPEG", "TIFF") */
  readonly format?: string
  /** EXIF metadata tags */
  readonly exif?: Record<string, string>
}

/** Image element metadata. */
export interface ImageMetadataType {
  /** Image source (URL, data URI, or SVG content) */
  readonly src: string
  /** Alternative text from alt attribute */
  readonly alt?: string
  /** Title attribute */
  readonly title?: string
  /** Image dimensions as (width, height) if available */
  readonly dimensions?: Array<number>
  /** Image type classification */
  readonly imageType: ImageType
  /** Additional attributes as key-value pairs */
  readonly attributes: Array<Array<string>>
}

/**
 * Image preprocessing configuration for OCR.
 *
 * These settings control how images are preprocessed before OCR to improve
 * text recognition quality. Different preprocessing strategies work better
 * for different document types.
 */
export interface ImagePreprocessingConfig {
  /** Target DPI for the image (300 is standard, 600 for small text). */
  readonly targetDpi?: number
  /** Auto-detect and correct image rotation. */
  readonly autoRotate?: boolean
  /** Correct skew (tilted images). */
  readonly deskew?: boolean
  /** Remove noise from the image. */
  readonly denoise?: boolean
  /** Enhance contrast for better text visibility. */
  readonly contrastEnhance?: boolean
  /** Binarization method: "otsu", "sauvola", "adaptive". */
  readonly binarizationMethod?: string
  /** Invert colors (white text on black → black on white). */
  readonly invertColors?: boolean
}

/**
 * Image preprocessing metadata.
 *
 * Tracks the transformations applied to an image during OCR preprocessing,
 * including DPI normalization, resizing, and resampling.
 */
export interface ImagePreprocessingMetadata {
  /** Original image dimensions (width, height) in pixels */
  readonly originalDimensions: Array<number>
  /** Original image DPI (horizontal, vertical) */
  readonly originalDpi: Array<number>
  /** Target DPI from configuration */
  readonly targetDpi: number
  /** Scaling factor applied to the image */
  readonly scaleFactor: number
  /** Whether DPI was auto-adjusted based on content */
  readonly autoAdjusted: boolean
  /** Final DPI after processing */
  readonly finalDpi: number
  /** New dimensions after resizing (if resized) */
  readonly newDimensions?: Array<number>
  /** Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.) */
  readonly resampleMethod: string
  /** Whether dimensions were clamped to max_image_dimension */
  readonly dimensionClamped: boolean
  /** Calculated optimal DPI (if auto_adjust_dpi enabled) */
  readonly calculatedDpi?: number
  /** Whether resize was skipped (dimensions already optimal) */
  readonly skippedResize: boolean
  /** Error message if resize failed */
  readonly resizeError?: string
}

/** Image type classification. */
export declare enum ImageType {
  /** Data URI image */
  DataUri = "data-uri",
  /** Inline SVG */
  InlineSvg = "inline-svg",
  /** External image URL */
  External = "external",
  /** Relative path image */
  Relative = "relative",
}

/**
 * Inline element within a block.
 *
 * Represents text with formatting, links, images, etc.
 */
export interface InlineElement {
  /** Type of inline element */
  readonly elementType: InlineType
  /** Text content */
  readonly content: string
  /** Element attributes */
  readonly attributes?: string
  /** Additional metadata (e.g., href for links, src/alt for images) */
  readonly metadata?: Record<string, string>
}

/** Types of inline elements in Djot. */
export declare enum InlineType {
  Text = "text",
  Strong = "strong",
  Emphasis = "emphasis",
  Highlight = "highlight",
  Subscript = "subscript",
  Superscript = "superscript",
  Insert = "insert",
  Delete = "delete",
  Code = "code",
  Link = "link",
  Image = "image",
  Span = "span",
  Math = "math",
  RawInline = "raw_inline",
  FootnoteRef = "footnote_ref",
  Symbol = "symbol",
}

/** JATS (Journal Article Tag Suite) metadata. */
export interface JatsMetadata {
  readonly copyright?: string
  readonly license?: string
  readonly historyDates?: Record<string, string>
  readonly contributorRoles?: Array<ContributorRole>
}

/** Extracted keyword with metadata. */
export interface Keyword {
  /** The keyword text. */
  readonly text: string
  /** Relevance score (higher is better, algorithm-specific range). */
  readonly score: number
  /** Algorithm that extracted this keyword. */
  readonly algorithm: KeywordAlgorithm
  /** Optional positions where keyword appears in text (character offsets). */
  readonly positions?: Array<number>
}

/** Keyword algorithm selection. */
export declare enum KeywordAlgorithm {
  /** YAKE (Yet Another Keyword Extractor) - statistical approach */
  Yake = "yake",
  /** RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based */
  Rake = "rake",
}

/** Keyword extraction configuration. */
export interface KeywordConfig {
  /** Algorithm to use for extraction. */
  readonly algorithm?: KeywordAlgorithm
  /** Maximum number of keywords to extract (default: 10). */
  readonly maxKeywords?: number
  /**
   * Minimum score threshold (0.0-1.0, default: 0.0).
   *
   * Keywords with scores below this threshold are filtered out.
   * Note: Score ranges differ between algorithms.
   */
  readonly minScore?: number
  /**
   * N-gram range for keyword extraction (min, max).
   *
   * (1, 1) = unigrams only
   * (1, 2) = unigrams and bigrams
   * (1, 3) = unigrams, bigrams, and trigrams (default)
   */
  readonly ngramRange?: Array<number>
  /**
   * Language code for stopword filtering (e.g., "en", "de", "fr").
   *
   * If None, no stopword filtering is applied.
   */
  readonly language?: string
  /** YAKE-specific tuning parameters. */
  readonly yakeParams?: YakeParams
  /** RAKE-specific tuning parameters. */
  readonly rakeParams?: RakeParams
}

/** Language detection configuration. */
export interface LanguageDetectionConfig {
  /** Enable language detection */
  readonly enabled?: boolean
  /** Minimum confidence threshold (0.0-1.0) */
  readonly minConfidence?: number
  /** Detect multiple languages in the document */
  readonly detectMultiple?: boolean
}

/**
 * The 17 canonical document layout classes.
 *
 * All model backends (RT-DETR, YOLO, etc.) map their native class IDs
 * to this shared set. Models with fewer classes (DocLayNet: 11, PubLayNet: 5)
 * map to the closest equivalent.
 *
 * Wire format is snake_case in all serializers (JSON, TOML, YAML).
 */
export declare enum LayoutClass {
  Caption = "caption",
  Footnote = "footnote",
  Formula = "formula",
  ListItem = "list_item",
  PageFooter = "page_footer",
  PageHeader = "page_header",
  Picture = "picture",
  SectionHeader = "section_header",
  Table = "table",
  Text = "text",
  Title = "title",
  DocumentIndex = "document_index",
  Code = "code",
  CheckboxSelected = "checkbox_selected",
  CheckboxUnselected = "checkbox_unselected",
  Form = "form",
  KeyValueRegion = "key_value_region",
}

/** A single layout detection result. */
export interface LayoutDetection {
  readonly className: LayoutClass
  readonly confidence: number
  readonly bbox: BBox
}

/**
 * Layout detection configuration.
 *
 * Controls layout detection behavior in the extraction pipeline.
 * When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
 * is enabled for PDF extraction.
 */
export interface LayoutDetectionConfig {
  /** Confidence threshold override (None = use model default). */
  readonly confidenceThreshold?: number
  /** Whether to apply postprocessing heuristics (default: true). */
  readonly applyHeuristics?: boolean
  /**
   * Table structure recognition model.
   *
   * Controls which model is used for table cell detection within layout-detected
   * table regions. Defaults to [`TableModel::Tatr`].
   */
  readonly tableModel?: TableModel
  /**
   * Hardware acceleration for ONNX models (layout detection + table structure).
   *
   * When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
   * is used for inference. Defaults to `None` (auto-select per platform).
   */
  readonly acceleration?: AccelerationConfig
}

/**
 * A detected layout region on a page.
 *
 * When layout detection is enabled, each page may have layout regions
 * identifying different content types (text, pictures, tables, etc.)
 * with confidence scores and spatial positions.
 */
export interface LayoutRegion {
  /** Layout class name (e.g. "picture", "table", "text", "section_header"). */
  readonly className?: string
  /** Confidence score from the layout detection model (0.0 to 1.0). */
  readonly confidence?: number
  /** Bounding box in document coordinate space. */
  readonly boundingBox?: BoundingBox
  /** Fraction of the page area covered by this region (0.0 to 1.0). */
  readonly areaFraction?: number
}

/** Link element metadata. */
export interface LinkMetadata {
  /** The href URL value */
  readonly href: string
  /** Link text content (normalized) */
  readonly text: string
  /** Optional title attribute */
  readonly title?: string
  /** Link type classification */
  readonly linkType: LinkType
  /** Rel attribute values */
  readonly rel: Array<string>
  /** Additional attributes as key-value pairs */
  readonly attributes: Array<Array<string>>
}

/** Link type classification. */
export declare enum LinkType {
  /** Anchor link (#section) */
  Anchor = "anchor",
  /** Internal link (same domain) */
  Internal = "internal",
  /** External link (different domain) */
  External = "external",
  /** Email link (mailto:) */
  Email = "email",
  /** Phone link (tel:) */
  Phone = "phone",
  /** Other link type */
  Other = "other",
}

/** Type of list detection. */
export declare enum ListType {
  /** Bullet points (-, *, •, etc.) */
  Bullet = "Bullet",
  /** Numbered lists (1., 2., etc.) */
  Numbered = "Numbered",
  /** Lettered lists (a., b., A., B., etc.) */
  Lettered = "Lettered",
  /** Indented items */
  Indented = "Indented",
}

/**
 * Configuration for an LLM provider/model via liter-llm.
 *
 * Each feature (VLM OCR, VLM embeddings, structured extraction) carries
 * its own `LlmConfig`, allowing different providers per feature.
 * @example
 * ```toml
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * api_key = "sk-..."  # or use KREUZBERG_LLM_API_KEY env var
 * ```
 */
export interface LlmConfig {
  /**
   * Provider/model string using liter-llm routing format.
   *
   * Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
   * `"groq/llama-3.1-70b-versatile"`.
   */
  readonly model?: string
  /**
   * API key for the provider. When `None`, liter-llm falls back to
   * the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
   */
  readonly apiKey?: string
  /** Custom base URL override for the provider endpoint. */
  readonly baseUrl?: string
  /** Request timeout in seconds (default: 60). */
  readonly timeoutSecs?: number
  /** Maximum retry attempts (default: 3). */
  readonly maxRetries?: number
  /** Sampling temperature for generation tasks. */
  readonly temperature?: number
  /** Maximum tokens to generate. */
  readonly maxTokens?: number
}

/**
 * Token usage and cost data for a single LLM call made during extraction.
 *
 * Populated when VLM OCR, structured extraction, or LLM-based embeddings
 * are used. Multiple entries may be present when multiple LLM calls occur
 * within one extraction (e.g. VLM OCR + structured extraction).
 */
export interface LlmUsage {
  /** The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514"). */
  readonly model?: string
  /**
   * The pipeline stage that triggered this LLM call
   * (e.g. "vlm_ocr", "structured_extraction", "embeddings").
   */
  readonly source?: string
  /** Number of input/prompt tokens consumed. */
  readonly inputTokens?: number
  /** Number of output/completion tokens generated. */
  readonly outputTokens?: number
  /** Total tokens (input + output). */
  readonly totalTokens?: number
  /** Estimated cost in USD based on the provider's published pricing. */
  readonly estimatedCost?: number
  /** Why the model stopped generating (e.g. "stop", "length", "content_filter"). */
  readonly finishReason?: string
}

/**
 * Extraction result metadata.
 *
 * Contains common fields applicable to all formats, format-specific metadata
 * via a discriminated union, and additional custom fields from postprocessors.
 */
export interface Metadata {
  /** Document title */
  readonly title?: string
  /** Document subject or description */
  readonly subject?: string
  /** Primary author(s) - always Vec for consistency */
  readonly authors?: Array<string>
  /** Keywords/tags - always Vec for consistency */
  readonly keywords?: Array<string>
  /** Primary language (ISO 639 code) */
  readonly language?: string
  /** Creation timestamp (ISO 8601 format) */
  readonly createdAt?: string
  /** Last modification timestamp (ISO 8601 format) */
  readonly modifiedAt?: string
  /** User who created the document */
  readonly createdBy?: string
  /** User who last modified the document */
  readonly modifiedBy?: string
  /** Page/slide/sheet structure with boundaries */
  readonly pages?: PageStructure
  /**
   * Format-specific metadata (discriminated union)
   *
   * Contains detailed metadata specific to the document format.
   * Serialized as a nested `"format"` object with a `format_type` discriminator field.
   */
  readonly format?: FormatMetadata
  /** Image preprocessing metadata (when OCR preprocessing was applied) */
  readonly imagePreprocessing?: ImagePreprocessingMetadata
  /** JSON schema (for structured data extraction) */
  readonly jsonSchema?: JsonValue
  /** Error metadata (for batch operations) */
  readonly error?: ErrorMetadata
  /**
   * Extraction duration in milliseconds (for benchmarking).
   *
   * This field is populated by batch extraction to provide per-file timing
   * information. It's `None` for single-file extraction (which uses external timing).
   */
  readonly extractionDurationMs?: number
  /** Document category (from frontmatter or classification). */
  readonly category?: string
  /** Document tags (from frontmatter). */
  readonly tags?: Array<string>
  /** Document version string (from frontmatter). */
  readonly documentVersion?: string
  /** Abstract or summary text (from frontmatter). */
  readonly abstractText?: string
  /**
   * Output format identifier (e.g., "markdown", "html", "text").
   *
   * Set by the output format pipeline stage when format conversion is applied.
   * Previously stored in `metadata.additional["output_format"]`.
   */
  readonly outputFormat?: string
  /**
   * Whether OCR was used during extraction.
   *
   * Set to `true` whenever the extraction pipeline ran an OCR backend
   * (Tesseract, PaddleOCR, VLM, etc.) and used that output as the primary
   * or fallback text. `false` means native text extraction was used exclusively.
   */
  readonly ocrUsed?: boolean
  /**
   * Additional custom fields from postprocessors.
   *
   * Serialized as a nested `"additional"` object (not flattened at root level).
   * Uses `Cow<'static, str>` keys so static string keys avoid allocation.
   */
  readonly additional?: Record<string, JsonValue>
}

/** Combined paths to all models needed for OCR (backward compatibility). */
export interface ModelPaths {
  /** Path to the detection model directory. */
  readonly detModel: string
  /** Path to the classification model directory. */
  readonly clsModel: string
  /** Path to the recognition model directory. */
  readonly recModel: string
  /** Path to the character dictionary file. */
  readonly dictFile: string
}

/**
 * Tagged enum for node content. Each variant carries only type-specific data.
 *
 * Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
 * Go/Java/TypeScript bindings.
 */
export type NodeContent =
  | { node_type: 'title'; text: string }
  | { node_type: 'heading'; level: number; text: string }
  | { node_type: 'paragraph'; text: string }
  | { node_type: 'list'; ordered: boolean }
  | { node_type: 'list_item'; text: string }
  | { node_type: 'table'; grid: TableGrid }
  | { node_type: 'image'; description: string; imageIndex: number; src: string }
  | { node_type: 'code'; text: string; language: string }
  | { node_type: 'quote' }
  | { node_type: 'formula'; text: string }
  | { node_type: 'footnote'; text: string }
  | { node_type: 'group'; label: string; headingLevel: number; headingText: string }
  | { node_type: 'page_break' }
  | { node_type: 'slide'; number: number; title: string }
  | { node_type: 'definition_list' }
  | { node_type: 'definition_item'; term: string; definition: string }
  | { node_type: 'citation'; key: string; text: string }
  | { node_type: 'admonition'; kind: string; title: string }
  | { node_type: 'raw_block'; format: string; content: string }
  | { node_type: 'metadata_block'; entries: Array<Array<string>> }

/**
 * Trait for OCR backend plugins.
 *
 * Implement this trait to add custom OCR capabilities. OCR backends can be:
 * - Native Rust implementations (like Tesseract)
 * - FFI bridges to Python libraries (like EasyOCR, PaddleOCR)
 * - Cloud-based OCR services (Google Vision, AWS Textract, etc.)
 *
 * # Thread Safety
 *
 * OCR backends must be thread-safe (`Send + Sync`) to support concurrent processing.
 *
 * NOTE(review): several methods below are declared as returning `string` although
 * their descriptions speak of booleans, lists, or an `ExtractionResult` — presumably
 * the binding expects serialized values; confirm against the generator.
 */
export interface OcrBackend {
  /** Name of this backend. */
  name(): string
  /**
   * Process an image and extract text via OCR.
   *
   * # Reading `backend_options`
   *
   * Backends that support runtime tuning can read `config.backend_options` and
   * deserialize only the keys they care about. Unknown keys are silently ignored,
   * so multiple backends can coexist in a pipeline without key conflicts.
   *
   * ```rust
   * async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
   *     // Read backend-specific options; unknown keys are silently ignored.
   *     let fast_mode = config.backend_options
   *         .as_ref()
   *         .and_then(|v| v.get("mode"))
   *         .and_then(|v| v.as_str())
   *         .map(|s| s == "fast")
   *         .unwrap_or(false);
   *
   *     if image_bytes.is_empty() {
   *         return Err(kreuzberg::KreuzbergError::Validation {
   *             message: "Empty image data".to_string(),
   *             source: None,
   *         });
   *     }
   *
   *     let text = if fast_mode {
   *         "Fast OCR result".to_string()
   *     } else {
   *         format!("Extracted text in language: {}", config.language)
   *     };
   *
   *     Ok(ExtractionResult {
   *         content: text,
   *         mime_type: Cow::Borrowed("text/plain"),
   *         ..Default::default()
   *     })
   * }
   * ```
   *
   * @param imageBytes - Raw image data (JPEG, PNG, TIFF, etc.)
   *
   * @param config - OCR configuration (language, PSM mode, etc.)
   *
   * @returns An `ExtractionResult` containing the extracted text and metadata.
   *
   * @throws
   * - `KreuzbergError::Ocr` - OCR processing failed
   * - `KreuzbergError::Validation` - Invalid image format or configuration
   * - `KreuzbergError::Io` - I/O errors (these always bubble up)
   */
  processImage(imageBytes: Uint8Array, config?: OcrConfig | undefined | null): Promise<string>
  /**
   * Process a file and extract text via OCR.
   *
   * Default implementation reads the file and calls `process_image`.
   * Override for custom file handling or optimizations.
   * @param path - Path to the image file
   *
   * @param config - OCR configuration
   *
   * @throws Same as `process_image`, plus file I/O errors.
   */
  processImageFile?(path: string, config?: OcrConfig | undefined | null): Promise<string>
  /**
   * Check if this backend supports a given language code.
   * @param lang - ISO 639-2/3 language code (e.g., "eng", "deu", "fra")
   *
   * @returns `true` if the language is supported, `false` otherwise.
   */
  supportsLanguage(lang: string): string
  /**
   * Get the backend type identifier.
   * @returns The backend type enum value.
   */
  backendType(): string
  /**
   * Optional: Get a list of all supported languages.
   *
   * Defaults to empty list. Override to provide comprehensive language support info.
   */
  supportedLanguages?(): string
  /**
   * Optional: Check if the backend supports table detection.
   *
   * Defaults to `false`. Override if your backend can detect and extract tables.
   */
  supportsTableDetection?(): string
  /**
   * Check if the backend supports direct document-level processing (e.g. for PDFs).
   *
   * Defaults to `false`. Override if the backend has optimized document processing.
   */
  supportsDocumentProcessing?(): string
  /**
   * Process a document file directly via OCR.
   *
   * Only called if `supports_document_processing` returns `true`.
   * @param path - Path to the document file (e.g. .pdf)
   *
   * @param config - OCR configuration
   */
  processDocument?(path: string, config?: OcrConfig | undefined | null): Promise<string>
}

/** OCR backend types. */
export declare enum OcrBackendType {
  /** Tesseract OCR (native Rust binding) */
  Tesseract = "Tesseract",
  /** EasyOCR (Python-based, via FFI) */
  EasyOCR = "EasyOCR",
  /** PaddleOCR (Python-based, via FFI) */
  PaddleOCR = "PaddleOCR",
  /** Custom/third-party OCR backend */
  Custom = "Custom",
}

/**
 * Bounding geometry for an OCR element.
 *
 * Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
 * (from PaddleOCR and rotated text detection).
 *
 * Narrow on the `type` discriminant before reading variant-specific fields.
 *
 * NOTE(review): `points` is declared as `string` rather than a numeric array —
 * presumably a serialized point list; confirm the encoding against the generator.
 */
export type OcrBoundingGeometry =
  | { type: 'rectangle'; left: number; top: number; width: number; height: number }
  | { type: 'quadrilateral'; points: string }

/**
 * Confidence scores for an OCR element.
 *
 * Separates detection confidence (how confident that text exists at this location)
 * from recognition confidence (how confident about the actual text content).
 */
export interface OcrConfidence {
  /**
   * Detection confidence: how confident the OCR engine is that text exists here.
   *
   * PaddleOCR provides this as `box_score`, Tesseract doesn't have a direct equivalent.
   * Range: 0.0 to 1.0 (or None if not available).
   */
  readonly detection?: number
  /**
   * Recognition confidence: how confident about the text content.
   *
   * Range: 0.0 to 1.0.
   */
  readonly recognition?: number
}

/** OCR configuration. */
export interface OcrConfig {
  /**
   * Whether OCR is enabled.
   *
   * Setting `enabled: false` is a shorthand for `disable_ocr: true` on the parent
   * `ExtractionConfig`. Images return metadata only; PDFs use native text
   * extraction without OCR fallback.
   *
   * Defaults to `true`. When `false`, all other OCR settings are ignored.
   */
  readonly enabled?: boolean
  /** OCR backend: tesseract, easyocr, paddleocr */
  readonly backend?: string
  /** Language code (e.g., "eng", "deu") */
  readonly language?: string
  /** Tesseract-specific configuration (optional) */
  readonly tesseractConfig?: TesseractConfig
  /** Output format for OCR results (optional, for format conversion) */
  readonly outputFormat?: OutputFormat
  /** PaddleOCR-specific configuration (optional, JSON passthrough) */
  readonly paddleOcrConfig?: JsonValue
  /**
   * Arbitrary per-call options passed through to the backend unchanged.
   *
   * Custom OCR backends and built-in backends that support runtime tuning
   * can read this value and deserialize the keys they care about. Keys
   * unknown to the backend are silently ignored.
   *
   * This is the recommended extension point for per-call parameters that
   * are not covered by the typed fields above (e.g. mode switching,
   * preprocessing flags, inference batch size).
   *
   * **Scope:** when `pipeline` is `None`, this value is propagated to the
   * primary stage of the auto-constructed pipeline. When `pipeline` is
   * explicitly set, this field has **no effect** — the caller must set
   * `OcrPipelineStage.backend_options` directly on the relevant stage(s)
   * instead.
   *
   * Example:
   * ```json
   * { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
   * ```
   */
  readonly backendOptions?: JsonValue
  /** OCR element extraction configuration */
  readonly elementConfig?: OcrElementConfig
  /**
   * Quality thresholds for the native-text-to-OCR fallback decision.
   * When None, uses compiled defaults (matching previous hardcoded behavior).
   */
  readonly qualityThresholds?: OcrQualityThresholds
  /**
   * Multi-backend OCR pipeline configuration. When set, enables weighted
   * fallback across multiple OCR backends based on output quality.
   * When None, uses the single `backend` field (same as today).
   */
  readonly pipeline?: OcrPipelineConfig
  /**
   * Enable automatic page rotation based on orientation detection.
   *
   * When enabled, uses Tesseract's `DetectOrientationScript()` to detect
   * page orientation (0/90/180/270 degrees) before OCR. If the page is
   * rotated with high confidence, the image is corrected before recognition.
   * This is critical for handling rotated scanned documents.
   */
  readonly autoRotate?: boolean
  /**
   * VLM (Vision Language Model) OCR configuration.
   *
   * Required when `backend` is `"vlm"`. Uses liter-llm to send page
   * images to a vision model for text extraction.
   */
  readonly vlmConfig?: LlmConfig
  /**
   * Custom Jinja2 prompt template for VLM OCR.
   *
   * When `None`, uses the default template. Available variables:
   * - `{{ language }}` — The document language code (e.g., "eng", "deu").
   */
  readonly vlmPrompt?: string
  /**
   * Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
   *
   * Not user-configurable via config files — injected at runtime from
   * `ExtractionConfig::acceleration` before each `process_image` call.
   */
  readonly acceleration?: AccelerationConfig
  /**
   * Caller-supplied Tesseract `traineddata` bytes per language code.
   *
   * Primary use case is the WASM build, which has no filesystem and cannot
   * download tessdata at runtime. Native builds typically rely on
   * `TessdataManager` and ignore this field. When present, the WASM
   * Tesseract backend prefers these bytes over its compile-time-bundled
   * English data.
   *
   * Skipped by serde to keep config files small — supply via the typed API
   * at runtime.
   */
  readonly tessdataBytes?: Record<string, Uint8Array>
}

/**
 * A unified OCR element representing detected text with full metadata.
 *
 * This is the primary type for structured OCR output, preserving all information
 * from both Tesseract and PaddleOCR backends.
 */
export interface OcrElement {
  /** The recognized text content. */
  readonly text?: string
  /** Bounding geometry (rectangle or quadrilateral). */
  readonly geometry?: OcrBoundingGeometry
  /** Confidence scores for detection and recognition. */
  readonly confidence?: OcrConfidence
  /** Hierarchical level (word, line, block, page). */
  readonly level?: OcrElementLevel
  /** Rotation information (if detected). */
  readonly rotation?: OcrRotation
  /** Page number (1-indexed). */
  readonly pageNumber?: number
  /**
   * Parent element ID for hierarchical relationships.
   *
   * Only used for Tesseract output which has word -> line -> block hierarchy.
   */
  readonly parentId?: string
  /** Backend-specific metadata that doesn't fit the unified schema. */
  readonly backendMetadata?: Record<string, JsonValue>
}

/**
 * Configuration for OCR element extraction.
 *
 * Controls how OCR elements are extracted and filtered.
 */
export interface OcrElementConfig {
  /**
   * Whether to include OCR elements in the extraction result.
   *
   * When true, the `ocr_elements` field in `ExtractionResult` will be populated.
   */
  readonly includeElements?: boolean
  /**
   * Minimum hierarchical level to include.
   *
   * Elements below this level (e.g., words when min_level is Line) will be excluded.
   */
  readonly minLevel?: OcrElementLevel
  /**
   * Minimum recognition confidence threshold (0.0-1.0).
   *
   * Elements with confidence below this threshold will be filtered out.
   */
  readonly minConfidence?: number
  /**
   * Whether to build hierarchical relationships between elements.
   *
   * When true, `parent_id` fields will be populated based on spatial containment.
   * Only meaningful for Tesseract output.
   */
  readonly buildHierarchy?: boolean
}

/**
 * Hierarchical level of an OCR element.
 *
 * Maps to Tesseract's page segmentation hierarchy and provides
 * equivalent semantics for PaddleOCR.
 */
export declare enum OcrElementLevel {
  /** Individual word */
  Word = "word",
  /** Line of text (default for PaddleOCR) */
  Line = "line",
  /** Paragraph or text block */
  Block = "block",
  /** Page-level element */
  Page = "page",
}

/**
 * OCR extraction result.
 *
 * Result of performing OCR on an image or scanned document,
 * including recognized text and detected tables.
 */
export interface OcrExtractionResult {
  /** Recognized text content */
  readonly content: string
  /** Original MIME type of the processed image */
  readonly mimeType: string
  /** OCR processing metadata (confidence scores, language, etc.) */
  readonly metadata: Record<string, JsonValue>
  /** Tables detected and extracted via OCR */
  readonly tables: Array<OcrTable>
  /**
   * Structured OCR elements with bounding boxes and confidence scores.
   * Available when TSV output is requested or table detection is enabled.
   */
  readonly ocrElements?: Array<OcrElement>
  /**
   * Structured document produced from hOCR parsing.
   * Carries paragraph structure, bounding boxes, and confidence scores
   * that the flattened `content` string discards.
   *
   * NOTE(review): declared as `string` — presumably a serialized form of the
   * structured document; confirm the encoding against the generator.
   */
  readonly internalDocument?: string
}

/**
 * OCR processing metadata.
 *
 * Captures information about OCR processing configuration and results.
 */
export interface OcrMetadata {
  /** OCR language code(s) used */
  readonly language?: string
  /** Tesseract Page Segmentation Mode (PSM) */
  readonly psm?: number
  /** Output format (e.g., "text", "hocr") */
  readonly outputFormat?: string
  /** Number of tables detected */
  readonly tableCount?: number
  /** NOTE(review): undocumented upstream — presumably a detected-table row count; confirm. */
  readonly tableRows?: number
  /** NOTE(review): undocumented upstream — presumably a detected-table column count; confirm. */
  readonly tableCols?: number
}

/**
 * Multi-backend OCR pipeline with quality-based fallback.
 *
 * Backends are tried in priority order (highest first). After each backend
 * produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
 * the result is accepted. Otherwise the next backend is tried.
 */
export interface OcrPipelineConfig {
  /** Ordered list of backends to try. Sorted by priority (descending) at runtime. */
  readonly stages: Array<OcrPipelineStage>
  /** Quality thresholds for deciding whether to accept a result or try the next backend. */
  readonly qualityThresholds: OcrQualityThresholds
}

/** A single backend stage in the OCR pipeline. */
export interface OcrPipelineStage {
  /** Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name. */
  readonly backend: string
  /** Priority weight (higher = tried first). Stages are sorted by priority descending. */
  readonly priority: number
  /** Language override for this stage (None = use parent OcrConfig.language). */
  readonly language?: string
  /** Tesseract-specific config override for this stage. */
  readonly tesseractConfig?: TesseractConfig
  /** PaddleOCR-specific config for this stage. */
  readonly paddleOcrConfig?: JsonValue
  /** VLM config override for this pipeline stage. */
  readonly vlmConfig?: LlmConfig
  /**
   * Arbitrary per-call options passed through to the backend unchanged.
   *
   * Backends that support runtime tuning (mode switching, preprocessing
   * flags, inference parameters, etc.) read this value and deserialize
   * the keys they care about. Keys unknown to the backend are silently
   * ignored, so options from different backends can coexist in the same
   * config without conflict.
   *
   * Example (custom backend):
   * ```json
   * { "mode": "fast", "enable_layout": true }
   * ```
   */
  readonly backendOptions?: JsonValue
}

/**
 * Quality thresholds for OCR fallback decisions and pipeline quality gating.
 *
 * All fields default to the values that match the previous hardcoded behavior,
 * so `OcrQualityThresholds::default()` preserves existing semantics exactly.
 */
export interface OcrQualityThresholds {
  /** Minimum total non-whitespace characters to consider text substantive. */
  readonly minTotalNonWhitespace?: number
  /** Minimum non-whitespace characters per page on average. */
  readonly minNonWhitespacePerPage?: number
  /** Minimum character count for a word to be "meaningful". */
  readonly minMeaningfulWordLen?: number
  /** Minimum count of meaningful words before text is accepted. */
  readonly minMeaningfulWords?: number
  /** Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric). */
  readonly minAlnumRatio?: number
  /** Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback. */
  readonly minGarbageChars?: number
  /** Maximum fraction of short (1-2 char) words before text is considered fragmented. */
  readonly maxFragmentedWordRatio?: number
  /**
   * Critical fragmentation threshold — triggers OCR regardless of meaningful words.
   * Normal English text has ~20-30% short words. 80%+ is definitive garbage.
   */
  readonly criticalFragmentedWordRatio?: number
  /** Minimum average word length. Below this with enough words indicates garbled extraction. */
  readonly minAvgWordLength?: number
  /** Minimum word count before average word length check applies. */
  readonly minWordsForAvgLengthCheck?: number
  /** Minimum consecutive word repetition ratio to detect column scrambling. */
  readonly minConsecutiveRepeatRatio?: number
  /** Minimum word count before consecutive repetition check is applied. */
  readonly minWordsForRepeatCheck?: number
  /** Minimum character count for "substantive markdown" OCR skip gate. */
  readonly substantiveMinChars?: number
  /** Minimum character count for "non-text content" OCR skip gate. */
  readonly nonTextMinChars?: number
  /** Alphanumeric+whitespace ratio threshold for skip decisions. */
  readonly alnumWsRatioThreshold?: number
  /**
   * Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
   * If the result from a backend scores below this, try the next backend.
   */
  readonly pipelineMinQuality?: number
}

/** Rotation information for an OCR element. */
export interface OcrRotation {
  /** Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR). */
  readonly angleDegrees: number
  /** Confidence score for the rotation detection. */
  readonly confidence?: number
}

/**
 * Table detected via OCR.
 *
 * Represents a table structure recognized during OCR processing.
 */
export interface OcrTable {
  /** Table cells as a 2D vector (rows × columns) */
  readonly cells: Array<Array<string>>
  /** Markdown representation of the table */
  readonly markdown: string
  /** Page number where the table was found (1-indexed) */
  readonly pageNumber: number
  /** Bounding box of the table in pixel coordinates (from OCR word positions). */
  readonly boundingBox?: OcrTableBoundingBox
}

/** Bounding box for an OCR-detected table in pixel coordinates. */
export interface OcrTableBoundingBox {
  /** Left x-coordinate (pixels) */
  readonly left: number
  /** Top y-coordinate (pixels) */
  readonly top: number
  /** Right x-coordinate (pixels) */
  readonly right: number
  /** Bottom y-coordinate (pixels) */
  readonly bottom: number
}

/** Document orientation detection result. */
export interface OrientationResult {
  /** Detected orientation in degrees (0, 90, 180, or 270). */
  readonly degrees: number
  /** Confidence score (0.0-1.0). */
  readonly confidence: number
}

/**
 * Output format for extraction results.
 *
 * Controls the format of the `content` field in `ExtractionResult`.
 * When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
 * `Plain` returns the raw extracted text.
 * `Structured` returns JSON with full OCR element data including bounding
 * boxes and confidence scores.
 */
export declare enum OutputFormat {
  /** Plain text content only (default) */
  Plain = "plain",
  /** Markdown format */
  Markdown = "markdown",
  /** Djot markup format */
  Djot = "djot",
  /** HTML format */
  Html = "html",
  /** JSON tree format with heading-driven sections. */
  Json = "json",
  /** Structured JSON format with full OCR element metadata. */
  Structured = "structured",
  /**
   * Custom renderer registered via the RendererRegistry.
   * The string is the renderer name (e.g., "docx", "latex").
   *
   * NOTE(review): this member carries only the fixed value "custom"; how a
   * renderer name is supplied through this binding is not visible here — confirm.
   */
  Custom = "custom",
}

/**
 * Supported languages in PaddleOCR.
 *
 * Maps user-friendly language codes to paddle-ocr-rs language identifiers.
 */
export declare enum PaddleLanguage {
  /** English */
  English = "English",
  /** Simplified Chinese */
  Chinese = "Chinese",
  /** Japanese */
  Japanese = "Japanese",
  /** Korean */
  Korean = "Korean",
  /** German */
  German = "German",
  /** French */
  French = "French",
  /** Latin script (covers most European languages) */
  Latin = "Latin",
  /** Cyrillic (Russian and related) */
  Cyrillic = "Cyrillic",
  /** Traditional Chinese */
  TraditionalChinese = "TraditionalChinese",
  /** Thai */
  Thai = "Thai",
  /** Greek */
  Greek = "Greek",
  /** East Slavic (Russian, Ukrainian, Belarusian) */
  EastSlavic = "EastSlavic",
  /** Arabic (Arabic, Persian, Urdu) */
  Arabic = "Arabic",
  /** Devanagari (Hindi, Marathi, Sanskrit, Nepali) */
  Devanagari = "Devanagari",
  /** Tamil */
  Tamil = "Tamil",
  /** Telugu */
  Telugu = "Telugu",
}

/**
 * Configuration for PaddleOCR backend.
 *
 * Configures PaddleOCR text detection and recognition with multi-language support.
 * Uses a builder pattern for convenient configuration.
 */
export interface PaddleOcrConfig {
  /** Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra") */
  readonly language?: string
  /** Optional custom cache directory for model files */
  readonly cacheDir?: string
  /**
   * Enable angle classification for rotated text (default: false).
   * Can misfire on short text regions, rotating crops incorrectly before recognition.
   */
  readonly useAngleCls?: boolean
  /** Enable table structure detection (default: false) */
  readonly enableTableDetection?: boolean
  /**
   * Database threshold for text detection (default: 0.3)
   * Range: 0.0-1.0, higher values require more confident detections
   */
  readonly detDbThresh?: number
  /**
   * Box threshold for text bounding box refinement (default: 0.5)
   * Range: 0.0-1.0
   */
  readonly detDbBoxThresh?: number
  /**
   * Unclip ratio for expanding text bounding boxes (default: 1.6)
   * Controls the expansion of detected text regions
   */
  readonly detDbUnclipRatio?: number
  /**
   * Maximum side length for detection image (default: 960)
   * Larger images may be resized to this limit for faster inference
   */
  readonly detLimitSideLen?: number
  /**
   * Batch size for recognition inference (default: 6)
   * Number of text regions to process simultaneously
   */
  readonly recBatchNum?: number
  /**
   * Padding in pixels added around the image before detection (default: 10).
   * Large values can include surrounding content like table gridlines.
   */
  readonly padding?: number
  /**
   * Minimum recognition confidence score for text lines (default: 0.5).
   * Text regions with recognition confidence below this threshold are discarded.
   * Matches PaddleOCR Python's `drop_score` parameter.
   * Range: 0.0-1.0
   */
  readonly dropScore?: number
  /**
   * Model tier controlling detection/recognition model size and accuracy trade-off.
   * - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
   * - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
   */
  readonly modelTier?: string
}

/**
 * Byte offset boundary for a page.
 *
 * Tracks where a specific page's content starts and ends in the main content string,
 * enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
 * at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
 */
export interface PageBoundary {
  /** Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive) */
  readonly byteStart: number
  /** Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive) */
  readonly byteEnd: number
  /** Page number (1-indexed) */
  readonly pageNumber: number
}

/**
 * Page extraction and tracking configuration.
 *
 * Controls how pages are extracted, tracked, and represented in the extraction results.
 * When `None`, page tracking is disabled.
 *
 * Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
 * when page boundaries are available and chunking is configured.
 */
export interface PageConfig {
  /** Extract pages as separate array (ExtractionResult.pages) */
  readonly extractPages?: boolean
  /** Insert page markers in main content string */
  readonly insertPageMarkers?: boolean
  /**
   * Page marker format (use {page_num} placeholder)
   * Default: "\n\n<!-- PAGE {page_num} -->\n\n"
   */
  readonly markerFormat?: string
}

/**
 * Content for a single page/slide.
 *
 * When page extraction is enabled, documents are split into per-page content
 * with associated tables and images mapped to each page.
 *
 * # Performance
 *
 * Uses Arc-wrapped tables and images for memory efficiency:
 * - `Vec<Arc<Table>>` enables zero-copy sharing of table data
 * - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
 * - Maintains exact JSON compatibility via custom Serialize/Deserialize
 *
 * This reduces memory overhead for documents with shared tables/images
 * by avoiding redundant copies during serialization.
 */
export interface PageContent {
  /** Page number (1-indexed) */
  readonly pageNumber: number
  /** Text content for this page */
  readonly content: string
  /**
   * Tables found on this page (uses Arc for memory efficiency)
   *
   * Serializes as Vec<Table> for JSON compatibility while maintaining
   * Arc semantics in-memory for zero-copy sharing.
   */
  readonly tables: Array<Table>
  /**
   * Indices into `ExtractionResult.images` for images found on this page.
   *
   * Each value is a zero-based index into the top-level `images` collection.
   * Only populated when `extract_images = true` in the extraction config.
   */
  readonly imageIndices: Array<number>
  /**
   * Hierarchy information for the page (when hierarchy extraction is enabled)
   *
   * Contains text hierarchy levels (H1-H6) extracted from the page content.
   */
  readonly hierarchy?: PageHierarchy
  /**
   * Whether this page is blank (no meaningful text content)
   *
   * Determined during extraction based on text content analysis.
   * A page is blank if it has fewer than 3 non-whitespace characters
   * and contains no tables or images.
   */
  readonly isBlank?: boolean
  /**
   * Layout detection regions for this page (when layout detection is enabled).
   *
   * Contains detected layout regions with class, confidence, bounding box,
   * and area fraction. Only populated when layout detection is configured.
   */
  readonly layoutRegions?: Array<LayoutRegion>
  /**
   * Speaker notes for this slide (PPTX only).
   *
   * Contains the text from the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
   * Only populated when the source is a PPTX file and notes are present.
   */
  readonly speakerNotes?: string
  /**
   * Section name this slide belongs to (PPTX only).
   *
   * PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
   * `ppt/presentation.xml`). Only populated when the source is a PPTX file and
   * the slide belongs to a named section.
   */
  readonly sectionName?: string
  /**
   * Sheet name for this page (XLSX/ODS only).
   *
   * Each spreadsheet sheet maps to one `PageContent` entry. This field carries the
   * sheet's display name as it appears in the workbook. `None` for all non-spreadsheet
   * formats and for sheets with an empty name.
   */
  readonly sheetName?: string
}

/**
 * Page hierarchy structure containing heading levels and block information.
 *
 * Used when PDF text hierarchy extraction is enabled. Contains hierarchical
 * blocks with heading levels (H1-H6) for semantic document structure.
 */
export interface PageHierarchy {
  /** Number of hierarchy blocks on this page */
  readonly blockCount: number
  /** Hierarchical blocks with heading levels */
  readonly blocks: Array<HierarchicalBlock>
}

/**
 * Metadata for individual page/slide/sheet.
 *
 * Captures per-page information including dimensions, content counts,
 * and visibility state (for presentations).
 */
export interface PageInfo {
  /** Page number (1-indexed) */
  readonly number: number
  /** Page title (usually for presentations) */
  readonly title?: string
  /** Dimensions in points (PDF) or pixels (images): (width, height) */
  readonly dimensions?: Array<number>
  /** Number of images on this page */
  readonly imageCount?: number
  /** Number of tables on this page */
  readonly tableCount?: number
  /** Whether this page is hidden (e.g., in presentations) */
  readonly hidden?: boolean
  /**
   * Whether this page is blank (no meaningful text, no images, no tables)
   *
   * A page is considered blank if it has fewer than 3 non-whitespace characters
   * and contains no tables or images. This is useful for filtering out empty pages
   * in scanned documents or PDFs with blank separator pages.
   */
  readonly isBlank?: boolean
  /**
   * Whether this page contains non-trivial vector graphics (paths, shapes, curves)
   *
   * Indicates the presence of vector-drawn content such as charts, diagrams,
   * or geometric shapes (e.g., from Adobe InDesign, LaTeX TikZ). These are
   * invisible to `ExtractionResult.images` since they are not embedded as raster
   * XObjects. Set to `true` when path count exceeds a heuristic threshold,
   * signaling that downstream consumers may want to rasterize the page to
   * capture this content.
   *
   * Only populated for PDFs; `None` for other document types.
   *
   * NOTE(review): declared as a required `boolean` although the description says
   * it is `None` for non-PDF documents — confirm what value non-PDF pages carry.
   */
  readonly hasVectorGraphics: boolean
}

/**
 * Unified page structure for documents.
 *
 * Supports different page types (PDF pages, PPTX slides, Excel sheets)
 * with character offset boundaries for chunk-to-page mapping.
 */
export interface PageStructure {
  /** Total number of pages/slides/sheets */
  readonly totalCount: number
  /** Type of paginated unit */
  readonly unitType: PageUnitType
  /**
   * Character offset boundaries for each page
   *
   * Maps character ranges in the extracted content to page numbers.
   * Used for chunk page range calculation.
   *
   * NOTE(review): `PageBoundary` names its fields `byteStart`/`byteEnd` and
   * documents them as byte offsets, not character offsets — confirm the unit.
   */
  readonly boundaries?: Array<PageBoundary>
  /** Detailed per-page metadata (optional, only when needed) */
  readonly pages?: Array<PageInfo>
}

/**
 * Kind of paginated unit in a document.
 *
 * Tells apart the different notions of "page": PDF pages, presentation
 * slides, and spreadsheet sheets.
 */
export declare enum PageUnitType {
  /** Standard document pages (PDF, DOCX, images). */
  Page = "page",
  /** Presentation slides (PPTX, ODP). */
  Slide = "slide",
  /** Spreadsheet sheets (XLSX, ODS). */
  Sheet = "sheet",
}
|
||
|
||
/** A single annotation extracted from a PDF document page. */
export interface PdfAnnotation {
  /** The type of annotation. */
  readonly annotationType: PdfAnnotationType;
  /** Text content of the annotation (e.g., comment text, link URL). */
  readonly content?: string;
  /** Page number where the annotation appears (1-indexed). */
  readonly pageNumber: number;
  /** Bounding box of the annotation on the page. */
  readonly boundingBox?: BoundingBox;
}
|
||
|
||
/** Classification of a PDF annotation. */
export declare enum PdfAnnotationType {
  /** Sticky note / text annotation. */
  Text = "text",
  /** Highlighted text region. */
  Highlight = "highlight",
  /** Hyperlink annotation. */
  Link = "link",
  /** Rubber stamp annotation. */
  Stamp = "stamp",
  /** Underline text markup. */
  Underline = "underline",
  /** Strikeout text markup. */
  StrikeOut = "strike_out",
  /** Any other annotation type. */
  Other = "other",
}
|
||
|
||
/** Configuration options that apply only to PDF extraction. */
export interface PdfConfig {
  /** Extract images from PDF. */
  readonly extractImages?: boolean;
  /**
   * Extract tables from PDF.
   *
   * When `true` (default), runs pdf_oxide's native grid detector and, if it
   * finds nothing, falls back to the heuristic text-layer reconstruction in
   * `pdf::oxide::table::extract_tables_heuristic`. Set to `false` to skip
   * both passes — `tables` will then be empty in the result.
   */
  readonly extractTables?: boolean;
  /** List of passwords to try when opening encrypted PDFs. */
  readonly passwords?: Array<string>;
  /** Extract PDF metadata. */
  readonly extractMetadata?: boolean;
  /** Hierarchy extraction configuration (None = hierarchy extraction disabled). */
  readonly hierarchy?: HierarchyConfig;
  /**
   * Extract PDF annotations (text notes, highlights, links, stamps).
   * Default: false
   */
  readonly extractAnnotations?: boolean;
  /**
   * Top margin fraction (0.0–1.0) of page height to exclude headers/running heads.
   * Default: 0.06 (6%)
   */
  readonly topMarginFraction?: number;
  /**
   * Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
   * Default: 0.05 (5%)
   */
  readonly bottomMarginFraction?: number;
  /**
   * Allow single-column pseudo tables in extraction results.
   *
   * By default, tables with fewer than 2 columns (layout-guided) or 3 columns
   * (heuristic) are rejected. When `true`, the minimum column count is relaxed
   * to 1, so single-column structured data (glossaries, itemized lists) can be
   * emitted as tables. Other quality filters (density, sparsity, prose
   * detection) still apply.
   */
  readonly allowSingleColumnTables?: boolean;
  /**
   * Perform OCR on inline images extracted from PDF pages and attach the
   * recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
   * to be available; if `ExtractionConfig.ocr` is `None` the extractor
   * falls back to `TesseractConfig::default()`. Per-image failures degrade
   * gracefully (the image is returned without OCR text rather than failing
   * the whole extraction). Default: `false`.
   */
  readonly ocrInlineImages?: boolean;
}
|
||
|
||
/**
 * Metadata that exists only for PDF documents.
 *
 * Holds the PDF-specific fields that are not part of the common `Metadata`
 * structure. Common fields such as title, authors, keywords, and dates live
 * at the `Metadata` level.
 */
export interface PdfMetadata {
  /** PDF version (e.g., "1.7", "2.0"). */
  readonly pdfVersion?: string;
  /** PDF producer (application that created the PDF). */
  readonly producer?: string;
  /** Whether the PDF is encrypted/password-protected. */
  readonly isEncrypted?: boolean;
  /** First page width in points (1/72 inch). */
  readonly width?: number;
  /** First page height in points (1/72 inch). */
  readonly height?: number;
  /** Total number of pages in the PDF document. */
  readonly pageCount?: number;
}
|
||
|
||
/**
 * Base trait that all plugins must implement.
 *
 * Provides the common functionality for plugin lifecycle management,
 * identification, and metadata.
 *
 * # Thread Safety
 *
 * All plugins must be `Send + Sync` to support concurrent usage across threads.
 */
export interface Plugin {
  /**
   * Returns the unique name/identifier for this plugin.
   *
   * The name should be:
   * - Unique across all plugins
   * - Lowercase with hyphens (e.g., "my-custom-plugin")
   * - URL-safe characters only
   */
  name(): string;
  /**
   * Returns the semantic version of this plugin.
   *
   * Should follow semver format: `MAJOR.MINOR.PATCH`
   */
  version?(): string;
  /**
   * Initialize the plugin.
   *
   * Called once when the plugin is registered. Use this to:
   * - Load configuration
   * - Initialize resources (connections, caches, etc.)
   * - Validate dependencies
   *
   * # Thread Safety
   *
   * This method takes `&self` instead of `&mut self` to work with `Arc<dyn Plugin>`.
   * Plugins needing mutable state during initialization should use interior mutability
   * patterns (Mutex, RwLock, OnceCell, etc.).
   * @throws Should return an error if initialization fails. The plugin will not be
   * registered if this method returns an error.
   */
  initialize?(): void;
  /**
   * Shutdown the plugin.
   *
   * Called when the plugin is being unregistered or the application is shutting down.
   * Use this to:
   * - Close connections
   * - Flush caches
   * - Release resources
   *
   * # Thread Safety
   *
   * This method takes `&self` instead of `&mut self` to work with `Arc<dyn Plugin>`.
   * Plugins needing mutable state during shutdown should use interior mutability
   * patterns (Mutex, RwLock, etc.).
   * @throws Errors during shutdown are logged but don't prevent the shutdown process.
   */
  shutdown?(): void;
  /**
   * Optional plugin description for debugging and logging.
   *
   * Defaults to empty string if not overridden.
   */
  description?(): string;
  /**
   * Optional plugin author information.
   *
   * Defaults to empty string if not overridden.
   */
  author?(): string;
}
|
||
|
||
/**
 * Trait for post-processor plugins.
 *
 * Post-processors transform or enrich extraction results once the initial
 * extraction is complete. They can:
 * - Clean and normalize text
 * - Add metadata (language, keywords, entities)
 * - Split content into chunks
 * - Score quality
 * - Apply custom transformations
 *
 * # Processing Order
 *
 * Post-processors are executed in stage order:
 * 1. **Early** - Language detection, entity extraction
 * 2. **Middle** - Keyword extraction, token reduction
 * 3. **Late** - Custom hooks, final validation
 *
 * Within each stage, processors are executed in registration order.
 *
 * # Error Handling
 *
 * Post-processor errors are non-fatal by default - they're captured in metadata
 * and execution continues. To make errors fatal, return an error from `process()`.
 *
 * # Thread Safety
 *
 * Post-processors must be thread-safe (`Send + Sync`).
 */
export interface PostProcessor {
  name(): string;
  /**
   * Process an extraction result.
   *
   * Transform or enrich the extraction result. Can modify:
   * - `content` - The extracted text
   * - `metadata` - Add or update metadata fields
   * - `tables` - Modify or enhance table data
   * @param result - Mutable reference to the extraction result to process
   *
   * @param config - Extraction configuration
   *
   * @returns `Ok(())` if processing succeeded, `Err(...)` for fatal failures.
   *
   * @throws Return errors for fatal processing failures. Non-fatal errors should be
   * captured in metadata directly on the result.
   *
   * # Performance
   *
   * This signature avoids unnecessary cloning of large extraction results by
   * taking a mutable reference instead of ownership. Processors modify the
   * result in place.
   *
   * # Example - Language Detection
   *
   * ```rust
   * async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
   * -> Result<()> {
   * // Detect language (simplified - use real detection library in practice)
   * let language = "en"; // Placeholder detection
   *
   * // Add to metadata
   * result.metadata.additional.insert("detected_language".to_string().into(), serde_json::json!(language));
   *
   * Ok(())
   * }
   * ```
   *
   * # Example - Text Cleaning
   *
   * ```rust
   * async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
   * -> Result<()> {
   * // Remove excessive whitespace
   * result.content = result
   * .content
   * .split_whitespace()
   * .collect::<Vec<_>>()
   * .join(" ");
   *
   * Ok(())
   * }
   * ```
   */
  process(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): Promise<void>;
  /**
   * Get the processing stage for this post-processor.
   *
   * Determines when this processor runs in the pipeline.
   * @returns The `ProcessingStage` (Early, Middle, or Late).
   *
   * NOTE(review): declared as `string` rather than `ProcessingStage` — confirm
   * which exact string values the native bridge accepts.
   */
  processingStage(): string;
  /**
   * Optional: Check if this processor should run for a given result.
   *
   * Allows conditional processing based on MIME type, metadata, or content.
   * Defaults to `true` (always run).
   * @param result - The extraction result to check
   *
   * @param config - Extraction configuration
   *
   * @returns `true` if the processor should run, `false` to skip.
   *
   * NOTE(review): the declared return type is `string` although the contract
   * describes a boolean — confirm the expected encoding before relying on it.
   */
  shouldProcess?(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): string;
  /**
   * Optional: Estimate processing time in milliseconds.
   *
   * Used for logging and debugging. Defaults to 0 (unknown).
   * @param result - The extraction result to estimate for
   *
   * @returns Estimated processing time in milliseconds.
   *
   * NOTE(review): the declared return type is `string` although the contract
   * describes a millisecond count — confirm the expected encoding.
   */
  estimatedDurationMs?(result?: ExtractionResult | undefined | null): string;
  /**
   * Execution priority within the processing stage.
   *
   * Higher values run first within the same `ProcessingStage`. Defaults to 50.
   * Use 0-49 for fallback processors, 50 for normal processors, and 51-255
   * for high-priority processors that should run early in their stage.
   *
   * NOTE(review): the declared return type is `string` although the contract
   * describes a 0-255 number — confirm the expected encoding.
   */
  priority?(): string;
}
|
||
|
||
/** Settings that control which post-processors run. */
export interface PostProcessorConfig {
  /** Enable post-processors. */
  readonly enabled?: boolean;
  /** Whitelist of processor names to run (None = all enabled). */
  readonly enabledProcessors?: Array<string>;
  /** Blacklist of processor names to skip (None = none disabled). */
  readonly disabledProcessors?: Array<string>;
  /** Pre-computed AHashSet for O(1) enabled processor lookup. */
  readonly enabledSet?: Array<string>;
  /** Pre-computed AHashSet for O(1) disabled processor lookup. */
  readonly disabledSet?: Array<string>;
}
|
||
|
||
/**
 * Application properties read from docProps/app.xml of a PPTX file.
 *
 * Holds PowerPoint-specific document metadata.
 */
export interface PptxAppProperties {
  /** Application name (e.g., "Microsoft Office PowerPoint"). */
  readonly application?: string;
  /** Application version. */
  readonly appVersion?: string;
  /** Total editing time in minutes. */
  readonly totalTime?: number;
  /** Company name. */
  readonly company?: string;
  /** Document security level. */
  readonly docSecurity?: number;
  /** Scale crop flag. */
  readonly scaleCrop?: boolean;
  /** Links up to date flag. */
  readonly linksUpToDate?: boolean;
  /** Shared document flag. */
  readonly sharedDoc?: boolean;
  /** Hyperlinks changed flag. */
  readonly hyperlinksChanged?: boolean;
  /** Number of slides. */
  readonly slides?: number;
  /** Number of notes. */
  readonly notes?: number;
  /** Number of hidden slides. */
  readonly hiddenSlides?: number;
  /** Number of multimedia clips. */
  readonly multimediaClips?: number;
  /** Presentation format (e.g., "Widescreen", "Standard"). */
  readonly presentationFormat?: string;
  /** Slide titles. */
  readonly slideTitles?: Array<string>;
}
|
||
|
||
/**
 * Result of extracting a PowerPoint (PPTX) file.
 *
 * Bundles the extracted slide content, metadata, and embedded images/tables.
 */
export interface PptxExtractionResult {
  /** Extracted text content from all slides. */
  readonly content: string;
  /** Presentation metadata. */
  readonly metadata: PptxMetadata;
  /** Total number of slides. */
  readonly slideCount: number;
  /** Total number of embedded images. */
  readonly imageCount: number;
  /** Total number of tables. */
  readonly tableCount: number;
  /** Extracted images from the presentation. */
  readonly images: Array<ExtractedImage>;
  /** Slide structure with boundaries (when page tracking is enabled). */
  readonly pageStructure?: PageStructure;
  /** Per-slide content (when page tracking is enabled). */
  readonly pageContents?: Array<PageContent>;
  /** Structured document representation. */
  readonly document?: DocumentStructure;
  /**
   * Hyperlinks discovered in slides as (url, optional_label) pairs.
   *
   * NOTE(review): the declared element type is a plain `string`, not a pair —
   * confirm how url and label are encoded in each entry.
   */
  readonly hyperlinks: Array<string>;
  /**
   * Office metadata extracted from docProps/core.xml and docProps/app.xml.
   *
   * Contains keys like "title", "author", "created_by", "subject", "keywords",
   * "modified_by", "created_at", "modified_at", etc.
   */
  readonly officeMetadata: Record<string, string>;
  /**
   * Slide comments as revisions.
   *
   * Each `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
   * `DocumentRevision { kind: Comment }` with author (resolved from
   * `ppt/commentAuthors.xml`), ISO-8601 timestamp, and
   * `RevisionAnchor::Slide { index }`. `None` when no comment XML parts exist.
   */
  readonly revisions?: Array<DocumentRevision>;
}
|
||
|
||
/**
 * Metadata describing a PowerPoint presentation.
 *
 * Extracted from PPTX files; contains slide counts and presentation details.
 */
export interface PptxMetadata {
  /** Total number of slides in the presentation. */
  readonly slideCount?: number;
  /** Names of slides (if available). */
  readonly slideNames?: Array<string>;
  /** Number of embedded images. */
  readonly imageCount?: number;
  /** Number of tables. */
  readonly tableCount?: number;
}
|
||
|
||
/**
 * Stages in which post-processors run.
 *
 * Post-processors are executed in stage order (Early → Middle → Late).
 * Pick a stage to control where an operation sits in the post-processing order.
 */
export declare enum ProcessingStage {
  /**
   * Early stage - foundational processing.
   *
   * Use for:
   * - Language detection
   * - Character encoding normalization
   * - Entity extraction (NER)
   * - Text quality scoring
   */
  Early = "Early",
  /**
   * Middle stage - content transformation.
   *
   * Use for:
   * - Keyword extraction
   * - Token reduction
   * - Text summarization
   * - Semantic analysis
   */
  Middle = "Middle",
  /**
   * Late stage - final enrichment.
   *
   * Use for:
   * - Custom user hooks
   * - Analytics/logging
   * - Final validation
   * - Output formatting
   */
  Late = "Late",
}
|
||
|
||
/**
 * Non-fatal warning emitted by a processing pipeline stage.
 *
 * Records errors from optional features that do not prevent extraction
 * but may indicate degraded results.
 */
export interface ProcessingWarning {
  /**
   * The pipeline stage or feature that produced this warning
   * (e.g., "embedding", "chunking", "language_detection", "output_format").
   */
  readonly source: string;
  /** Human-readable description of what went wrong. */
  readonly message: string;
}
|
||
|
||
/** Tesseract OCR Page Segmentation Mode. */
export declare enum PSMMode {
  OsdOnly = "OsdOnly",
  AutoOsd = "AutoOsd",
  AutoOnly = "AutoOnly",
  Auto = "Auto",
  SingleColumn = "SingleColumn",
  SingleBlockVertical = "SingleBlockVertical",
  SingleBlock = "SingleBlock",
  SingleLine = "SingleLine",
  SingleWord = "SingleWord",
  CircleWord = "CircleWord",
  SingleChar = "SingleChar",
}
|
||
|
||
/** Metadata for an Outlook PST archive. */
export interface PstMetadata {
  // NOTE(review): presumably the number of messages in the archive — not documented in the source; confirm.
  readonly messageCount?: number;
}
|
||
|
||
/** Parameters specific to RAKE. */
export interface RakeParams {
  /** Minimum word length to consider (default: 1). */
  readonly minWordLength?: number;
  /** Maximum words in a keyword phrase (default: 3). */
  readonly maxWordsPerPhrase?: number;
}
|
||
|
||
/**
 * Pre-computed table markdown for a table detection region.
 *
 * Produced by the TATR-based table structure recognizer and surfaced as part of
 * layout-aware OCR results. The struct lives under `layout-types` (pure-Rust)
 * so that consumers who do not enable `layout-detection` (ORT) can still
 * reference the type in their own code.
 */
export interface RecognizedTable {
  /** Detection bbox that this table corresponds to (for matching). */
  readonly detectionBbox: BBox;
  /** Table cells as a 2D vector (rows × columns). */
  readonly cells: Array<Array<string>>;
  /** Rendered markdown table. */
  readonly markdown: string;
}
|
||
|
||
/**
 * Named reduction levels, from `Off` through `Maximum`.
 *
 * NOTE(review): presumably selects how aggressive token reduction is — the
 * source carries no documentation for this enum; confirm against its consumer.
 */
export declare enum ReductionLevel {
  Off = "Off",
  Light = "Light",
  Moderate = "Moderate",
  Aggressive = "Aggressive",
  Maximum = "Maximum",
}
|
||
|
||
/** Semantic category of a relationship between two document elements. */
export declare enum RelationshipKind {
  /** Footnote marker -> footnote definition. */
  FootnoteReference = "footnote_reference",
  /** Citation marker -> bibliography entry. */
  CitationReference = "citation_reference",
  /** Internal anchor link (`#id`) -> target heading/element. */
  InternalLink = "internal_link",
  /** Caption paragraph -> figure/table it describes. */
  Caption = "caption",
  /** Label -> labeled element (HTML `<label for>`, LaTeX `\label{}`). */
  Label = "label",
  /** TOC entry -> target section. */
  TocEntry = "toc_entry",
  /** Cross-reference (LaTeX `\ref{}`, DOCX cross-reference field). */
  CrossReference = "cross_reference",
}
|
||
|
||
/**
 * Trait for document renderers that convert [`InternalDocument`] to output strings.
 *
 * Renderers are typically stateless converters that turn the internal
 * document representation into a specific output format (Markdown, HTML,
 * Djot, plain text, etc.). They take part in the standard [`Plugin`]
 * lifecycle so custom renderers can be registered from any supported binding
 * language.
 *
 * The format name is exposed via [`Plugin::name`]. For stateless renderers
 * the [`Plugin`] lifecycle methods (`version`, `initialize`, `shutdown`) all
 * take no-op defaults and need not be overridden.
 *
 * # Thread Safety
 *
 * Renderers must be `Send + Sync` (inherited from [`Plugin`]).
 */
export interface Renderer {
  name(): string;
  /**
   * Render an [`InternalDocument`] to the output format.
   * @param doc - The internal document to render
   *
   * @returns The rendered output as a string.
   *
   * @throws Returns an error if rendering fails.
   */
  render(doc: InternalDocument): string;
}
|
||
|
||
/**
 * Selects the shape of an extraction result.
 *
 * Distinct from `OutputFormat` (which controls rendering — Plain, Markdown,
 * HTML, etc.). `ResultFormat` controls the *shape* of the result: a unified
 * content blob vs. an element-based decomposition.
 */
export declare enum ResultFormat {
  /** Unified format with all content in `content` field. */
  Unified = "unified",
  /** Element-based format with semantic element extraction. */
  ElementBased = "element_based",
}
|
||
|
||
/** Best-effort location of a revision within the document, discriminated by `type`. */
export type RevisionAnchor =
  | { type: 'paragraph'; index: number }
  | { type: 'table_cell'; row: number; col: number; tableIndex: number }
  | { type: 'page'; index: number }
  | { type: 'slide'; index: number }
  | { type: 'sheet'; index: number; name: string };
|
||
|
||
/**
 * Content changes belonging to a single revision.
 *
 * For insertions and deletions the `content` field carries the added/removed
 * lines as `DiffLine::Added` / `DiffLine::Removed` entries. For format
 * changes, `content` is empty — the property diff is left as a TODO for a
 * later enrichment pass.
 */
export interface RevisionDelta {
  /** Line-level content changes for this revision. */
  readonly content?: Array<DiffLine>;
  /** Cell-level table changes for this revision. */
  readonly tableChanges?: Array<CellChange>;
}
|
||
|
||
/** Semantic category of a tracked change. */
export declare enum RevisionKind {
  /** Text or content was inserted. */
  Insertion = "insertion",
  /** Text or content was deleted. */
  Deletion = "deletion",
  /** Run-level formatting (font, size, colour, …) was changed. */
  FormatChange = "format_change",
  /** A reviewer comment or annotation. */
  Comment = "comment",
}
|
||
|
||
/**
 * Security limits applied across extractors.
 *
 * All limits are intentionally conservative to prevent DoS attacks
 * while still supporting legitimate documents.
 */
export interface SecurityLimits {
  /** Maximum uncompressed size for archives (500 MB). */
  readonly maxArchiveSize?: number;
  /** Maximum compression ratio before flagging as potential bomb (100:1). */
  readonly maxCompressionRatio?: number;
  /** Maximum number of files in archive (10,000). */
  readonly maxFilesInArchive?: number;
  /** Maximum nesting depth for structures (100). */
  readonly maxNestingDepth?: number;
  /**
   * Maximum length of any single XML entity / attribute / token (1 MiB).
   * This is a per-token cap, NOT a total cap — billion-laughs class
   * attacks where a single entity expands to hundreds of MB are caught
   * here, while normal long text content (a paragraph, a CDATA block) is
   * caught by `max_content_size` instead.
   */
  readonly maxEntityLength?: number;
  /** Maximum string growth per document (100 MB). */
  readonly maxContentSize?: number;
  /** Maximum iterations per operation. */
  readonly maxIterations?: number;
  /** Maximum XML depth (100 levels). */
  readonly maxXmlDepth?: number;
  /** Maximum cells per table (100,000). */
  readonly maxTableCells?: number;
}
|
||
|
||
/**
 * API server configuration.
 *
 * Holds all configuration options for the Kreuzberg API server,
 * including host/port settings, CORS configuration, and upload limits.
 *
 * # Defaults
 *
 * - `host`: "127.0.0.1" (localhost only)
 * - `port`: 8000
 * - `cors_origins`: empty vector (allows all origins)
 * - `max_request_body_bytes`: 104_857_600 (100 MB)
 * - `max_multipart_field_bytes`: 104_857_600 (100 MB)
 */
export interface ServerConfig {
  /** Server host address (e.g., "127.0.0.1", "0.0.0.0"). */
  readonly host?: string;
  /** Server port number. */
  readonly port?: number;
  /**
   * CORS allowed origins. Empty vector means allow all origins.
   *
   * If this is an empty vector, the server will accept requests from any origin.
   * If populated with specific origins (e.g., `"https://example.com"`), only
   * those origins will be allowed.
   */
  readonly corsOrigins?: Array<string>;
  /** Maximum size of request body in bytes (default: 100 MB). */
  readonly maxRequestBodyBytes?: number;
  /** Maximum size of multipart fields in bytes (default: 100 MB). */
  readonly maxMultipartFieldBytes?: number;
}
|
||
|
||
/** A block of structured data (Schema.org, microdata, RDFa). */
export interface StructuredData {
  /** Type of structured data. */
  readonly dataType: StructuredDataType;
  /** Raw JSON string representation. */
  readonly rawJson: string;
  /** Schema type if detectable (e.g., "Article", "Event", "Product"). */
  readonly schemaType?: string;
}
|
||
|
||
/**
 * Result shape carrying text content, a format label, string-valued
 * metadata, and a list of text fields.
 *
 * NOTE(review): the source documents none of these fields — confirm their
 * exact semantics against the producing extractor.
 */
export interface StructuredDataResult {
  readonly content: string;
  readonly format: string;
  readonly metadata: Record<string, string>;
  readonly textFields: Array<string>;
}
|
||
|
||
/** Classification of a structured data block. */
export declare enum StructuredDataType {
  /** JSON-LD structured data. */
  JsonLd = "json-ld",
  /** Microdata. */
  Microdata = "microdata",
  /** RDFa. */
  RDFa = "rdfa",
}
|
||
|
||
/**
 * Configuration for LLM-based structured data extraction.
 *
 * Sends extracted document content to a VLM with a JSON schema,
 * returning structured data that conforms to the schema.
 * @example
 * ```toml
 * [structured_extraction]
 * schema_name = "invoice_data"
 * strict = true
 *
 * [structured_extraction.schema]
 * type = "object"
 * properties.vendor = { type = "string" }
 * properties.total = { type = "number" }
 * required = ["vendor", "total"]
 *
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * ```
 */
export interface StructuredExtractionConfig {
  /** JSON Schema defining the desired output structure. */
  readonly schema: JsonValue;
  /** Schema name passed to the LLM's structured output mode. */
  readonly schemaName: string;
  /** Optional schema description for the LLM. */
  readonly schemaDescription?: string;
  /** Enable strict mode — output must exactly match the schema. */
  readonly strict: boolean;
  /**
   * Custom Jinja2 extraction prompt template. When `None`, a default template is used.
   *
   * Available template variables:
   * - `{{ content }}` — The extracted document text.
   * - `{{ schema }}` — The JSON schema as a formatted string.
   * - `{{ schema_name }}` — The schema name.
   * - `{{ schema_description }}` — The schema description (may be empty).
   */
  readonly prompt?: string;
  /** LLM configuration for the extraction. */
  readonly llm: LlmConfig;
}
|
||
|
||
/**
 * One entry in the list of supported document formats.
 *
 * Pairs a file extension with the corresponding MIME type that Kreuzberg can process.
 */
export interface SupportedFormat {
  /** File extension (without leading dot), e.g., "pdf", "docx". */
  readonly extension: string;
  /** MIME type string, e.g., "application/pdf". */
  readonly mimeType: string;
}
|
||
|
||
/**
 * Extracted table structure.
 *
 * Describes a table detected and extracted from a document (PDF, image, etc.).
 * Tables are converted to both structured cell data and Markdown format.
 */
export interface Table {
  /** Table cells as a 2D vector (rows × columns). */
  readonly cells?: Array<Array<string>>;
  /** Markdown representation of the table. */
  readonly markdown?: string;
  /** Page number where the table was found (1-indexed). */
  readonly pageNumber?: number;
  /**
   * Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
   * Only populated for PDF-extracted tables when position data is available.
   */
  readonly boundingBox?: BoundingBox;
}
|
||
|
||
/**
 * A single table cell with its content and optional styling.
 *
 * Future extension point for rich table support with cell-level metadata.
 */
export interface TableCell {
  /** Cell content as text. */
  readonly content?: string;
  /** Row span (number of rows this cell spans). */
  readonly rowSpan?: number;
  /** Column span (number of columns this cell spans). */
  readonly colSpan?: number;
  /** Whether this is a header cell. */
  readonly isHeader?: boolean;
}
|
||
|
||
/** Cell-level differences between two tables that share the same index. */
export interface TableDiff {
  /** Zero-based index of the table in both `a.tables` and `b.tables`. */
  readonly fromIndex: number;
  /** Zero-based index in `b.tables` (equal to `from_index` for same-dimension tables). */
  readonly toIndex: number;
  /** Cell-level changes within the table. */
  readonly cellChanges: Array<CellChange>;
}
|
||
|
||
/**
 * Structured table grid with cell-level metadata.
 *
 * Holds the row/column dimensions together with a flat list of cells that
 * carry position info.
 */
export interface TableGrid {
  /** Number of rows in the table. */
  readonly rows?: number;
  /** Number of columns in the table. */
  readonly cols?: number;
  /** All cells in row-major order. */
  readonly cells?: Array<GridCell>;
}
|
||
|
||
/**
 * Selects the table structure recognition model.
 *
 * Controls the model used for table cell detection within layout-detected
 * table regions. Wire format is snake_case in all serializers (JSON, TOML,
 * YAML).
 */
export declare enum TableModel {
  /** TATR (Table Transformer) -- default, 30MB, DETR-based row/column detection. */
  Tatr = "tatr",
  /** SLANeXT wired variant -- 365MB, optimized for bordered tables. */
  SlanetWired = "slanet_wired",
  /** SLANeXT wireless variant -- 365MB, optimized for borderless tables. */
  SlanetWireless = "slanet_wireless",
  /** SLANet-plus -- 7.78MB, lightweight general-purpose. */
  SlanetPlus = "slanet_plus",
  /**
   * Classifier-routed SLANeXT: auto-select wired/wireless per table.
   * Uses PP-LCNet classifier (6.78MB) + both SLANeXT variants (730MB total).
   */
  SlanetAuto = "slanet_auto",
  /** Disable table structure model inference entirely; use heuristic path only. */
  Disabled = "disabled",
}
|
||
|
||
/**
 * Tesseract OCR configuration.
 *
 * Provides fine-grained control over Tesseract OCR engine parameters.
 * Most users can rely on the defaults; these settings allow optimization
 * for specific document types (invoices, handwriting, etc.).
 */
export interface TesseractConfig {
  /** Language code (e.g., "eng", "deu", "fra") */
  readonly language?: string
  /**
   * Page Segmentation Mode (0-13).
   *
   * Common values:
   * - 3: Fully automatic page segmentation (native default)
   * - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
   * - 11: Sparse text with no particular order
   */
  readonly psm?: number
  /** Output format ("text" or "markdown") */
  readonly outputFormat?: string
  /**
   * OCR Engine Mode (0-3).
   *
   * - 0: Legacy engine only
   * - 1: Neural nets (LSTM) only (usually best)
   * - 2: Legacy + LSTM
   * - 3: Default (based on what's available)
   */
  readonly oem?: number
  /**
   * Minimum confidence threshold (0.0-100.0).
   *
   * Words with confidence below this threshold may be rejected or flagged.
   */
  readonly minConfidence?: number
  /**
   * Image preprocessing configuration.
   *
   * Controls how images are preprocessed before OCR. Can significantly
   * improve quality for scanned documents or low-quality images.
   */
  readonly preprocessing?: ImagePreprocessingConfig
  /** Enable automatic table detection and reconstruction */
  readonly enableTableDetection?: boolean
  /** Minimum confidence threshold for table detection (0.0-1.0) */
  readonly tableMinConfidence?: number
  /** Column threshold for table detection (pixels) */
  readonly tableColumnThreshold?: number
  /** Row threshold ratio for table detection (0.0-1.0) */
  readonly tableRowThresholdRatio?: number
  /** Enable OCR result caching */
  readonly useCache?: boolean
  /** Use pre-adapted templates for character classification */
  readonly classifyUsePreAdaptedTemplates?: boolean
  /** Enable N-gram language model */
  readonly languageModelNgramOn?: boolean
  /** Don't reject good words during block-level processing */
  readonly tesseditDontBlkrejGoodWds?: boolean
  /** Don't reject good words during row-level processing */
  readonly tesseditDontRowrejGoodWds?: boolean
  /** Enable dictionary correction */
  readonly tesseditEnableDictCorrection?: boolean
  /** Whitelist of allowed characters (empty = all allowed) */
  readonly tesseditCharWhitelist?: string
  /** Blacklist of forbidden characters (empty = none forbidden) */
  readonly tesseditCharBlacklist?: string
  /** Use primary language params model */
  readonly tesseditUsePrimaryParamsModel?: boolean
  /** Variable-width space detection */
  readonly textordSpaceSizeIsVariable?: boolean
  /** Use adaptive thresholding method */
  readonly thresholdingMethod?: boolean
}

/**
 * Inline text annotation — byte-range based formatting and links.
 *
 * Annotations reference byte offsets into the node's text content, which
 * identifies the formatted region as the half-open range `[start, end)`.
 */
export interface TextAnnotation {
  /** Start byte offset in the node's text content (inclusive). */
  readonly start: number
  /** End byte offset in the node's text content (exclusive). */
  readonly end: number
  /** Annotation type. */
  readonly kind: AnnotationKind
}

/** Text direction enumeration for HTML documents. */
export declare enum TextDirection {
  /** Left-to-right text direction */
  LeftToRight = "ltr",
  /** Right-to-left text direction */
  RightToLeft = "rtl",
  /** Automatic text direction detection */
  Auto = "auto",
}

/**
 * Plain text and Markdown extraction result.
 *
 * Contains the extracted text along with statistics and,
 * for Markdown files, structural elements like headers and links.
 */
export interface TextExtractionResult {
  /** Extracted text content */
  readonly content: string
  /** Number of lines */
  readonly lineCount: number
  /** Number of words */
  readonly wordCount: number
  /** Number of characters */
  readonly characterCount: number
  /** Markdown headers (text only, Markdown files only) */
  readonly headers?: Array<string>
  /** Markdown links; each inner array is a (text, URL) tuple (Markdown files only) */
  readonly links?: Array<Array<string>>
  /** Code blocks; each inner array is a (language, code) tuple (Markdown files only) */
  readonly codeBlocks?: Array<Array<string>>
}

/**
 * Text/Markdown metadata.
 *
 * Extracted from plain text and Markdown files. Includes word counts and,
 * for Markdown, structural elements like headers and links.
 */
export interface TextMetadata {
  /** Number of lines in the document */
  readonly lineCount?: number
  /** Number of words */
  readonly wordCount?: number
  /** Number of characters */
  readonly characterCount?: number
  /** Markdown headers (headings text only, for Markdown files) */
  readonly headers?: Array<string>
  /** Markdown links; each inner array is a (text, url) tuple (for Markdown files) */
  readonly links?: Array<Array<string>>
  /** Code blocks; each inner array is a (language, code) tuple (for Markdown files) */
  readonly codeBlocks?: Array<Array<string>>
}

/**
 * Settings for token reduction.
 *
 * NOTE(review): the generated bindings carry no documentation for this type.
 * The field notes below are inferred from names and types only — confirm them
 * against the Rust source before relying on them.
 */
export interface TokenReductionConfig {
  /** Reduction level to apply. */
  readonly level?: ReductionLevel
  /** Language hint — presumably a language code for the input text; TODO confirm format. */
  readonly languageHint?: string
  /** Presumably keeps Markdown structure intact while reducing; TODO confirm. */
  readonly preserveMarkdown?: boolean
  /** Presumably keeps code sections intact while reducing; TODO confirm. */
  readonly preserveCode?: boolean
  /** Semantic threshold — range and meaning not visible here; TODO confirm. */
  readonly semanticThreshold?: number
  /** Presumably toggles parallel processing; TODO confirm. */
  readonly enableParallel?: boolean
  /** Presumably toggles SIMD-accelerated code paths; TODO confirm. */
  readonly useSimd?: boolean
  /** Custom stopword lists keyed by string — presumably a language code; TODO confirm. */
  readonly customStopwords?: Record<string, Array<string>>
  /** Patterns to preserve — pattern syntax not visible here; TODO confirm. */
  readonly preservePatterns?: Array<string>
  /** Target reduction — presumably a ratio; TODO confirm range and units. */
  readonly targetReduction?: number
  /** Presumably toggles semantic clustering; TODO confirm. */
  readonly enableSemanticClustering?: boolean
}

/** Token reduction configuration. */
export interface TokenReductionOptions {
  /** Reduction mode: "off", "light", "moderate", "aggressive", "maximum" */
  readonly mode?: string
  /** Preserve important words (capitalized, technical terms) */
  readonly preserveImportantWords?: boolean
}

/**
 * Configuration for tree-sitter language pack integration.
 *
 * Controls grammar download behavior and code analysis options.
 *
 * Example (TOML):
 *
 * ```toml
 * [tree_sitter]
 * languages = ["python", "rust"]
 * groups = ["web"]
 *
 * [tree_sitter.process]
 * structure = true
 * comments = true
 * docstrings = true
 * ```
 */
export interface TreeSitterConfig {
  /**
   * Enable code intelligence processing (default: true).
   *
   * When `false`, tree-sitter analysis is completely skipped even if
   * the config section is present.
   */
  readonly enabled?: boolean
  /**
   * Custom cache directory for downloaded grammars.
   *
   * When omitted, uses the default: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
   */
  readonly cacheDir?: string
  /** Languages to pre-download on init (e.g., `["python", "rust"]`). */
  readonly languages?: Array<string>
  /** Language groups to pre-download (e.g., `["web", "systems", "scripting"]`). */
  readonly groups?: Array<string>
  /** Processing options for code analysis. */
  readonly process?: TreeSitterProcessConfig
}

/**
 * Processing options for tree-sitter code analysis.
 *
 * Controls which analysis features are enabled when extracting code files.
 */
export interface TreeSitterProcessConfig {
  /** Extract structural items (functions, classes, structs, etc.). Default: true. */
  readonly structure?: boolean
  /** Extract import statements. Default: true. */
  readonly imports?: boolean
  /** Extract export statements. Default: true. */
  readonly exports?: boolean
  /** Extract comments. Default: false. */
  readonly comments?: boolean
  /** Extract docstrings. Default: false. */
  readonly docstrings?: boolean
  /** Extract symbol definitions. Default: false. */
  readonly symbols?: boolean
  /** Include parse diagnostics. Default: false. */
  readonly diagnostics?: boolean
  /** Maximum chunk size in bytes. Omit to disable chunking. */
  readonly chunkMaxSize?: number
  /** Content rendering mode for code extraction. */
  readonly contentMode?: CodeContentMode
}

/** Semantic classification of an extracted URI. */
export declare enum UriKind {
  /** A clickable hyperlink (web URL, file link). */
  Hyperlink = "hyperlink",
  /** An image or media resource reference. */
  Image = "image",
  /** An internal anchor or cross-reference target. */
  Anchor = "anchor",
  /** A citation or bibliographic reference (DOI, academic ref). */
  Citation = "citation",
  /** A general reference (e.g. `\ref{}` in LaTeX, `:ref:` in RST). */
  Reference = "reference",
  /** An email address (`mailto:` link or bare email). */
  Email = "email",
}

/**
 * Contract for validator plugins.
 *
 * Validators check extraction results for quality, completeness, or correctness.
 * Unlike post-processors, validator errors **fail fast** - if a validator returns
 * an error, the extraction fails immediately.
 *
 * # Use Cases
 *
 * - **Quality Gates**: Ensure extracted content meets minimum quality standards
 * - **Compliance**: Verify content meets regulatory requirements
 * - **Content Filtering**: Reject documents containing unwanted content
 * - **Format Validation**: Verify extracted content structure
 * - **Security Checks**: Scan for malicious content
 *
 * # Error Handling
 *
 * Validator errors are **fatal** - they cause the extraction to fail and bubble up
 * to the caller. Use validators for hard requirements that must be met.
 *
 * For non-fatal checks, use post-processors instead.
 *
 * # Thread Safety
 *
 * Validators must be thread-safe (`Send + Sync` on the Rust side).
 */
export interface Validator {
  /**
   * Name of this validator.
   *
   * NOTE(review): presumably the name reported by `listValidators()` and
   * accepted by `unregisterValidator(name)` — confirm against the binding.
   */
  name(): string
  /**
   * Validate an extraction result.
   *
   * Check the extraction result and resolve if it is valid, or reject/throw
   * if validation fails.
   * @param result - The extraction result to validate
   *
   * @param config - Extraction configuration
   *
   * @returns A promise that resolves when validation passes. A rejection means
   * validation failed, and the extraction will fail.
   *
   * @throws - `KreuzbergError::Validation` - Validation failed
   * - Any other error type appropriate for the failure
   *
   * The examples below are the Rust originals from the core trait documentation.
   *
   * # Example - Content Length Validation
   *
   * ```rust
   * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
   * -> Result<()> {
   * let length = result.content.len();
   *
   * if length < self.min {
   * return Err(KreuzbergError::validation(format!(
   * "Content too short: {} < {} characters",
   * length, self.min
   * )));
   * }
   *
   * if length > self.max {
   * return Err(KreuzbergError::validation(format!(
   * "Content too long: {} > {} characters",
   * length, self.max
   * )));
   * }
   *
   * Ok(())
   * }
   * ```
   *
   * # Example - Quality Score Validation
   *
   * ```rust
   * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
   * -> Result<()> {
   * // Check if quality_score exists in metadata
   * let score = result.metadata
   * .additional
   * .get("quality_score")
   * .and_then(|v| v.as_f64())
   * .unwrap_or(0.0);
   *
   * if score < self.min_score {
   * return Err(KreuzbergError::validation(format!(
   * "Quality score too low: {} < {}",
   * score, self.min_score
   * )));
   * }
   *
   * Ok(())
   * }
   * ```
   *
   * # Example - Security Validation
   *
   * ```rust
   * async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
   * -> Result<()> {
   * // Check for blocked patterns
   * for pattern in &self.blocked_patterns {
   * if result.content.contains(pattern) {
   * return Err(KreuzbergError::validation(format!(
   * "Content contains blocked pattern: {}",
   * pattern
   * )));
   * }
   * }
   *
   * Ok(())
   * }
   * ```
   */
  validate(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): Promise<void>
  /**
   * Optional: Check if this validator should run for a given result.
   *
   * Allows conditional validation based on MIME type, metadata, or content.
   * Defaults to `true` (always run).
   *
   * NOTE(review): the contract below describes a boolean, but the generated
   * signature declares `string`. Confirm whether the native bridge expects a
   * serialized value (e.g. `"true"`) or whether the generator mis-mapped `bool`.
   * @param result - The extraction result to check
   *
   * @param config - Extraction configuration
   *
   * @returns `true` if the validator should run, `false` to skip.
   */
  shouldValidate?(result?: ExtractionResult | undefined | null, config?: ExtractionConfig | undefined | null): string
  /**
   * Optional: Get the validation priority.
   *
   * Higher priority validators run first. Useful for ordering validation checks
   * (e.g., run cheap validations before expensive ones).
   *
   * Default priority is 50.
   *
   * NOTE(review): the contract describes a numeric priority, but the generated
   * signature declares `string`. Confirm the representation the native bridge
   * expects before implementing.
   * @returns Priority value (higher = runs earlier).
   */
  priority?(): string
}

/**
 * Application properties from docProps/app.xml for XLSX.
 *
 * Contains Excel-specific document metadata.
 */
export interface XlsxAppProperties {
  /** Application name (e.g., "Microsoft Excel") */
  readonly application?: string
  /** Application version */
  readonly appVersion?: string
  /** Document security level */
  readonly docSecurity?: number
  /** Scale crop flag */
  readonly scaleCrop?: boolean
  /** Links up to date flag */
  readonly linksUpToDate?: boolean
  /** Shared document flag */
  readonly sharedDoc?: boolean
  /** Hyperlinks changed flag */
  readonly hyperlinksChanged?: boolean
  /** Company name */
  readonly company?: string
  /** Worksheet names */
  readonly worksheetNames?: Array<string>
}

/**
 * XML extraction result.
 *
 * Contains extracted text content from XML files along with
 * structural statistics about the XML document.
 */
export interface XmlExtractionResult {
  /** Extracted text content (XML structure filtered out) */
  readonly content: string
  /** Total number of XML elements processed */
  readonly elementCount: number
  /** List of unique element names found (sorted) */
  readonly uniqueElements: Array<string>
}

/**
 * XML metadata extracted during XML parsing.
 *
 * Provides statistics about XML document structure.
 */
export interface XmlMetadata {
  /** Total number of XML elements processed */
  readonly elementCount?: number
  /** List of unique element tag names (sorted) */
  readonly uniqueElements?: Array<string>
}

/** YAKE-specific parameters. */
export interface YakeParams {
  /**
   * Window size for co-occurrence analysis (default: 2).
   *
   * Controls the context window for computing co-occurrence statistics.
   */
  readonly windowSize?: number
}

/** Year range for bibliographic metadata. */
export interface YearRange {
  /** Lower bound of the range — presumably the earliest year in `years`; TODO confirm. */
  readonly min?: number
  /** Upper bound of the range — presumably the latest year in `years`; TODO confirm. */
  readonly max?: number
  /** Years covered. NOTE(review): ordering and uniqueness are not visible here — confirm. */
  readonly years: Array<number>
}

/** List names of all registered document extractors. */
export declare function listDocumentExtractors(): Array<string>;

/**
 * List the names of all registered embedding backends.
 *
 * Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
 * bindings.
 */
export declare function listEmbeddingBackends(): Array<string>;

/**
 * List the names of all available embedding presets.
 *
 * The native side returns owned `String`s so the values are safe to pass
 * across FFI boundaries.
 */
export declare function listEmbeddingPresets(): Array<string>;

/**
 * List all registered OCR backends.
 *
 * Returns the names of all OCR backends currently registered in the global registry.
 * @returns An array of OCR backend names.
 */
export declare function listOcrBackends(): Array<string>;

/**
 * List all registered post-processor names.
 *
 * Returns the names of all post-processors currently registered in the
 * global registry.
 * @returns An array of post-processor names.
 *
 * @throws If the registry lock is poisoned.
 */
export declare function listPostProcessors(): Array<string>;

/**
 * List names of all registered renderers.
 * @throws If the registry lock is poisoned.
 */
export declare function listRenderers(): Array<string>;

/** List names of all registered validators. */
export declare function listValidators(): Array<string>;

/**
 * Register a `DocumentExtractor` implementation.
 *
 * NOTE(review): undocumented in the generated bindings; presumably the
 * extractor is then reported by `listDocumentExtractors()` — confirm.
 */
export declare function registerDocumentExtractor(impl: DocumentExtractor): void;

/**
 * Register an `EmbeddingBackend` implementation.
 *
 * NOTE(review): undocumented in the generated bindings; presumably the
 * backend is then reported by `listEmbeddingBackends()` — confirm.
 */
export declare function registerEmbeddingBackend(impl: EmbeddingBackend): void;

/**
 * Register an `OcrBackend` implementation.
 *
 * NOTE(review): undocumented in the generated bindings; presumably the
 * backend is then reported by `listOcrBackends()` — confirm.
 */
export declare function registerOcrBackend(impl: OcrBackend): void;

/**
 * Register a `PostProcessor` implementation.
 *
 * NOTE(review): undocumented in the generated bindings; presumably the
 * post-processor is then reported by `listPostProcessors()` — confirm.
 */
export declare function registerPostProcessor(impl: PostProcessor): void;

/**
 * Register a `Renderer` implementation.
 *
 * NOTE(review): undocumented in the generated bindings; presumably the
 * renderer is then reported by `listRenderers()` — confirm.
 */
export declare function registerRenderer(impl: Renderer): void;

/**
 * Register a `Validator` implementation.
 *
 * NOTE(review): undocumented in the generated bindings; presumably the
 * validator is then reported by `listValidators()` — confirm.
 */
export declare function registerValidator(impl: Validator): void;

/**
 * Render a single PDF page to PNG bytes.
 *
 * Returns raw PNG-encoded bytes for the specified page at the given DPI.
 * Uses pdf_oxide with tiny-skia for pure-Rust rendering.
 * @param pdfBytes - Raw PDF file bytes
 *
 * @param pageIndex - Zero-based page index
 *
 * @param dpi - Resolution in dots per inch (default: 150)
 *
 * @param password - Optional password for encrypted PDFs
 *
 * @returns The PNG-encoded bytes of the rendered page.
 *
 * @throws `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
 * or rendered, or if `pageIndex` is out of range.
 */
export declare function renderPdfPageToPng(pdfBytes: Uint8Array, pageIndex: number, dpi?: number | undefined | null, password?: string | undefined | null): Uint8Array;

/**
 * Unregister the document extractor identified by `name`.
 *
 * NOTE(review): undocumented in the generated bindings; presumably `name` is
 * one of the values reported by `listDocumentExtractors()` — confirm.
 */
export declare function unregisterDocumentExtractor(name: string): void;

/**
 * Unregister the embedding backend identified by `name`.
 *
 * NOTE(review): undocumented in the generated bindings; presumably `name` is
 * one of the values reported by `listEmbeddingBackends()` — confirm.
 */
export declare function unregisterEmbeddingBackend(name: string): void;

/**
 * Unregister the OCR backend identified by `name`.
 *
 * NOTE(review): undocumented in the generated bindings; presumably `name` is
 * one of the values reported by `listOcrBackends()` — confirm.
 */
export declare function unregisterOcrBackend(name: string): void;

/**
 * Unregister the post-processor identified by `name`.
 *
 * NOTE(review): undocumented in the generated bindings; presumably `name` is
 * one of the values reported by `listPostProcessors()` — confirm.
 */
export declare function unregisterPostProcessor(name: string): void;

/**
 * Unregister the renderer identified by `name`.
 *
 * NOTE(review): undocumented in the generated bindings; presumably `name` is
 * one of the values reported by `listRenderers()` — confirm.
 */
export declare function unregisterRenderer(name: string): void;

/**
 * Unregister the validator identified by `name`.
 *
 * NOTE(review): undocumented in the generated bindings; presumably `name` is
 * one of the values reported by `listValidators()` — confirm.
 */
export declare function unregisterValidator(name: string): void;
