// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
/* eslint-disable */

/** Any JSON-representable value: primitives, arrays, and string-keyed objects, nested arbitrarily. */
export type JsonValue =
  | string
  | number
  | boolean
  | null
  | JsonValue[]
  | { [key: string]: JsonValue };

/**
 * Extract content from multiple byte arrays concurrently.
 *
 * This function processes multiple byte arrays in parallel, automatically managing
 * concurrency to prevent resource exhaustion. The concurrency limit can be
 * configured via `ExtractionConfig::max_concurrent_extractions` or defaults
 * to `(num_cpus * 1.5).ceil()`.
 *
 * Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
 * fields from the batch-level `config`. Pass `None` as the config to use
 * the batch-level defaults for that item.
 *
 * @param items - `BatchBytesItem` entries, each containing content bytes, MIME type,
 * and optional per-item configuration overrides.
 * @param config - Batch-level extraction configuration.
 * @returns One `ExtractionResult` per input item, in the same order as the input items.
 */
export declare function batchExtractBytes(
  items: Array<BatchBytesItem>,
  config?: ExtractionConfig | undefined | null,
): Promise<Array<ExtractionResult>>;

/**
 * Synchronous wrapper for `batch_extract_bytes`.
 *
 * Uses the global Tokio runtime for optimal performance.
 * With the `tokio-runtime` feature, this blocks the current thread using the global
 * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
 * that iterates through items and calls `extract_bytes_sync()`.
 */
export declare function batchExtractBytesSync(
  items: Array<BatchBytesItem>,
  config?: ExtractionConfig | undefined | null,
): Array<ExtractionResult>;

/**
 * Extract content from multiple files concurrently.
 *
 * This function processes multiple files in parallel, automatically managing
 * concurrency to prevent resource exhaustion.
The concurrency limit can be
 * configured via `ExtractionConfig::max_concurrent_extractions` or defaults
 * to `(num_cpus * 1.5).ceil()`.
 *
 * Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
 * fields from the batch-level `config`. Pass `None` for a file to use the batch defaults.
 * Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
 * taken from the batch-level `config`.
 * @param items - Vector of `BatchFileItem` structs, each containing a path and optional per-file configuration overrides.
 *
 * @param config - Batch-level extraction configuration (provides defaults and batch settings)
 *
 * @returns A vector of `ExtractionResult` in the same order as the input items.
 *
 * @throws Individual file errors are captured in the result metadata. System errors
 * (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
 */
export declare function batchExtractFiles(
  items: Array<BatchFileItem>,
  config?: ExtractionConfig | undefined | null,
): Promise<Array<ExtractionResult>>;

/**
 * Synchronous wrapper for `batch_extract_files`.
 *
 * Uses the global Tokio runtime for optimal performance.
 * Only available with `tokio-runtime` (WASM has no filesystem).
 */
export declare function batchExtractFilesSync(
  items: Array<BatchFileItem>,
  config?: ExtractionConfig | undefined | null,
): Array<ExtractionResult>;

/**
 * Clear all document extractors from the global registry.
 *
 * Calls `shutdown()` on every registered extractor, then empties the registry.
 *
 * @throws Any error returned by an extractor's `shutdown()` method. The first error
 * encountered stops processing of remaining extractors.
 */
export declare function clearDocumentExtractors(): void;

/**
 * Clear all embedding backends from the global registry.
 *
 * Calls `shutdown()` on every registered backend, then empties the registry.
 * @throws - Any error returned by a backend's `shutdown()` method. The first error
 * encountered stops processing of remaining backends.
*/
export declare function clearEmbeddingBackends(): void;

/**
 * Empty the global OCR backend registry.
 *
 * Every registered OCR backend is removed and its `shutdown()` method is called.
 * The native call yields `Ok(())` when all backends were cleared successfully and
 * `Err(...)` if any shutdown method failed.
 * NOTE(review): the `Err(...)` case is presumably surfaced here as a thrown
 * error, since the declared return type is `void` — confirm against the binding layer.
 */
export declare function clearOcrBackends(): void;

/** Unregister every post-processor. */
export declare function clearPostProcessors(): void;

/**
 * Empty the global renderer registry.
 *
 * The built-in defaults (markdown, html, djot, plain) are removed as well, so
 * no renderer is registered afterwards; re-register whichever ones are needed.
 *
 * @throws If the registry lock is poisoned.
 */
export declare function clearRenderers(): void;

/** Unregister every validator. */
export declare function clearValidators(): void;

/**
 * Produce a structured diff between two extraction results.
 *
 * The comparison is purely structural: it performs no I/O and has no side
 * effects. All fields of [`ExtractionDiff`] are populated according to the
 * provided [`DiffOptions`].
 *
 * @param a - The "before" extraction result.
 * @param b - The "after" extraction result.
 * @param opts - Controls which sections are compared and optional truncation.
 */
export declare function compare(
  a?: ExtractionResult | undefined | null,
  b?: ExtractionResult | undefined | null,
  opts?: DiffOptions | undefined | null,
): ExtractionDiff;

/**
 * Determine the MIME type of the file at `path`.
 *
 * Detection uses the file extension and optionally the file content.
 * Pass `true` for `checkExists` to verify the file exists before detection.
 */
export declare function detectMimeType(
  path: string,
  checkExists: boolean,
): string;

/**
 * Detect MIME type from raw file bytes.
 *
 * Uses magic byte signatures to detect file type from content.
 * Falls back to `infer` crate for comprehensive detection.
*
 * For ZIP-based files, inspects contents to distinguish Office Open XML
 * formats (DOCX, XLSX, PPTX) from plain ZIP archives.
 * @param content - Raw file bytes
 *
 * @returns The detected MIME type string.
 *
 * @throws Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
 */
export declare function detectMimeTypeFromBytes(content: Uint8Array): string;

/**
 * Embed a list of texts using the configured embedding model.
 *
 * Returns a 2D vector where each inner vector is the embedding for the corresponding text.
 */
export declare function embedTexts(
  texts: Array<string>,
  config?: EmbeddingConfig | undefined | null,
): Array<Array<number>>;

/**
 * Generate embeddings asynchronously for a list of text strings.
 *
 * This is the async counterpart to [`embed_texts`]. It offloads the blocking
 * ONNX inference work to a dedicated blocking thread pool via Tokio's
 * `spawn_blocking`, keeping the async executor free.
 *
 * Returns one embedding vector per input text in the same order.
 *
 * @param texts - Strings to embed (owned, sent to blocking thread).
 * @param config - Embedding configuration specifying model, batch size, and normalization.
 * @throws `KreuzbergError::MissingDependency` if ONNX Runtime is not installed.
 * @throws `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
 * or the blocking inference task panics.
 */
export declare function embedTextsAsync(
  texts: Array<string>,
  config?: EmbeddingConfig | undefined | null,
): Promise<Array<Array<number>>>;

/**
 * Extract content from a byte array.
 *
 * This is the main entry point for in-memory extraction. It performs the following steps:
 * 1. Validate MIME type
 * 2. Handle legacy format conversion if needed
 * 3. Select appropriate extractor from registry
 * 4. Extract content
 * 5. Run post-processing pipeline
 * @param content - The byte array to extract
 *
 * @param mimeType - MIME type of the content
 *
 * @param config - Extraction configuration
 *
 * @returns An `ExtractionResult` containing the extracted content and metadata.
*
 * @throws Returns `KreuzbergError::Validation` if MIME type is invalid.
 * Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
 */
export declare function extractBytes(
  content: Uint8Array,
  mimeType: string,
  config?: ExtractionConfig | undefined | null,
): Promise<ExtractionResult>;

/**
 * Synchronous wrapper for `extract_bytes`.
 *
 * Uses the global Tokio runtime for 100x+ performance improvement over creating
 * a new runtime per call.
 *
 * With the `tokio-runtime` feature, this blocks the current thread using the global
 * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
 */
export declare function extractBytesSync(
  content: Uint8Array,
  mimeType: string,
  config?: ExtractionConfig | undefined | null,
): ExtractionResult;

/**
 * Extract content from a file.
 *
 * This is the main entry point for file-based extraction. It performs the following steps:
 * 1. Check cache for existing result (if caching enabled)
 * 2. Detect or validate MIME type
 * 3. Select appropriate extractor from registry
 * 4. Extract content
 * 5. Run post-processing pipeline
 * 6. Store result in cache (if caching enabled)
 *
 * @param path - Path to the file to extract.
 * @param mimeType - Optional MIME type override. If omitted, it will be auto-detected.
 * @param config - Extraction configuration.
 * @returns An `ExtractionResult` containing the extracted content and metadata.
 * @throws `KreuzbergError::Io` if the file doesn't exist (NotFound) or for other file I/O errors.
 * @throws `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
 */
export declare function extractFile(
  path: string,
  mimeType?: string | undefined | null,
  config?: ExtractionConfig | undefined | null,
): Promise<ExtractionResult>;

/**
 * Synchronous wrapper for `extract_file`.
 *
 * This is a convenience function that blocks the current thread until extraction completes.
 * For async code, use `extract_file` directly.
*
 * Uses the global Tokio runtime for 100x+ performance improvement over creating
 * a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
 *
 * This function is only available with the `tokio-runtime` feature. For WASM targets,
 * use a truly synchronous extraction approach instead.
 */
export declare function extractFileSync(
  path: string,
  mimeType?: string | undefined | null,
  config?: ExtractionConfig | undefined | null,
): ExtractionResult;

/**
 * Get an embedding preset by name.
 *
 * Returns `null` if no preset with the given name exists. The native side returns
 * an owned clone so the value is safe to pass across FFI boundaries.
 */
export declare function getEmbeddingPreset(name: string): EmbeddingPreset | null;

/**
 * Get file extensions for a given MIME type.
 *
 * Returns all known file extensions that map to the specified MIME type.
 *
 * @param mimeType - The MIME type to look up.
 * @returns The file extensions (without leading dot) for the MIME type.
 */
export declare function getExtensionsForMime(mimeType: string): Array<string>;

/**
 * Hardware acceleration configuration for ONNX Runtime models.
 *
 * Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
 * for inference in layout detection and embedding generation.
 */
export interface AccelerationConfig {
  /** Execution provider to use for ONNX inference. */
  readonly provider?: ExecutionProviderType;
  /** GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto. */
  readonly deviceId?: number;
}

/** Types of inline text annotations.
*/
export type AnnotationKind =
  | { annotation_type: 'bold' }
  | { annotation_type: 'italic' }
  | { annotation_type: 'underline' }
  | { annotation_type: 'strikethrough' }
  | { annotation_type: 'code' }
  | { annotation_type: 'subscript' }
  | { annotation_type: 'superscript' }
  | { annotation_type: 'link'; url: string; title: string }
  | { annotation_type: 'highlight' }
  | { annotation_type: 'color'; value: string }
  | { annotation_type: 'font_size'; value: string }
  | { annotation_type: 'custom'; name: string; value: string };

/**
 * A single file extracted from an archive.
 *
 * When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
 * enabled, each processable file produces its own full `ExtractionResult`.
 */
export interface ArchiveEntry {
  /** Archive-relative file path (e.g. "folder/document.pdf"). */
  readonly path: string;
  /** Detected MIME type of the file. */
  readonly mimeType: string;
  /** Full extraction result for this file. */
  readonly result: ExtractionResult;
}

/**
 * Archive (ZIP/TAR/7Z) metadata.
 *
 * Extracted from compressed archive files containing file lists and size information.
 */
export interface ArchiveMetadata {
  /** Archive format ("ZIP", "TAR", "7Z", etc.) */
  readonly format?: string;
  /** Total number of files in the archive */
  readonly fileCount?: number;
  /** List of file paths within the archive */
  readonly fileList?: Array<string>;
  /** Total uncompressed size in bytes */
  readonly totalSize?: number;
  /** Compressed size in bytes (if available) */
  readonly compressedSize?: number;
}

/**
 * Batch item for byte array extraction.
 *
 * Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
 * to represent a single item in a batch extraction job.
*/
export interface BatchBytesItem {
  /** The content bytes to extract from */
  readonly content: Uint8Array;
  /** MIME type of the content (e.g., "application/pdf", "text/html") */
  readonly mimeType: string;
  /** Per-item configuration overrides (omitted uses batch-level defaults) */
  readonly config?: FileExtractionConfig;
}

/**
 * Batch item for file extraction.
 *
 * Used with `batch_extract_files` and `batch_extract_files_sync`
 * to represent a single file in a batch extraction job.
 */
export interface BatchFileItem {
  /** Path to the file to extract from */
  readonly path: string;
  /** Per-file configuration overrides (omitted uses batch-level defaults) */
  readonly config?: FileExtractionConfig;
}

/** Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right. */
export interface BBox {
  /** X-coordinate of the top-left corner. */
  readonly x1: number;
  /** Y-coordinate of the top-left corner. */
  readonly y1: number;
  /** X-coordinate of the bottom-right corner. */
  readonly x2: number;
  /** Y-coordinate of the bottom-right corner. */
  readonly y2: number;
}

/** BibTeX bibliography metadata. */
export interface BibtexMetadata {
  /** Number of entries in the bibliography. */
  readonly entryCount?: number;
  /** Citation keys found in the bibliography. */
  readonly citationKeys?: Array<string>;
  /** Author names found in the bibliography. */
  readonly authors?: Array<string>;
  /** Range of publication years covered by the entries. */
  readonly yearRange?: YearRange;
  /**
   * Per-entry-type tallies.
   * NOTE(review): presumably maps BibTeX entry type name to its occurrence
   * count — confirm the value type against the generator's source struct.
   */
  readonly entryTypes?: Record<string, number>;
}

/** Types of block-level elements in Djot. */
export declare enum BlockType {
  Paragraph = "paragraph",
  Heading = "heading",
  Blockquote = "blockquote",
  CodeBlock = "code_block",
  ListItem = "list_item",
  OrderedList = "ordered_list",
  BulletList = "bullet_list",
  TaskList = "task_list",
  DefinitionList = "definition_list",
  DefinitionTerm = "definition_term",
  DefinitionDescription = "definition_description",
  Div = "div",
  Section = "section",
  ThematicBreak = "thematic_break",
  RawBlock = "raw_block",
  MathDisplay = "math_display",
}

/** Bounding box coordinates for element positioning.
*/
export interface BoundingBox {
  /** Left x-coordinate */
  readonly x0?: number;
  /** Bottom y-coordinate */
  readonly y0?: number;
  /** Right x-coordinate */
  readonly x1?: number;
  /** Top y-coordinate */
  readonly y1?: number;
}

/**
 * Cache statistics.
 * NOTE(review): units are taken from the field names (`Mb`, `Days`) —
 * presumably megabytes and days; confirm against the native implementation.
 */
export interface CacheStats {
  readonly totalFiles: number;
  readonly totalSizeMb: number;
  readonly availableSpaceMb: number;
  readonly oldestFileAgeDays: number;
  readonly newestFileAgeDays: number;
}

/**
 * A single changed cell within a table.
 *
 * Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate::diff` re-exports this type verbatim.
 */
export interface CellChange {
  /** Zero-based row index. */
  readonly row: number;
  /** Zero-based column index. */
  readonly col: number;
  /** Value before the change. */
  readonly from: string;
  /** Value after the change. */
  readonly to: string;
}

/**
 * A text chunk with optional embedding and metadata.
 *
 * Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
 * contains the text content, optional embedding vector (if embedding generation
 * is configured), and metadata about its position in the document.
 */
export interface Chunk {
  /** The text content of this chunk. */
  readonly content: string;
  /**
   * Semantic structural classification of this chunk.
   *
   * Assigned by the heuristic classifier based on content patterns and
   * heading context. Defaults to `ChunkType::Unknown` when no rule matches.
   */
  readonly chunkType: ChunkType;
  /**
   * Optional embedding vector for this chunk.
   *
   * Only populated when `EmbeddingConfig` is provided in chunking configuration.
   * The dimensionality depends on the chosen embedding model.
   */
  readonly embedding?: Array<number>;
  /** Metadata about this chunk's position and properties. */
  readonly metadata: ChunkMetadata;
}

/**
 * Type of text chunker to use.
*
 * # Variants
 *
 * * `Text` - Generic text splitter, splits on whitespace and punctuation
 * * `Markdown` - Markdown-aware splitter, preserves formatting and structure
 * * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
 * * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
 *   embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
 *   lower = more splits). Without an embedding, falls back to a
 *   structural-boundary heuristic (ALL-CAPS headers, numbered sections,
 *   blank-line paragraphs) and merges groups into chunks capped at
 *   `max_characters` (default 1000). `topic_threshold` has no effect in the
 *   fallback path. For best results, pair with an embedding model.
 */
export declare enum ChunkerType {
  Text = "text",
  Markdown = "markdown",
  Yaml = "yaml",
  Semantic = "semantic",
}

/**
 * Chunking configuration.
 *
 * Configures text chunking for document content, including chunk size,
 * overlap, trimming behavior, and optional embeddings.
 *
 * Use `..Default::default()` when constructing to allow for future field additions:
 * ```rust
 * let config = ChunkingConfig {
 *     max_characters: 500,
 *     ..Default::default()
 * };
 * ```
 */
export interface ChunkingConfig {
  /**
   * Maximum size per chunk (in units determined by `sizing`).
   *
   * When `sizing` is `Characters` (default), this is the max character count.
   * When using token-based sizing, this is the max token count.
   *
   * Default: 1000
   */
  readonly maxCharacters?: number;
  /**
   * Overlap between chunks (in units determined by `sizing`).
   *
   * Default: 200
   */
  readonly overlap?: number;
  /**
   * Whether to trim whitespace from chunk boundaries.
   *
   * Default: true
   */
  readonly trim?: boolean;
  /**
   * Type of chunker to use (any `ChunkerType` variant: Text, Markdown, Yaml, or Semantic).
   *
   * Default: Text
   */
  readonly chunkerType?: ChunkerType;
  /** Optional embedding configuration for chunk embeddings. */
  readonly embedding?: EmbeddingConfig;
  /** Use a preset configuration (overrides individual settings if provided). */
  readonly preset?: string;
  /**
   * How to measure chunk size.
   *
   * Default: `Characters` (Unicode character count).
   * Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
   */
  readonly sizing?: ChunkSizing;
  /**
   * When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy
   * path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string.
   *
   * This is useful for RAG pipelines where each chunk needs self-contained
   * context about its position in the document structure.
   *
   * Default: `false`
   */
  readonly prependHeadingContext?: boolean;
  /**
   * Optional cosine similarity threshold for semantic topic boundary detection.
   *
   * Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
   * provided. You almost never need to set this. When omitted, defaults to
   * `0.75` which works well for most documents. Lower values detect more
   * topic boundaries (more, smaller chunks); higher values detect fewer.
   * Range: `0.0..=1.0`.
   */
  readonly topicThreshold?: number;
}

/** Metadata about a chunk's position in the original document. */
export interface ChunkMetadata {
  /** Byte offset where this chunk starts in the original text (UTF-8 valid boundary). */
  readonly byteStart: number;
  /** Byte offset where this chunk ends in the original text (UTF-8 valid boundary). */
  readonly byteEnd: number;
  /**
   * Number of tokens in this chunk (if available).
   *
   * This is calculated by the embedding model's tokenizer if embeddings are enabled.
   */
  readonly tokenCount?: number;
  /** Zero-based index of this chunk in the document. */
  readonly chunkIndex: number;
  /** Total number of chunks in the document. */
  readonly totalChunks: number;
  /**
   * First page number this chunk spans (1-indexed).
   *
   * Only populated when page tracking is enabled in extraction configuration.
   */
  readonly firstPage?: number;
  /**
   * Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
   *
   * Only populated when page tracking is enabled in extraction configuration.
   */
  readonly lastPage?: number;
  /**
   * Heading context when using Markdown chunker.
   *
   * Contains the heading hierarchy this chunk falls under.
   * Only populated when `ChunkerType::Markdown` is used.
   */
  readonly headingContext?: HeadingContext;
  /**
   * Indices into `ExtractionResult.images` for images on pages covered by this chunk.
   *
   * Contains zero-based indices into the top-level `images` collection for every
   * image whose `page_number` falls within `[first_page, last_page]`.
   * Empty when image extraction is disabled or the chunk spans no pages with images.
   */
  readonly imageIndices: Array<number>;
}

/**
 * How chunk size is measured.
 *
 * Defaults to `Characters` (Unicode character count). When using token-based sizing,
 * chunks are sized by token count according to the specified tokenizer.
 *
 * Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
 * available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
 * (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 */
export type ChunkSizing =
  | { type: 'characters' }
  | { type: 'tokenizer'; model: string; cacheDir: string };

/**
 * Semantic structural classification of a text chunk.
 *
 * Assigned by the heuristic classifier in `chunking::classifier`.
 * Defaults to `Unknown` when no rule matches.
 * Designed to be extended in future versions without breaking changes.
 */
export declare enum ChunkType {
  /** Section heading or document title. */
  Heading = "heading",
  /** Party list: names, addresses, and signatories. */
  PartyList = "party_list",
  /** Definition clause ("X means…", "X shall mean…"). */
  Definitions = "definitions",
  /** Operative clause containing legal/contractual action verbs. */
  OperativeClause = "operative_clause",
  /** Signature block with signatures, names, and dates. */
  SignatureBlock = "signature_block",
  /** Schedule, annex, appendix, or exhibit section. */
  Schedule = "schedule",
  /** Table-like content with aligned columns or repeated patterns. */
  TableLike = "table_like",
  /** Mathematical formula or equation. */
  Formula = "formula",
  /** Code block or preformatted content. */
  CodeBlock = "code_block",
  /** Embedded or referenced image content. */
  Image = "image",
  /** Organizational chart or hierarchy diagram. */
  OrgChart = "org_chart",
  /** Diagram, figure, or visual illustration. */
  Diagram = "diagram",
  /** Unclassified or mixed content. */
  Unknown = "unknown",
}

/** Citation file metadata (RIS, PubMed, EndNote). */
export interface CitationMetadata {
  /** Number of citations in the file. */
  readonly citationCount?: number;
  /** Citation file format. */
  readonly format?: string;
  /** Author names found in the citations. */
  readonly authors?: Array<string>;
  /** Range of publication years covered by the citations. */
  readonly yearRange?: YearRange;
  /** DOIs found in the citations. */
  readonly dois?: Array<string>;
  /** Keywords found in the citations. */
  readonly keywords?: Array<string>;
}

/**
 * Content rendering mode for code extraction.
 *
 * Controls how extracted code content is represented in the `content` field
 * of `ExtractionResult`.
 */
export declare enum CodeContentMode {
  /** Use TSLP semantic chunks as content (default). */
  Chunks = "chunks",
  /** Use raw source code as content. */
  Raw = "raw",
  /** Emit function/class headings + docstrings (no code bodies). */
  Structure = "structure",
}

/**
 * Cross-extractor content filtering configuration.
 *
 * Controls whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is included in or stripped from extraction
 * results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
 * with format-specific implementation.
 *
 * When `None` on `ExtractionConfig`, each extractor uses its current
 * default behavior unchanged.
 */
export interface ContentFilterConfig {
  /**
   * Include running headers in extraction output.
   *
   * - PDF: Disables top-margin furniture stripping and prevents the layout
   *   model from treating `PageHeader`-classified regions as furniture.
   * - DOCX: Includes document headers in text output.
   * - RTF/ODT: Headers already included; this is a no-op when true.
   * - HTML/EPUB: Keeps `

` element content. * * Default: `false` (headers are stripped or excluded). */ readonly includeHeaders?: boolean /** * Include running footers in extraction output. * * - PDF: Disables bottom-margin furniture stripping and prevents the layout * model from treating `PageFooter`-classified regions as furniture. * - DOCX: Includes document footers in text output. * - RTF/ODT: Footers already included; this is a no-op when true. * - HTML/EPUB: Keeps `