1562 lines
52 KiB
TOML
Generated
1562 lines
52 KiB
TOML
Generated
# This file is auto-generated by alef. DO NOT EDIT.
|
|
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
|
language = "C"
|
|
include_guard = "KREUZBERG_H"
|
|
pragma_once = true
|
|
autogen_warning = "/* This file is auto-generated by alef. DO NOT EDIT. */"
|
|
after_includes = """
|
|
/* Opaque type forward declarations */
|
|
/**
|
|
* Hardware acceleration configuration for ONNX Runtime models.
|
|
*
|
|
* Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
|
|
* for inference in layout detection and embedding generation.
|
|
* \\code
|
|
* use kreuzberg::AccelerationConfig;
|
|
*
|
|
* // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
|
|
* let config = AccelerationConfig::default();
|
|
*
|
|
* // Force CPU only
|
|
* let config = AccelerationConfig {
|
|
* provider: kreuzberg::ExecutionProviderType::Cpu,
|
|
* ..Default::default()
|
|
* };
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGAccelerationConfig KREUZBERGAccelerationConfig;
|
|
/**
|
|
* Types of inline text annotations.
|
|
*/
|
|
typedef struct KREUZBERGAnnotationKind KREUZBERGAnnotationKind;
|
|
/**
|
|
* A single file extracted from an archive.
|
|
*
|
|
* When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
|
|
* enabled, each processable file produces its own full `ExtractionResult`.
|
|
*/
|
|
typedef struct KREUZBERGArchiveEntry KREUZBERGArchiveEntry;
|
|
/**
|
|
* Archive (ZIP/TAR/7Z) metadata.
|
|
*
|
|
* Extracted from compressed archive files containing file lists and size information.
|
|
*/
|
|
typedef struct KREUZBERGArchiveMetadata KREUZBERGArchiveMetadata;
|
|
/**
|
|
* Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right.
|
|
*/
|
|
typedef struct KREUZBERGBBox KREUZBERGBBox;
|
|
/**
|
|
* Batch item for byte array extraction.
|
|
*
|
|
* Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
|
|
* to represent a single item in a batch extraction job.
|
|
*/
|
|
typedef struct KREUZBERGBatchBytesItem KREUZBERGBatchBytesItem;
|
|
/**
|
|
* Batch item for file extraction.
|
|
*
|
|
* Used with `batch_extract_files` and `batch_extract_files_sync`
|
|
* to represent a single file in a batch extraction job.
|
|
*/
|
|
typedef struct KREUZBERGBatchFileItem KREUZBERGBatchFileItem;
|
|
/**
|
|
* BibTeX bibliography metadata.
|
|
*/
|
|
typedef struct KREUZBERGBibtexMetadata KREUZBERGBibtexMetadata;
|
|
/**
|
|
* Types of block-level elements in Djot.
|
|
*/
|
|
typedef struct KREUZBERGBlockType KREUZBERGBlockType;
|
|
/**
|
|
* Bounding box coordinates for element positioning.
|
|
*/
|
|
typedef struct KREUZBERGBoundingBox KREUZBERGBoundingBox;
|
|
typedef struct KREUZBERGCacheStats KREUZBERGCacheStats;
|
|
/**
|
|
* A single changed cell within a table.
|
|
*
|
|
* Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
|
|
* reference it unconditionally, without requiring the `diff` Cargo feature.
|
|
* `crate::diff` re-exports this type verbatim.
|
|
*/
|
|
typedef struct KREUZBERGCellChange KREUZBERGCellChange;
|
|
/**
|
|
* A text chunk with optional embedding and metadata.
|
|
*
|
|
* Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
|
|
* contains the text content, optional embedding vector (if embedding generation
|
|
* is configured), and metadata about its position in the document.
|
|
*/
|
|
typedef struct KREUZBERGChunk KREUZBERGChunk;
|
|
/**
|
|
* Metadata about a chunk's position in the original document.
|
|
*/
|
|
typedef struct KREUZBERGChunkMetadata KREUZBERGChunkMetadata;
|
|
/**
|
|
* How chunk size is measured.
|
|
*
|
|
* Defaults to `Characters` (Unicode character count). When using token-based sizing,
|
|
* chunks are sized by token count according to the specified tokenizer.
|
|
*
|
|
* Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
|
|
* available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
|
|
* (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
|
|
*/
|
|
typedef struct KREUZBERGChunkSizing KREUZBERGChunkSizing;
|
|
/**
|
|
* Semantic structural classification of a text chunk.
|
|
*
|
|
* Assigned by the heuristic classifier in `chunking::classifier`.
|
|
* Defaults to `Unknown` when no rule matches.
|
|
* Designed to be extended in future versions without breaking changes.
|
|
*/
|
|
typedef struct KREUZBERGChunkType KREUZBERGChunkType;
|
|
/**
|
|
* Type of text chunker to use.
|
|
*
|
|
* # Variants
|
|
*
|
|
* * `Text` - Generic text splitter, splits on whitespace and punctuation
|
|
* * `Markdown` - Markdown-aware splitter, preserves formatting and structure
|
|
* * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
|
|
* * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
|
|
* embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
|
|
* lower = more splits). Without an embedding, falls back to a
|
|
* structural-boundary heuristic (ALL-CAPS headers, numbered sections,
|
|
* blank-line paragraphs) and merges groups into chunks capped at
|
|
* `max_characters` (default 1000). `topic_threshold` has no effect in the
|
|
* fallback path. For best results, pair with an embedding model.
|
|
*/
|
|
typedef struct KREUZBERGChunkerType KREUZBERGChunkerType;
|
|
/**
|
|
* Chunking configuration.
|
|
*
|
|
* Configures text chunking for document content, including chunk size,
|
|
* overlap, trimming behavior, and optional embeddings.
|
|
*
|
|
* Use `..Default::default()` when constructing to allow for future field additions:
|
|
* ```rust
|
|
* let config = ChunkingConfig {
|
|
* max_characters: 500,
|
|
* ..Default::default()
|
|
* };
|
|
* ```
|
|
*/
|
|
typedef struct KREUZBERGChunkingConfig KREUZBERGChunkingConfig;
|
|
/**
|
|
* Citation file metadata (RIS, PubMed, EndNote).
|
|
*/
|
|
typedef struct KREUZBERGCitationMetadata KREUZBERGCitationMetadata;
|
|
/**
|
|
* Content rendering mode for code extraction.
|
|
*
|
|
* Controls how extracted code content is represented in the `content` field
|
|
* of `ExtractionResult`.
|
|
*/
|
|
typedef struct KREUZBERGCodeContentMode KREUZBERGCodeContentMode;
|
|
/**
|
|
* Cross-extractor content filtering configuration.
|
|
*
|
|
* Controls whether "furniture" content (headers, footers, page numbers,
|
|
* watermarks, repeating text) is included in or stripped from extraction
|
|
* results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
|
|
* with format-specific implementation.
|
|
*
|
|
* When `None` on `ExtractionConfig`, each extractor uses its current
|
|
* default behavior unchanged.
|
|
*/
|
|
typedef struct KREUZBERGContentFilterConfig KREUZBERGContentFilterConfig;
|
|
/**
|
|
* Content layer classification for document nodes.
|
|
*
|
|
* Replaces separate body/furniture arrays with per-node granularity.
|
|
*/
|
|
typedef struct KREUZBERGContentLayer KREUZBERGContentLayer;
|
|
/**
|
|
* JATS contributor with role.
|
|
*/
|
|
typedef struct KREUZBERGContributorRole KREUZBERGContributorRole;
|
|
/**
|
|
* Dublin Core metadata from docProps/core.xml
|
|
*
|
|
* Contains standard metadata fields defined by the Dublin Core standard
|
|
* and Office-specific extensions.
|
|
*/
|
|
typedef struct KREUZBERGCoreProperties KREUZBERGCoreProperties;
|
|
/**
|
|
* CSV/TSV file metadata.
|
|
*/
|
|
typedef struct KREUZBERGCsvMetadata KREUZBERGCsvMetadata;
|
|
/**
|
|
* dBASE field information.
|
|
*/
|
|
typedef struct KREUZBERGDbfFieldInfo KREUZBERGDbfFieldInfo;
|
|
/**
|
|
* dBASE (DBF) file metadata.
|
|
*/
|
|
typedef struct KREUZBERGDbfMetadata KREUZBERGDbfMetadata;
|
|
/**
|
|
* MIME type detection response.
|
|
*/
|
|
typedef struct KREUZBERGDetectResponse KREUZBERGDetectResponse;
|
|
/**
|
|
* Page-level detection result containing all detections and page metadata.
|
|
*/
|
|
typedef struct KREUZBERGDetectionResult KREUZBERGDetectionResult;
|
|
/**
|
|
* A single contiguous hunk in a unified diff.
|
|
*/
|
|
typedef struct KREUZBERGDiffHunk KREUZBERGDiffHunk;
|
|
/**
|
|
* A single line in a unified-diff hunk.
|
|
*
|
|
* Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
|
|
* reference it unconditionally, without requiring the `diff` Cargo feature.
|
|
* `crate::diff` re-exports this type verbatim.
|
|
*/
|
|
typedef struct KREUZBERGDiffLine KREUZBERGDiffLine;
|
|
/**
|
|
* Options controlling how two `ExtractionResult` values are compared.
|
|
*/
|
|
typedef struct KREUZBERGDiffOptions KREUZBERGDiffOptions;
|
|
/**
|
|
* Comprehensive Djot document structure with semantic preservation.
|
|
*
|
|
* This type captures the full richness of Djot markup, including:
|
|
* - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
|
|
* - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
|
|
* - Attributes (classes, IDs, key-value pairs)
|
|
* - Links, images, footnotes
|
|
* - Math expressions (inline and display)
|
|
* - Tables with full structure
|
|
*
|
|
* Available when the `djot` feature is enabled.
|
|
*/
|
|
typedef struct KREUZBERGDjotContent KREUZBERGDjotContent;
|
|
/**
|
|
* Image element in Djot.
|
|
*/
|
|
typedef struct KREUZBERGDjotImage KREUZBERGDjotImage;
|
|
/**
|
|
* Link element in Djot.
|
|
*/
|
|
typedef struct KREUZBERGDjotLink KREUZBERGDjotLink;
|
|
/**
|
|
* Trait for document extractor plugins.
|
|
*
|
|
* Implement this trait to add support for new document formats or to override
|
|
* built-in extraction behavior with custom logic.
|
|
*
|
|
* # Return Type
|
|
*
|
|
* Extractors return `InternalDocument`, a flat intermediate representation.
|
|
* The pipeline converts this into the public `ExtractionResult` via the
|
|
* derivation step.
|
|
*
|
|
* # Priority System
|
|
*
|
|
* When multiple extractors support the same MIME type, the registry selects
|
|
* the extractor with the highest priority value. Use this to:
|
|
* - Override built-in extractors (priority > 50)
|
|
* - Provide fallback extractors (priority < 50)
|
|
* - Implement specialized extractors for specific use cases
|
|
*
|
|
* Default priority is 50.
|
|
*
|
|
* # Thread Safety
|
|
*
|
|
* Extractors must be thread-safe (`Send + Sync`) to support concurrent extraction.
|
|
* \\code
|
|
* use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
* use kreuzberg::{Result, ExtractionConfig};
|
|
* use kreuzberg::types::internal::InternalDocument;
|
|
* use async_trait::async_trait;
|
|
* use std::path::Path;
|
|
*
|
|
* /// Custom PDF extractor with premium features
|
|
* struct PremiumPdfExtractor;
|
|
*
|
|
* impl Plugin for PremiumPdfExtractor {
|
|
* fn name(&self) -> &str { "premium-pdf" }
|
|
* fn version(&self) -> String { "2.0.0".to_string() }
|
|
* fn initialize(&self) -> Result<()> { Ok(()) }
|
|
* fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
* }
|
|
*
|
|
* #[async_trait]
|
|
* impl DocumentExtractor for PremiumPdfExtractor {
|
|
* async fn extract_bytes(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
|
|
* -> Result<InternalDocument> {
|
|
* // Premium extraction logic with better accuracy
|
|
* let mut doc = InternalDocument::new("pdf");
|
|
* // ... populate doc.elements, doc.metadata, etc.
|
|
* Ok(doc)
|
|
* }
|
|
*
|
|
* fn supported_mime_types(&self) -> &[&str] {
|
|
* &["application/pdf"]
|
|
* }
|
|
*
|
|
* fn priority(&self) -> i32 {
|
|
* 100 // Higher than default (50) - will be preferred
|
|
* }
|
|
* }
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGDocumentExtractor KREUZBERGDocumentExtractor;
|
|
/**
|
|
* A single node in the document tree.
|
|
*
|
|
* Each node has deterministic `id`, typed `content`, optional `parent`/`children`
|
|
* for tree structure, and metadata like page number, bounding box, and content layer.
|
|
*/
|
|
typedef struct KREUZBERGDocumentNode KREUZBERGDocumentNode;
|
|
/**
|
|
* A resolved relationship between two nodes in the document tree.
|
|
*/
|
|
typedef struct KREUZBERGDocumentRelationship KREUZBERGDocumentRelationship;
|
|
/**
|
|
* A single tracked change embedded in a document.
|
|
*
|
|
* Populated by per-format extractors that understand change-tracking metadata
|
|
* (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
|
|
* extractor defaults to `ExtractionResult.revisions = None` until a
|
|
* format-specific implementation is added.
|
|
*/
|
|
typedef struct KREUZBERGDocumentRevision KREUZBERGDocumentRevision;
|
|
/**
|
|
* Top-level structured document representation.
|
|
*
|
|
* A flat array of nodes with index-based parent/child references forming a tree.
|
|
* Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
|
|
* to iterate over top-level content by layer.
|
|
*
|
|
* # Validation
|
|
*
|
|
* Call `validate()` after construction to verify all node indices are in bounds
|
|
* and parent-child relationships are bidirectionally consistent.
|
|
*/
|
|
typedef struct KREUZBERGDocumentStructure KREUZBERGDocumentStructure;
|
|
/**
|
|
* Application properties from docProps/app.xml for DOCX
|
|
*
|
|
* Contains Word-specific document statistics and metadata.
|
|
*/
|
|
typedef struct KREUZBERGDocxAppProperties KREUZBERGDocxAppProperties;
|
|
/**
|
|
* Word document metadata.
|
|
*
|
|
* Extracted from DOCX files using shared Office Open XML metadata extraction.
|
|
* Integrates with `office_metadata` module for core/app/custom properties.
|
|
*/
|
|
typedef struct KREUZBERGDocxMetadata KREUZBERGDocxMetadata;
|
|
/**
|
|
* Semantic element extracted from document.
|
|
*
|
|
* Represents a logical unit of content with semantic classification,
|
|
* unique identifier, and metadata for tracking origin and position.
|
|
*/
|
|
typedef struct KREUZBERGElement KREUZBERGElement;
|
|
/**
|
|
* Metadata for a semantic element.
|
|
*/
|
|
typedef struct KREUZBERGElementMetadata KREUZBERGElementMetadata;
|
|
/**
|
|
* Semantic element type classification.
|
|
*
|
|
* Categorizes text content into semantic units for downstream processing.
|
|
* Supports the element types commonly found in Unstructured documents.
|
|
*/
|
|
typedef struct KREUZBERGElementType KREUZBERGElementType;
|
|
/**
|
|
* Email attachment representation.
|
|
*
|
|
* Contains metadata and optionally the content of an email attachment.
|
|
*/
|
|
typedef struct KREUZBERGEmailAttachment KREUZBERGEmailAttachment;
|
|
/**
|
|
* Configuration for email extraction.
|
|
*/
|
|
typedef struct KREUZBERGEmailConfig KREUZBERGEmailConfig;
|
|
/**
|
|
* Email extraction result.
|
|
*
|
|
* Complete representation of an extracted email message (.eml or .msg)
|
|
* including headers, body content, and attachments.
|
|
*/
|
|
typedef struct KREUZBERGEmailExtractionResult KREUZBERGEmailExtractionResult;
|
|
/**
|
|
* Email metadata extracted from .eml and .msg files.
|
|
*
|
|
* Includes sender/recipient information, message ID, and attachment list.
|
|
*/
|
|
typedef struct KREUZBERGEmailMetadata KREUZBERGEmailMetadata;
|
|
/**
|
|
* Changes to embedded archive children between two results.
|
|
*/
|
|
typedef struct KREUZBERGEmbeddedChanges KREUZBERGEmbeddedChanges;
|
|
/**
|
|
* Diff for a single embedded archive entry that appears in both results.
|
|
*/
|
|
typedef struct KREUZBERGEmbeddedDiff KREUZBERGEmbeddedDiff;
|
|
/**
|
|
* Embedded file descriptor extracted from the PDF name tree.
|
|
*/
|
|
typedef struct KREUZBERGEmbeddedFile KREUZBERGEmbeddedFile;
|
|
/**
|
|
* Trait for in-process embedding backend plugins.
|
|
*
|
|
* Async to match the convention used by `OcrBackend`,
|
|
* `DocumentExtractor`, and `PostProcessor`.
|
|
* Host-language bridges (PyO3, napi-rs, Rustler, extendr, magnus, ext-php-rs,
|
|
* C FFI, etc.) wrap their synchronous host callables in `spawn_blocking` or the
|
|
* equivalent to satisfy the async signature.
|
|
*
|
|
* # Thread safety
|
|
*
|
|
* Backends must be `Send + Sync + 'static`. They are stored in
|
|
* `Arc<dyn EmbeddingBackend>` and called concurrently from kreuzberg's chunking
|
|
* pipeline. If the backend's underlying model isn't thread-safe, the backend
|
|
* itself must serialize access internally (e.g. via `Mutex<Inner>`).
|
|
*
|
|
* # Contract
|
|
*
|
|
* - `embed(texts)` MUST return exactly `texts.len()` vectors, each of length
|
|
* `self.dimensions()`. The dispatcher in `embed_texts`
|
|
* validates this before returning to downstream consumers; a non-conforming
|
|
* backend surfaces as a `KreuzbergError::Validation`, not a panic.
|
|
* - `embed` may be called from any thread. Its future must be `Send`
|
|
* (enforced by `async_trait` when `#[async_trait]` is used on non-WASM targets).
|
|
* - `dimensions()` is called exactly once at registration, immediately after
|
|
* `initialize()` succeeds. The returned value is cached by the registry and
|
|
* used for all subsequent shape validation. Lazy-loading implementations can
|
|
* defer model loading into `initialize()` and report the real dimension
|
|
* afterwards. Later mutations of the backend's reported dimension are not
|
|
* observed by kreuzberg — implementations that need to change dimension
|
|
* must unregister and re-register.
|
|
* - `shutdown()` (inherited from `Plugin`) may be invoked
|
|
* concurrently with an in-flight `embed()` call. Implementations must
|
|
* tolerate this — e.g. by letting in-flight calls finish using resources
|
|
* held via the `Arc<dyn EmbeddingBackend>` reference, and only releasing
|
|
* shared state that isn't needed by `embed`.
|
|
*
|
|
* # Runtime
|
|
*
|
|
* The synchronous `embed_texts` entry uses
|
|
* [`tokio::task::block_in_place`] to await the trait's async `embed`, which
|
|
* requires a multi-thread tokio runtime. Callers running inside a
|
|
* `current_thread` runtime (e.g. `#[tokio::test]` without `flavor = "multi_thread"`,
|
|
* or `tokio::runtime::Builder::new_current_thread()`) must use
|
|
* `embed_texts_async` instead, which awaits directly without
|
|
* `block_in_place`.
|
|
*/
|
|
typedef struct KREUZBERGEmbeddingBackend KREUZBERGEmbeddingBackend;
|
|
/**
|
|
* Embedding configuration for text chunks.
|
|
*
|
|
* Configures embedding generation using ONNX models via the vendored embedding engine.
|
|
* Requires the `embeddings` feature to be enabled.
|
|
*/
|
|
typedef struct KREUZBERGEmbeddingConfig KREUZBERGEmbeddingConfig;
|
|
/**
|
|
* Embedding model types supported by Kreuzberg.
|
|
*/
|
|
typedef struct KREUZBERGEmbeddingModelType KREUZBERGEmbeddingModelType;
|
|
/**
|
|
* Preset configurations for common RAG use cases.
|
|
*
|
|
* Each preset combines chunk size, overlap, and embedding model
|
|
* to provide an optimized configuration for specific scenarios.
|
|
*
|
|
* All string fields are owned `String` for FFI compatibility — instances
|
|
* are safe to clone and pass across language boundaries.
|
|
*/
|
|
typedef struct KREUZBERGEmbeddingPreset KREUZBERGEmbeddingPreset;
|
|
/**
|
|
* EPUB metadata (Dublin Core extensions).
|
|
*/
|
|
typedef struct KREUZBERGEpubMetadata KREUZBERGEpubMetadata;
|
|
/**
|
|
* Error metadata (for batch operations).
|
|
*/
|
|
typedef struct KREUZBERGErrorMetadata KREUZBERGErrorMetadata;
|
|
/**
|
|
* Excel/spreadsheet format metadata.
|
|
*
|
|
* Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
|
|
* discriminant. Sheet count and sheet names are stored inside this struct.
|
|
*/
|
|
typedef struct KREUZBERGExcelMetadata KREUZBERGExcelMetadata;
|
|
/**
|
|
* Single Excel worksheet.
|
|
*
|
|
* Represents one sheet from an Excel workbook with its content
|
|
* converted to Markdown format and dimensional statistics.
|
|
*/
|
|
typedef struct KREUZBERGExcelSheet KREUZBERGExcelSheet;
|
|
/**
|
|
* Excel workbook representation.
|
|
*
|
|
* Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
|
|
* extracted content and metadata.
|
|
*/
|
|
typedef struct KREUZBERGExcelWorkbook KREUZBERGExcelWorkbook;
|
|
/**
|
|
* ONNX Runtime execution provider type.
|
|
*
|
|
* Determines which hardware backend is used for model inference.
|
|
* `Auto` (default) selects the best available provider per platform.
|
|
*/
|
|
typedef struct KREUZBERGExecutionProviderType KREUZBERGExecutionProviderType;
|
|
/**
|
|
* Extracted image from a document.
|
|
*
|
|
* Contains raw image data, metadata, and optional nested OCR results.
|
|
* Raw bytes allow cross-language compatibility - users can convert to
|
|
* PIL.Image (Python), Sharp (Node.js), or other formats as needed.
|
|
*/
|
|
typedef struct KREUZBERGExtractedImage KREUZBERGExtractedImage;
|
|
/**
|
|
* A URI extracted from a document.
|
|
*
|
|
* Represents any link, reference, or resource pointer found during extraction.
|
|
* The `kind` field classifies the URI semantically, while `label` carries
|
|
* optional human-readable display text.
|
|
*/
|
|
typedef struct KREUZBERGExtractedUri KREUZBERGExtractedUri;
|
|
/**
|
|
* Main extraction configuration.
|
|
*
|
|
* This struct contains all configuration options for the extraction process.
|
|
* It can be loaded from TOML, YAML, or JSON files, or created programmatically.
|
|
* \\code
|
|
* use kreuzberg::core::config::ExtractionConfig;
|
|
*
|
|
* // Create with defaults
|
|
* let config = ExtractionConfig::default();
|
|
*
|
|
* // Load from TOML file
|
|
* // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGExtractionConfig KREUZBERGExtractionConfig;
|
|
/**
|
|
* The complete diff between two `ExtractionResult` values.
|
|
*/
|
|
typedef struct KREUZBERGExtractionDiff KREUZBERGExtractionDiff;
|
|
/**
|
|
* How the extracted text was produced.
|
|
*/
|
|
typedef struct KREUZBERGExtractionMethod KREUZBERGExtractionMethod;
|
|
/**
|
|
* General extraction result used by the core extraction API.
|
|
*
|
|
* This is the main result type returned by all extraction functions.
|
|
*/
|
|
typedef struct KREUZBERGExtractionResult KREUZBERGExtractionResult;
|
|
/**
|
|
* FictionBook (FB2) metadata.
|
|
*/
|
|
typedef struct KREUZBERGFictionBookMetadata KREUZBERGFictionBookMetadata;
|
|
/**
|
|
* Per-file extraction configuration overrides for batch processing.
|
|
*
|
|
* All fields are `Option<T>` — `None` means "use the batch-level default."
|
|
* This type is used with `batch_extract_files` and
|
|
* `batch_extract_bytes` to allow heterogeneous
|
|
* extraction settings within a single batch.
|
|
*
|
|
* # Excluded Fields
|
|
*
|
|
* The following `ExtractionConfig` fields are batch-level only and
|
|
* cannot be overridden per file:
|
|
* - `max_concurrent_extractions` — controls batch parallelism
|
|
* - `use_cache` — global caching policy
|
|
* - `acceleration` — shared ONNX execution provider
|
|
* - `security_limits` — global archive security policy
|
|
* \\code
|
|
* use kreuzberg::FileExtractionConfig;
|
|
*
|
|
* // Override just OCR forcing for a specific file
|
|
* let config = FileExtractionConfig {
|
|
* force_ocr: Some(true),
|
|
* ..Default::default()
|
|
* };
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGFileExtractionConfig KREUZBERGFileExtractionConfig;
|
|
/**
|
|
* Footnote in Djot.
|
|
*/
|
|
typedef struct KREUZBERGFootnote KREUZBERGFootnote;
|
|
/**
|
|
* Format-specific metadata (discriminated union).
|
|
*
|
|
* Only one format type can exist per extraction result. This provides
|
|
* type-safe, clean metadata without nested optionals.
|
|
*/
|
|
typedef struct KREUZBERGFormatMetadata KREUZBERGFormatMetadata;
|
|
/**
|
|
* Block-level element in a Djot document.
|
|
*
|
|
* Represents structural elements like headings, paragraphs, lists, code blocks, etc.
|
|
*/
|
|
typedef struct KREUZBERGFormattedBlock KREUZBERGFormattedBlock;
|
|
/**
|
|
* Individual grid cell with position and span metadata.
|
|
*/
|
|
typedef struct KREUZBERGGridCell KREUZBERGGridCell;
|
|
/**
|
|
* Header/heading element metadata.
|
|
*/
|
|
typedef struct KREUZBERGHeaderMetadata KREUZBERGHeaderMetadata;
|
|
/**
|
|
* Heading context for a chunk within a Markdown document.
|
|
*
|
|
* Contains the heading hierarchy from document root to this chunk's section.
|
|
*/
|
|
typedef struct KREUZBERGHeadingContext KREUZBERGHeadingContext;
|
|
/**
|
|
* A single heading in the hierarchy.
|
|
*/
|
|
typedef struct KREUZBERGHeadingLevel KREUZBERGHeadingLevel;
|
|
/**
|
|
* A text block with hierarchy level assignment.
|
|
*
|
|
* Represents a block of text with semantic heading information extracted from
|
|
* font size clustering and hierarchical analysis.
|
|
*/
|
|
typedef struct KREUZBERGHierarchicalBlock KREUZBERGHierarchicalBlock;
|
|
/**
|
|
* Hierarchy extraction configuration for PDF text structure analysis.
|
|
*
|
|
* Enables extraction of document hierarchy levels (H1-H6) based on font size
|
|
* clustering and semantic analysis. When enabled, hierarchical blocks are
|
|
* included in page content.
|
|
*/
|
|
typedef struct KREUZBERGHierarchyConfig KREUZBERGHierarchyConfig;
|
|
/**
|
|
* HTML metadata extracted from HTML documents.
|
|
*
|
|
* Includes document-level metadata, Open Graph data, Twitter Card metadata,
|
|
* and extracted structural elements (headers, links, images, structured data).
|
|
*/
|
|
typedef struct KREUZBERGHtmlMetadata KREUZBERGHtmlMetadata;
|
|
/**
|
|
* Configuration for styled HTML output.
|
|
*
|
|
* When set on [`ExtractionConfig::html_output`] alongside
|
|
* `output_format = OutputFormat::Html`, the pipeline builds a
|
|
* `StyledHtmlRenderer` (crate::rendering::StyledHtmlRenderer) instead of
|
|
* the plain comrak-based renderer.
|
|
* \\code
|
|
* use kreuzberg::core::config::{HtmlOutputConfig, HtmlTheme};
|
|
*
|
|
* let config = HtmlOutputConfig {
|
|
* theme: HtmlTheme::GitHub,
|
|
* css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
|
|
* ..Default::default()
|
|
* };
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGHtmlOutputConfig KREUZBERGHtmlOutputConfig;
|
|
/**
|
|
* Built-in HTML theme selection.
|
|
*/
|
|
typedef struct KREUZBERGHtmlTheme KREUZBERGHtmlTheme;
|
|
/**
|
|
* Image extraction configuration.
|
|
*/
|
|
typedef struct KREUZBERGImageExtractionConfig KREUZBERGImageExtractionConfig;
|
|
/**
|
|
* Heuristic classification of what an image likely depicts.
|
|
*/
|
|
typedef struct KREUZBERGImageKind KREUZBERGImageKind;
|
|
/**
|
|
* Image metadata extracted from image files.
|
|
*
|
|
* Includes dimensions, format, and EXIF data.
|
|
*/
|
|
typedef struct KREUZBERGImageMetadata KREUZBERGImageMetadata;
|
|
/**
|
|
* Image element metadata.
|
|
*/
|
|
typedef struct KREUZBERGImageMetadataType KREUZBERGImageMetadataType;
|
|
/**
|
|
* Image preprocessing configuration for OCR.
|
|
*
|
|
* These settings control how images are preprocessed before OCR to improve
|
|
* text recognition quality. Different preprocessing strategies work better
|
|
* for different document types.
|
|
*/
|
|
typedef struct KREUZBERGImagePreprocessingConfig KREUZBERGImagePreprocessingConfig;
|
|
/**
|
|
* Image preprocessing metadata.
|
|
*
|
|
* Tracks the transformations applied to an image during OCR preprocessing,
|
|
* including DPI normalization, resizing, and resampling.
|
|
*/
|
|
typedef struct KREUZBERGImagePreprocessingMetadata KREUZBERGImagePreprocessingMetadata;
|
|
/**
|
|
* Image type classification.
|
|
*/
|
|
typedef struct KREUZBERGImageType KREUZBERGImageType;
|
|
/**
|
|
* Inline element within a block.
|
|
*
|
|
* Represents text with formatting, links, images, etc.
|
|
*/
|
|
typedef struct KREUZBERGInlineElement KREUZBERGInlineElement;
|
|
/**
|
|
* Types of inline elements in Djot.
|
|
*/
|
|
typedef struct KREUZBERGInlineType KREUZBERGInlineType;
|
|
/**
|
|
* JATS (Journal Article Tag Suite) metadata.
|
|
*/
|
|
typedef struct KREUZBERGJatsMetadata KREUZBERGJatsMetadata;
|
|
/**
|
|
* Extracted keyword with metadata.
|
|
*/
|
|
typedef struct KREUZBERGKeyword KREUZBERGKeyword;
|
|
/**
|
|
* Keyword algorithm selection.
|
|
*/
|
|
typedef struct KREUZBERGKeywordAlgorithm KREUZBERGKeywordAlgorithm;
|
|
/**
|
|
* Keyword extraction configuration.
|
|
*/
|
|
typedef struct KREUZBERGKeywordConfig KREUZBERGKeywordConfig;
|
|
/**
|
|
* Language detection configuration.
|
|
*/
|
|
typedef struct KREUZBERGLanguageDetectionConfig KREUZBERGLanguageDetectionConfig;
|
|
/**
|
|
* The 17 canonical document layout classes.
|
|
*
|
|
* All model backends (RT-DETR, YOLO, etc.) map their native class IDs
|
|
* to this shared set. Models with fewer classes (DocLayNet: 11, PubLayNet: 5)
|
|
* map to the closest equivalent.
|
|
*
|
|
* Wire format is snake_case in all serializers (JSON, TOML, YAML).
|
|
*/
|
|
typedef struct KREUZBERGLayoutClass KREUZBERGLayoutClass;
|
|
/**
|
|
* A single layout detection result.
|
|
*/
|
|
typedef struct KREUZBERGLayoutDetection KREUZBERGLayoutDetection;
|
|
/**
|
|
* Layout detection configuration.
|
|
*
|
|
* Controls layout detection behavior in the extraction pipeline.
|
|
* When set on `ExtractionConfig` (super::ExtractionConfig), layout detection
|
|
* is enabled for PDF extraction.
|
|
*/
|
|
typedef struct KREUZBERGLayoutDetectionConfig KREUZBERGLayoutDetectionConfig;
|
|
/**
|
|
* A detected layout region on a page.
|
|
*
|
|
* When layout detection is enabled, each page may have layout regions
|
|
* identifying different content types (text, pictures, tables, etc.)
|
|
* with confidence scores and spatial positions.
|
|
*/
|
|
typedef struct KREUZBERGLayoutRegion KREUZBERGLayoutRegion;
|
|
/**
|
|
* Link element metadata.
|
|
*/
|
|
typedef struct KREUZBERGLinkMetadata KREUZBERGLinkMetadata;
|
|
/**
|
|
* Link type classification.
|
|
*/
|
|
typedef struct KREUZBERGLinkType KREUZBERGLinkType;
|
|
/**
|
|
* Type of list detection.
|
|
*/
|
|
typedef struct KREUZBERGListType KREUZBERGListType;
|
|
/**
|
|
* Configuration for an LLM provider/model via liter-llm.
|
|
*
|
|
* Each feature (VLM OCR, VLM embeddings, structured extraction) carries
|
|
* its own `LlmConfig`, allowing different providers per feature.
|
|
* \\code
|
|
* [structured_extraction.llm]
|
|
* model = "openai/gpt-4o"
|
|
* api_key = "sk-..." # or use KREUZBERG_LLM_API_KEY env var
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGLlmConfig KREUZBERGLlmConfig;
|
|
/**
|
|
* Token usage and cost data for a single LLM call made during extraction.
|
|
*
|
|
* Populated when VLM OCR, structured extraction, or LLM-based embeddings
|
|
* are used. Multiple entries may be present when multiple LLM calls occur
|
|
* within one extraction (e.g. VLM OCR + structured extraction).
|
|
*/
|
|
typedef struct KREUZBERGLlmUsage KREUZBERGLlmUsage;
|
|
/**
|
|
* Extraction result metadata.
|
|
*
|
|
* Contains common fields applicable to all formats, format-specific metadata
|
|
* via a discriminated union, and additional custom fields from postprocessors.
|
|
*/
|
|
typedef struct KREUZBERGMetadata KREUZBERGMetadata;
|
|
/**
|
|
* Combined paths to all models needed for OCR (backward compatibility).
|
|
*/
|
|
typedef struct KREUZBERGModelPaths KREUZBERGModelPaths;
|
|
/**
|
|
* Tagged enum for node content. Each variant carries only type-specific data.
|
|
*
|
|
* Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
|
|
* Go/Java/TypeScript bindings.
|
|
*/
|
|
typedef struct KREUZBERGNodeContent KREUZBERGNodeContent;
|
|
/**
|
|
* Trait for OCR backend plugins.
|
|
*
|
|
* Implement this trait to add custom OCR capabilities. OCR backends can be:
|
|
* - Native Rust implementations (like Tesseract)
|
|
* - FFI bridges to Python libraries (like EasyOCR, PaddleOCR)
|
|
* - Cloud-based OCR services (Google Vision, AWS Textract, etc.)
|
|
*
|
|
* # Thread Safety
|
|
*
|
|
* OCR backends must be thread-safe (`Send + Sync`) to support concurrent processing.
|
|
* \\code
|
|
* use kreuzberg::plugins::{Plugin, OcrBackend, OcrBackendType};
|
|
* use kreuzberg::{Result, OcrConfig};
|
|
* use async_trait::async_trait;
|
|
* use std::borrow::Cow;
|
|
* use std::path::Path;
|
|
* use kreuzberg::types::{ExtractionResult, Metadata};
|
|
*
|
|
* struct CustomOcrBackend;
|
|
*
|
|
* impl Plugin for CustomOcrBackend {
|
|
* fn name(&self) -> &str { "custom-ocr" }
|
|
* fn version(&self) -> String { "1.0.0".to_string() }
|
|
* fn initialize(&self) -> Result<()> { Ok(()) }
|
|
* fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
* }
|
|
*
|
|
* #[async_trait]
|
|
* impl OcrBackend for CustomOcrBackend {
|
|
* async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
|
|
* // Implement OCR logic here
|
|
* Ok(ExtractionResult {
|
|
* content: "Extracted text".to_string(),
|
|
* mime_type: Cow::Borrowed("text/plain"),
|
|
* ..Default::default()
|
|
* })
|
|
* }
|
|
*
|
|
* async fn process_image_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
|
|
* let bytes = std::fs::read(path)?;
|
|
* self.process_image(&bytes, config).await
|
|
* }
|
|
*
|
|
* fn supports_language(&self, lang: &str) -> bool {
|
|
* matches!(lang, "eng" | "deu" | "fra")
|
|
* }
|
|
*
|
|
* fn backend_type(&self) -> OcrBackendType {
|
|
* OcrBackendType::Custom
|
|
* }
|
|
* }
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGOcrBackend KREUZBERGOcrBackend;
|
|
/**
|
|
* OCR backend types.
|
|
*/
|
|
typedef struct KREUZBERGOcrBackendType KREUZBERGOcrBackendType;
|
|
/**
|
|
* Bounding geometry for an OCR element.
|
|
*
|
|
* Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
|
|
* (from PaddleOCR and rotated text detection).
|
|
*/
|
|
typedef struct KREUZBERGOcrBoundingGeometry KREUZBERGOcrBoundingGeometry;
|
|
/**
|
|
* Confidence scores for an OCR element.
|
|
*
|
|
* Separates detection confidence (how confident that text exists at this location)
|
|
* from recognition confidence (how confident about the actual text content).
|
|
*/
|
|
typedef struct KREUZBERGOcrConfidence KREUZBERGOcrConfidence;
|
|
/**
|
|
* OCR configuration.
|
|
*/
|
|
typedef struct KREUZBERGOcrConfig KREUZBERGOcrConfig;
|
|
/**
|
|
* A unified OCR element representing detected text with full metadata.
|
|
*
|
|
* This is the primary type for structured OCR output, preserving all information
|
|
* from both Tesseract and PaddleOCR backends.
|
|
*/
|
|
typedef struct KREUZBERGOcrElement KREUZBERGOcrElement;
|
|
/**
|
|
* Configuration for OCR element extraction.
|
|
*
|
|
* Controls how OCR elements are extracted and filtered.
|
|
*/
|
|
typedef struct KREUZBERGOcrElementConfig KREUZBERGOcrElementConfig;
|
|
/**
|
|
* Hierarchical level of an OCR element.
|
|
*
|
|
* Maps to Tesseract's page segmentation hierarchy and provides
|
|
* equivalent semantics for PaddleOCR.
|
|
*/
|
|
typedef struct KREUZBERGOcrElementLevel KREUZBERGOcrElementLevel;
|
|
/**
|
|
* OCR extraction result.
|
|
*
|
|
* Result of performing OCR on an image or scanned document,
|
|
* including recognized text and detected tables.
|
|
*/
|
|
typedef struct KREUZBERGOcrExtractionResult KREUZBERGOcrExtractionResult;
|
|
/**
|
|
* OCR processing metadata.
|
|
*
|
|
* Captures information about OCR processing configuration and results.
|
|
*/
|
|
typedef struct KREUZBERGOcrMetadata KREUZBERGOcrMetadata;
|
|
/**
|
|
* Multi-backend OCR pipeline with quality-based fallback.
|
|
*
|
|
* Backends are tried in priority order (highest first). After each backend
|
|
* produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
|
|
* the result is accepted. Otherwise the next backend is tried.
|
|
*/
|
|
typedef struct KREUZBERGOcrPipelineConfig KREUZBERGOcrPipelineConfig;
|
|
/**
|
|
* A single backend stage in the OCR pipeline.
|
|
*/
|
|
typedef struct KREUZBERGOcrPipelineStage KREUZBERGOcrPipelineStage;
|
|
/**
|
|
* Quality thresholds for OCR fallback decisions and pipeline quality gating.
|
|
*
|
|
* All fields default to the values that match the previous hardcoded behavior,
|
|
* so `OcrQualityThresholds::default()` preserves existing semantics exactly.
|
|
*/
|
|
typedef struct KREUZBERGOcrQualityThresholds KREUZBERGOcrQualityThresholds;
|
|
/**
|
|
* Rotation information for an OCR element.
|
|
*/
|
|
typedef struct KREUZBERGOcrRotation KREUZBERGOcrRotation;
|
|
/**
|
|
* Table detected via OCR.
|
|
*
|
|
* Represents a table structure recognized during OCR processing.
|
|
*/
|
|
typedef struct KREUZBERGOcrTable KREUZBERGOcrTable;
|
|
/**
|
|
* Bounding box for an OCR-detected table in pixel coordinates.
|
|
*/
|
|
typedef struct KREUZBERGOcrTableBoundingBox KREUZBERGOcrTableBoundingBox;
|
|
/**
|
|
* Document orientation detection result.
|
|
*/
|
|
typedef struct KREUZBERGOrientationResult KREUZBERGOrientationResult;
|
|
/**
|
|
* Output format for extraction results.
|
|
*
|
|
* Controls the format of the `content` field in `ExtractionResult`.
|
|
* When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
|
|
* `Plain` returns the raw extracted text.
|
|
* `Structured` returns JSON with full OCR element data including bounding
|
|
* boxes and confidence scores.
|
|
*/
|
|
typedef struct KREUZBERGOutputFormat KREUZBERGOutputFormat;
|
|
/**
|
|
* Page Segmentation Mode for Tesseract OCR
|
|
*/
|
|
typedef struct KREUZBERGPSMMode KREUZBERGPSMMode;
|
|
/**
|
|
* Supported languages in PaddleOCR.
|
|
*
|
|
* Maps user-friendly language codes to paddle-ocr-rs language identifiers.
|
|
*/
|
|
typedef struct KREUZBERGPaddleLanguage KREUZBERGPaddleLanguage;
|
|
/**
|
|
* Configuration for PaddleOCR backend.
|
|
*
|
|
* Configures PaddleOCR text detection and recognition with multi-language support.
|
|
* Uses a builder pattern for convenient configuration.
|
|
* \\code
|
|
* use kreuzberg::PaddleOcrConfig;
|
|
*
|
|
* // Create with default English configuration
|
|
* let config = PaddleOcrConfig::new("en");
|
|
*
|
|
* // Create with custom cache directory
|
|
* let config = PaddleOcrConfig::new("ch")
|
|
* .with_cache_dir("/path/to/cache".into());
|
|
*
|
|
* // Enable table detection
|
|
* let config = PaddleOcrConfig::new("en")
|
|
* .with_table_detection(true);
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGPaddleOcrConfig KREUZBERGPaddleOcrConfig;
|
|
/**
|
|
* Byte offset boundary for a page.
|
|
*
|
|
* Tracks where a specific page's content starts and ends in the main content string,
|
|
* enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
|
|
* at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
|
|
*/
|
|
typedef struct KREUZBERGPageBoundary KREUZBERGPageBoundary;
|
|
/**
|
|
* Page extraction and tracking configuration.
|
|
*
|
|
* Controls how pages are extracted, tracked, and represented in the extraction results.
|
|
* When `None`, page tracking is disabled.
|
|
*
|
|
* Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
|
|
* when page boundaries are available and chunking is configured.
|
|
*/
|
|
typedef struct KREUZBERGPageConfig KREUZBERGPageConfig;
|
|
/**
|
|
* Content for a single page/slide.
|
|
*
|
|
* When page extraction is enabled, documents are split into per-page content
|
|
* with associated tables and images mapped to each page.
|
|
*
|
|
* # Performance
|
|
*
|
|
* Uses Arc-wrapped tables and images for memory efficiency:
|
|
* - `Vec<Arc<Table>>` enables zero-copy sharing of table data
|
|
* - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
|
|
* - Maintains exact JSON compatibility via custom Serialize/Deserialize
|
|
*
|
|
* This reduces memory overhead for documents with shared tables/images
|
|
* by avoiding redundant copies during serialization.
|
|
*/
|
|
typedef struct KREUZBERGPageContent KREUZBERGPageContent;
|
|
/**
|
|
* Page hierarchy structure containing heading levels and block information.
|
|
*
|
|
* Used when PDF text hierarchy extraction is enabled. Contains hierarchical
|
|
* blocks with heading levels (H1-H6) for semantic document structure.
|
|
*/
|
|
typedef struct KREUZBERGPageHierarchy KREUZBERGPageHierarchy;
|
|
/**
|
|
* Metadata for individual page/slide/sheet.
|
|
*
|
|
* Captures per-page information including dimensions, content counts,
|
|
* and visibility state (for presentations).
|
|
*/
|
|
typedef struct KREUZBERGPageInfo KREUZBERGPageInfo;
|
|
/**
|
|
* Unified page structure for documents.
|
|
*
|
|
* Supports different page types (PDF pages, PPTX slides, Excel sheets)
|
|
* with character offset boundaries for chunk-to-page mapping.
|
|
*/
|
|
typedef struct KREUZBERGPageStructure KREUZBERGPageStructure;
|
|
/**
|
|
* Type of paginated unit in a document.
|
|
*
|
|
* Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet
|
|
* sheets).
|
|
*/
|
|
typedef struct KREUZBERGPageUnitType KREUZBERGPageUnitType;
|
|
/**
|
|
* A PDF annotation extracted from a document page.
|
|
*/
|
|
typedef struct KREUZBERGPdfAnnotation KREUZBERGPdfAnnotation;
|
|
/**
|
|
* Type of PDF annotation.
|
|
*/
|
|
typedef struct KREUZBERGPdfAnnotationType KREUZBERGPdfAnnotationType;
|
|
/**
|
|
* PDF-specific configuration.
|
|
*/
|
|
typedef struct KREUZBERGPdfConfig KREUZBERGPdfConfig;
|
|
/**
|
|
* PDF-specific metadata.
|
|
*
|
|
* Contains metadata fields specific to PDF documents that are not in the common
|
|
* `Metadata` structure. Common fields like title, authors, keywords, and dates
|
|
* are at the `Metadata` level.
|
|
*/
|
|
typedef struct KREUZBERGPdfMetadata KREUZBERGPdfMetadata;
|
|
/**
|
|
* Base trait that all plugins must implement.
|
|
*
|
|
* This trait provides common functionality for plugin lifecycle management,
|
|
* identification, and metadata.
|
|
*
|
|
* # Thread Safety
|
|
*
|
|
* All plugins must be `Send + Sync` to support concurrent usage across threads.
|
|
* \\code
|
|
* use kreuzberg::plugins::Plugin;
|
|
* use kreuzberg::Result;
|
|
* use std::sync::atomic::{AtomicBool, Ordering};
|
|
*
|
|
* struct MyPlugin {
|
|
* initialized: AtomicBool,
|
|
* }
|
|
*
|
|
* impl Plugin for MyPlugin {
|
|
* fn name(&self) -> &str {
|
|
* "my-plugin"
|
|
* }
|
|
*
|
|
* fn version(&self) -> String {
|
|
* "1.0.0".to_string()
|
|
* }
|
|
*
|
|
* fn initialize(&self) -> Result<()> {
|
|
* self.initialized.store(true, Ordering::Release);
|
|
* println!("Plugin initialized!");
|
|
* Ok(())
|
|
* }
|
|
*
|
|
* fn shutdown(&self) -> Result<()> {
|
|
* self.initialized.store(false, Ordering::Release);
|
|
* println!("Plugin shutdown!");
|
|
* Ok(())
|
|
* }
|
|
* }
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGPlugin KREUZBERGPlugin;
|
|
/**
|
|
* Trait for post-processor plugins.
|
|
*
|
|
* Post-processors transform or enrich extraction results after the initial
|
|
* extraction is complete. They can:
|
|
* - Clean and normalize text
|
|
* - Add metadata (language, keywords, entities)
|
|
* - Split content into chunks
|
|
* - Score quality
|
|
* - Apply custom transformations
|
|
*
|
|
* # Processing Order
|
|
*
|
|
* Post-processors are executed in stage order:
|
|
* 1. **Early** - Language detection, entity extraction
|
|
* 2. **Middle** - Keyword extraction, token reduction
|
|
* 3. **Late** - Custom hooks, final validation
|
|
*
|
|
* Within each stage, processors are executed in registration order.
|
|
*
|
|
* # Error Handling
|
|
*
|
|
* Post-processor errors are non-fatal by default - they're captured in metadata
|
|
* and execution continues. To make errors fatal, return an error from `process()`.
|
|
*
|
|
* # Thread Safety
|
|
*
|
|
* Post-processors must be thread-safe (`Send + Sync`).
|
|
* \\code
|
|
* use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
* use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
|
* use async_trait::async_trait;
|
|
*
|
|
* /// Add word count metadata to extraction results
|
|
* struct WordCountProcessor;
|
|
*
|
|
* impl Plugin for WordCountProcessor {
|
|
* fn name(&self) -> &str { "word-count" }
|
|
* fn version(&self) -> String { "1.0.0".to_string() }
|
|
* fn initialize(&self) -> Result<()> { Ok(()) }
|
|
* fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
* }
|
|
*
|
|
* #[async_trait]
|
|
* impl PostProcessor for WordCountProcessor {
|
|
* async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
|
|
* -> Result<()> {
|
|
* // Count words
|
|
* let word_count = result.content.split_whitespace().count();
|
|
*
|
|
* // Add to metadata
|
|
* result.metadata.additional.insert("word_count".to_string().into(), serde_json::json!(word_count));
|
|
*
|
|
* Ok(())
|
|
* }
|
|
*
|
|
* fn processing_stage(&self) -> ProcessingStage {
|
|
* ProcessingStage::Early
|
|
* }
|
|
* }
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGPostProcessor KREUZBERGPostProcessor;
|
|
/**
|
|
* Post-processor configuration.
|
|
*/
|
|
typedef struct KREUZBERGPostProcessorConfig KREUZBERGPostProcessorConfig;
|
|
/**
|
|
* Application properties from docProps/app.xml for PPTX
|
|
*
|
|
* Contains PowerPoint-specific document metadata.
|
|
*/
|
|
typedef struct KREUZBERGPptxAppProperties KREUZBERGPptxAppProperties;
|
|
/**
|
|
* PowerPoint (PPTX) extraction result.
|
|
*
|
|
* Contains extracted slide content, metadata, and embedded images/tables.
|
|
*/
|
|
typedef struct KREUZBERGPptxExtractionResult KREUZBERGPptxExtractionResult;
|
|
/**
|
|
* PowerPoint presentation metadata.
|
|
*
|
|
* Extracted from PPTX files containing slide counts and presentation details.
|
|
*/
|
|
typedef struct KREUZBERGPptxMetadata KREUZBERGPptxMetadata;
|
|
/**
|
|
* Processing stages for post-processors.
|
|
*
|
|
* Post-processors are executed in stage order (Early → Middle → Late).
|
|
* Use stages to control the order of post-processing operations.
|
|
*/
|
|
typedef struct KREUZBERGProcessingStage KREUZBERGProcessingStage;
|
|
/**
|
|
* A non-fatal warning from a processing pipeline stage.
|
|
*
|
|
* Captures errors from optional features that don't prevent extraction
|
|
* but may indicate degraded results.
|
|
*/
|
|
typedef struct KREUZBERGProcessingWarning KREUZBERGProcessingWarning;
|
|
/**
|
|
* Outlook PST archive metadata.
|
|
*/
|
|
typedef struct KREUZBERGPstMetadata KREUZBERGPstMetadata;
|
|
/**
|
|
* RAKE-specific parameters.
|
|
*/
|
|
typedef struct KREUZBERGRakeParams KREUZBERGRakeParams;
|
|
/**
|
|
* Pre-computed table markdown for a table detection region.
|
|
*
|
|
* Produced by the TATR-based table structure recognizer and surfaced as part of
|
|
* layout-aware OCR results. The struct lives here (under `layout-types`, pure-Rust)
|
|
* so that consumers who do not enable `layout-detection` (ORT) can still reference
|
|
* the type in their own code.
|
|
*/
|
|
typedef struct KREUZBERGRecognizedTable KREUZBERGRecognizedTable;
|
|
typedef struct KREUZBERGReductionLevel KREUZBERGReductionLevel;
|
|
/**
|
|
* Semantic kind of a relationship between document elements.
|
|
*/
|
|
typedef struct KREUZBERGRelationshipKind KREUZBERGRelationshipKind;
|
|
/**
|
|
* Trait for document renderers that convert [`InternalDocument`] to output strings.
|
|
*
|
|
* Renderers are typically stateless converters that transform the internal
|
|
* document representation into a specific output format (Markdown, HTML,
|
|
* Djot, plain text, etc.). They participate in the standard [`Plugin`]
|
|
* lifecycle so custom renderers can be registered from any supported binding
|
|
* language.
|
|
*
|
|
* The format name is exposed via [`Plugin::name`]. For stateless renderers
|
|
* the [`Plugin`] lifecycle methods (`version`, `initialize`, `shutdown`) all
|
|
* take no-op defaults and need not be overridden.
|
|
*
|
|
* # Thread Safety
|
|
*
|
|
* Renderers must be `Send + Sync` (inherited from [`Plugin`]).
|
|
* \\code
|
|
* use kreuzberg::plugins::{Plugin, Renderer};
|
|
* use kreuzberg::types::internal::InternalDocument;
|
|
* use kreuzberg::Result;
|
|
*
|
|
* struct CustomRenderer;
|
|
*
|
|
* impl Plugin for CustomRenderer {
|
|
* fn name(&self) -> &str { "custom" }
|
|
* }
|
|
*
|
|
* impl Renderer for CustomRenderer {
|
|
* fn render(&self, doc: &InternalDocument) -> Result<String> {
|
|
* Ok(format!("Custom output with {} elements", doc.elements.len()))
|
|
* }
|
|
* }
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGRenderer KREUZBERGRenderer;
|
|
/**
|
|
* Result-shape selection for extraction results.
|
|
*
|
|
* Distinct from `OutputFormat` (which controls rendering — Plain, Markdown,
|
|
* HTML, etc.). `ResultFormat` controls the *shape* of the result: a unified content
|
|
* blob vs. an element-based decomposition.
|
|
*/
|
|
typedef struct KREUZBERGResultFormat KREUZBERGResultFormat;
|
|
/**
|
|
* Best-effort document location for a revision.
|
|
*/
|
|
typedef struct KREUZBERGRevisionAnchor KREUZBERGRevisionAnchor;
|
|
/**
|
|
* The content changes that make up a single revision.
|
|
*
|
|
* For insertions and deletions the `content` field carries the added/removed
|
|
* lines as `DiffLine::Added` / `DiffLine::Removed` entries. For format
|
|
* changes, `content` is empty — the property diff is left as a TODO for a
|
|
* later enrichment pass.
|
|
*/
|
|
typedef struct KREUZBERGRevisionDelta KREUZBERGRevisionDelta;
|
|
/**
|
|
* Semantic classification of a tracked change.
|
|
*/
|
|
typedef struct KREUZBERGRevisionKind KREUZBERGRevisionKind;
|
|
/**
|
|
* Configuration for security limits across extractors.
|
|
*
|
|
* All limits are intentionally conservative to prevent DoS attacks
|
|
* while still supporting legitimate documents.
|
|
*/
|
|
typedef struct KREUZBERGSecurityLimits KREUZBERGSecurityLimits;
|
|
/**
|
|
* API server configuration.
|
|
*
|
|
* This struct holds all configuration options for the Kreuzberg API server,
|
|
* including host/port settings, CORS configuration, and upload limits.
|
|
*
|
|
* # Defaults
|
|
*
|
|
* - `host`: "127.0.0.1" (localhost only)
|
|
* - `port`: 8000
|
|
* - `cors_origins`: empty vector (allows all origins)
|
|
* - `max_request_body_bytes`: 104_857_600 (100 MB)
|
|
* - `max_multipart_field_bytes`: 104_857_600 (100 MB)
|
|
*/
|
|
typedef struct KREUZBERGServerConfig KREUZBERGServerConfig;
|
|
/**
|
|
* Structured data (Schema.org, microdata, RDFa) block.
|
|
*/
|
|
typedef struct KREUZBERGStructuredData KREUZBERGStructuredData;
|
|
typedef struct KREUZBERGStructuredDataResult KREUZBERGStructuredDataResult;
|
|
/**
|
|
* Structured data type classification.
|
|
*/
|
|
typedef struct KREUZBERGStructuredDataType KREUZBERGStructuredDataType;
|
|
/**
|
|
* Configuration for LLM-based structured data extraction.
|
|
*
|
|
* Sends extracted document content to a VLM with a JSON schema,
|
|
* returning structured data that conforms to the schema.
|
|
* \\code
|
|
* [structured_extraction]
|
|
* schema_name = "invoice_data"
|
|
* strict = true
|
|
*
|
|
* [structured_extraction.schema]
|
|
* type = "object"
|
|
* properties.vendor = { type = "string" }
|
|
* properties.total = { type = "number" }
|
|
* required = ["vendor", "total"]
|
|
*
|
|
* [structured_extraction.llm]
|
|
* model = "openai/gpt-4o"
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGStructuredExtractionConfig KREUZBERGStructuredExtractionConfig;
|
|
/**
|
|
* A supported document format entry.
|
|
*
|
|
* Represents a file extension and its corresponding MIME type that Kreuzberg can process.
|
|
*/
|
|
typedef struct KREUZBERGSupportedFormat KREUZBERGSupportedFormat;
|
|
/**
|
|
* Extracted table structure.
|
|
*
|
|
* Represents a table detected and extracted from a document (PDF, image, etc.).
|
|
* Tables are converted to both structured cell data and Markdown format.
|
|
*/
|
|
typedef struct KREUZBERGTable KREUZBERGTable;
|
|
/**
|
|
* Individual table cell with content and optional styling.
|
|
*
|
|
* Future extension point for rich table support with cell-level metadata.
|
|
*/
|
|
typedef struct KREUZBERGTableCell KREUZBERGTableCell;
|
|
/**
|
|
* Cell-level changes for a pair of tables that share the same index.
|
|
*/
|
|
typedef struct KREUZBERGTableDiff KREUZBERGTableDiff;
|
|
/**
|
|
* Structured table grid with cell-level metadata.
|
|
*
|
|
* Stores row/column dimensions and a flat list of cells with position info.
|
|
*/
|
|
typedef struct KREUZBERGTableGrid KREUZBERGTableGrid;
|
|
/**
|
|
* Which table structure recognition model to use.
|
|
*
|
|
* Controls the model used for table cell detection within layout-detected
|
|
* table regions. Wire format is snake_case in all serializers (JSON, TOML,
|
|
* YAML).
|
|
*/
|
|
typedef struct KREUZBERGTableModel KREUZBERGTableModel;
|
|
/**
|
|
* Tesseract OCR configuration.
|
|
*
|
|
* Provides fine-grained control over Tesseract OCR engine parameters.
|
|
* Most users can use the defaults, but these settings allow optimization
|
|
* for specific document types (invoices, handwriting, etc.).
|
|
*/
|
|
typedef struct KREUZBERGTesseractConfig KREUZBERGTesseractConfig;
|
|
/**
|
|
* Inline text annotation — byte-range based formatting and links.
|
|
*
|
|
* Annotations reference byte offsets into the node's text content,
|
|
* enabling precise identification of formatted regions.
|
|
*/
|
|
typedef struct KREUZBERGTextAnnotation KREUZBERGTextAnnotation;
|
|
/**
|
|
* Text direction enumeration for HTML documents.
|
|
*/
|
|
typedef struct KREUZBERGTextDirection KREUZBERGTextDirection;
|
|
/**
|
|
* Plain text and Markdown extraction result.
|
|
*
|
|
* Contains the extracted text along with statistics and,
|
|
* for Markdown files, structural elements like headers and links.
|
|
*/
|
|
typedef struct KREUZBERGTextExtractionResult KREUZBERGTextExtractionResult;
|
|
/**
|
|
* Text/Markdown metadata.
|
|
*
|
|
* Extracted from plain text and Markdown files. Includes word counts and,
|
|
* for Markdown, structural elements like headers and links.
|
|
*/
|
|
typedef struct KREUZBERGTextMetadata KREUZBERGTextMetadata;
|
|
typedef struct KREUZBERGTokenReductionConfig KREUZBERGTokenReductionConfig;
|
|
/**
|
|
* Token reduction configuration.
|
|
*/
|
|
typedef struct KREUZBERGTokenReductionOptions KREUZBERGTokenReductionOptions;
|
|
/**
|
|
* Configuration for tree-sitter language pack integration.
|
|
*
|
|
* Controls grammar download behavior and code analysis options.
|
|
*
|
|
* # Example (TOML)
|
|
*
|
|
* ```toml
|
|
* [tree_sitter]
|
|
* languages = ["python", "rust"]
|
|
* groups = ["web"]
|
|
*
|
|
* [tree_sitter.process]
|
|
* structure = true
|
|
* comments = true
|
|
* docstrings = true
|
|
* ```
|
|
*/
|
|
typedef struct KREUZBERGTreeSitterConfig KREUZBERGTreeSitterConfig;
|
|
/**
|
|
* Processing options for tree-sitter code analysis.
|
|
*
|
|
* Controls which analysis features are enabled when extracting code files.
|
|
*/
|
|
typedef struct KREUZBERGTreeSitterProcessConfig KREUZBERGTreeSitterProcessConfig;
|
|
/**
|
|
* Semantic classification of an extracted URI.
|
|
*/
|
|
typedef struct KREUZBERGUriKind KREUZBERGUriKind;
|
|
/**
|
|
* Trait for validator plugins.
|
|
*
|
|
* Validators check extraction results for quality, completeness, or correctness.
|
|
* Unlike post-processors, validator errors **fail fast** - if a validator returns
|
|
* an error, the extraction fails immediately.
|
|
*
|
|
* # Use Cases
|
|
*
|
|
* - **Quality Gates**: Ensure extracted content meets minimum quality standards
|
|
* - **Compliance**: Verify content meets regulatory requirements
|
|
* - **Content Filtering**: Reject documents containing unwanted content
|
|
* - **Format Validation**: Verify extracted content structure
|
|
* - **Security Checks**: Scan for malicious content
|
|
*
|
|
* # Error Handling
|
|
*
|
|
* Validator errors are **fatal** - they cause the extraction to fail and bubble up
|
|
* to the caller. Use validators for hard requirements that must be met.
|
|
*
|
|
* For non-fatal checks, use post-processors instead.
|
|
*
|
|
* # Thread Safety
|
|
*
|
|
* Validators must be thread-safe (`Send + Sync`).
|
|
* \\code
|
|
* use kreuzberg::plugins::{Plugin, Validator};
|
|
* use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
|
* use async_trait::async_trait;
|
|
*
|
|
* /// Validate that extracted content has minimum length
|
|
* struct MinimumLengthValidator {
|
|
* min_length: usize,
|
|
* }
|
|
*
|
|
* impl Plugin for MinimumLengthValidator {
|
|
* fn name(&self) -> &str { "min-length-validator" }
|
|
* fn version(&self) -> String { "1.0.0".to_string() }
|
|
* fn initialize(&self) -> Result<()> { Ok(()) }
|
|
* fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
* }
|
|
*
|
|
* #[async_trait]
|
|
* impl Validator for MinimumLengthValidator {
|
|
* async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
|
|
* -> Result<()> {
|
|
* if result.content.len() < self.min_length {
|
|
* return Err(KreuzbergError::validation(format!(
|
|
* "Content too short: {} < {} characters",
|
|
* result.content.len(),
|
|
* self.min_length
|
|
* )));
|
|
* }
|
|
* Ok(())
|
|
* }
|
|
* }
|
|
* \\endcode
|
|
*/
|
|
typedef struct KREUZBERGValidator KREUZBERGValidator;
|
|
/**
|
|
* Application properties from docProps/app.xml for XLSX
|
|
*
|
|
* Contains Excel-specific document metadata.
|
|
*/
|
|
typedef struct KREUZBERGXlsxAppProperties KREUZBERGXlsxAppProperties;
|
|
/**
|
|
* XML extraction result.
|
|
*
|
|
* Contains extracted text content from XML files along with
|
|
* structural statistics about the XML document.
|
|
*/
|
|
typedef struct KREUZBERGXmlExtractionResult KREUZBERGXmlExtractionResult;
|
|
/**
|
|
* XML metadata extracted during XML parsing.
|
|
*
|
|
* Provides statistics about XML document structure.
|
|
*/
|
|
typedef struct KREUZBERGXmlMetadata KREUZBERGXmlMetadata;
|
|
/**
|
|
* YAKE-specific parameters.
|
|
*/
|
|
typedef struct KREUZBERGYakeParams KREUZBERGYakeParams;
|
|
/**
|
|
* Year range for bibliographic metadata.
|
|
*/
|
|
typedef struct KREUZBERGYearRange KREUZBERGYearRange;
|
|
"""
|
|
[defines]
|
|
"target_os = windows" = "SKIF_WINDOWS"
|
|
|
|
[export]
|
|
prefix = "KREUZBERG"
|
|
include = []
|
|
exclude = []
|
|
|
|
[fn]
|
|
args = "vertical"
|