Files
fil/crates/kreuzberg-ffi/cbindgen.toml
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

1562 lines
52 KiB
TOML
Generated

# This file is auto-generated by alef. DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
language = "C"
include_guard = "KREUZBERG_H"
pragma_once = true
autogen_warning = "/* This file is auto-generated by alef. DO NOT EDIT. */"
after_includes = """
/* Opaque type forward declarations */
/**
* Hardware acceleration configuration for ONNX Runtime models.
*
* Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
* for inference in layout detection and embedding generation.
* \\code
* use kreuzberg::AccelerationConfig;
*
* // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
* let config = AccelerationConfig::default();
*
* // Force CPU only
* let config = AccelerationConfig {
* provider: kreuzberg::ExecutionProviderType::Cpu,
* ..Default::default()
* };
* \\endcode
*/
typedef struct KREUZBERGAccelerationConfig KREUZBERGAccelerationConfig;
/**
* Types of inline text annotations.
*/
typedef struct KREUZBERGAnnotationKind KREUZBERGAnnotationKind;
/**
* A single file extracted from an archive.
*
* When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
* enabled, each processable file produces its own full `ExtractionResult`.
*/
typedef struct KREUZBERGArchiveEntry KREUZBERGArchiveEntry;
/**
* Archive (ZIP/TAR/7Z) metadata.
*
* Extracted from compressed archive files containing file lists and size information.
*/
typedef struct KREUZBERGArchiveMetadata KREUZBERGArchiveMetadata;
/**
* Bounding box in original image coordinates: (x1, y1) is the top-left corner, (x2, y2) the bottom-right.
*/
typedef struct KREUZBERGBBox KREUZBERGBBox;
/**
* Batch item for byte array extraction.
*
* Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
* to represent a single item in a batch extraction job.
*/
typedef struct KREUZBERGBatchBytesItem KREUZBERGBatchBytesItem;
/**
* Batch item for file extraction.
*
* Used with `batch_extract_files` and `batch_extract_files_sync`
* to represent a single file in a batch extraction job.
*/
typedef struct KREUZBERGBatchFileItem KREUZBERGBatchFileItem;
/**
* BibTeX bibliography metadata.
*/
typedef struct KREUZBERGBibtexMetadata KREUZBERGBibtexMetadata;
/**
* Types of block-level elements in Djot.
*/
typedef struct KREUZBERGBlockType KREUZBERGBlockType;
/**
* Bounding box coordinates for element positioning.
*/
typedef struct KREUZBERGBoundingBox KREUZBERGBoundingBox;
typedef struct KREUZBERGCacheStats KREUZBERGCacheStats;
/**
* A single changed cell within a table.
*
* Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
* reference it unconditionally, without requiring the `diff` Cargo feature.
* `crate::diff` re-exports this type verbatim.
*/
typedef struct KREUZBERGCellChange KREUZBERGCellChange;
/**
* A text chunk with optional embedding and metadata.
*
* Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
* contains the text content, optional embedding vector (if embedding generation
* is configured), and metadata about its position in the document.
*/
typedef struct KREUZBERGChunk KREUZBERGChunk;
/**
* Metadata about a chunk's position in the original document.
*/
typedef struct KREUZBERGChunkMetadata KREUZBERGChunkMetadata;
/**
* How chunk size is measured.
*
* Defaults to `Characters` (Unicode character count). When using token-based sizing,
* chunks are sized by token count according to the specified tokenizer.
*
* Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
* available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
* (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
*/
typedef struct KREUZBERGChunkSizing KREUZBERGChunkSizing;
/**
* Semantic structural classification of a text chunk.
*
* Assigned by the heuristic classifier in `chunking::classifier`.
* Defaults to `Unknown` when no rule matches.
* Designed to be extended in future versions without breaking changes.
*/
typedef struct KREUZBERGChunkType KREUZBERGChunkType;
/**
* Type of text chunker to use.
*
* # Variants
*
* * `Text` - Generic text splitter, splits on whitespace and punctuation
* * `Markdown` - Markdown-aware splitter, preserves formatting and structure
* * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
* * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
* embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
* lower = more splits). Without an embedding, falls back to a
* structural-boundary heuristic (ALL-CAPS headers, numbered sections,
* blank-line paragraphs) and merges groups into chunks capped at
* `max_characters` (default 1000). `topic_threshold` has no effect in the
* fallback path. For best results, pair with an embedding model.
*/
typedef struct KREUZBERGChunkerType KREUZBERGChunkerType;
/**
* Chunking configuration.
*
* Configures text chunking for document content, including chunk size,
* overlap, trimming behavior, and optional embeddings.
*
* Use `..Default::default()` when constructing to allow for future field additions:
* \\code
* let config = ChunkingConfig {
* max_characters: 500,
* ..Default::default()
* };
* \\endcode
*/
typedef struct KREUZBERGChunkingConfig KREUZBERGChunkingConfig;
/**
* Citation file metadata (RIS, PubMed, EndNote).
*/
typedef struct KREUZBERGCitationMetadata KREUZBERGCitationMetadata;
/**
* Content rendering mode for code extraction.
*
* Controls how extracted code content is represented in the `content` field
* of `ExtractionResult`.
*/
typedef struct KREUZBERGCodeContentMode KREUZBERGCodeContentMode;
/**
* Cross-extractor content filtering configuration.
*
* Controls whether "furniture" content (headers, footers, page numbers,
* watermarks, repeating text) is included in or stripped from extraction
* results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
* with format-specific implementation.
*
* When `None` on `ExtractionConfig`, each extractor uses its current
* default behavior unchanged.
*/
typedef struct KREUZBERGContentFilterConfig KREUZBERGContentFilterConfig;
/**
* Content layer classification for document nodes.
*
* Replaces separate body/furniture arrays with per-node granularity.
*/
typedef struct KREUZBERGContentLayer KREUZBERGContentLayer;
/**
* JATS contributor with role.
*/
typedef struct KREUZBERGContributorRole KREUZBERGContributorRole;
/**
* Dublin Core metadata from docProps/core.xml.
*
* Contains standard metadata fields defined by the Dublin Core standard
* and Office-specific extensions.
*/
typedef struct KREUZBERGCoreProperties KREUZBERGCoreProperties;
/**
* CSV/TSV file metadata.
*/
typedef struct KREUZBERGCsvMetadata KREUZBERGCsvMetadata;
/**
* dBASE field information.
*/
typedef struct KREUZBERGDbfFieldInfo KREUZBERGDbfFieldInfo;
/**
* dBASE (DBF) file metadata.
*/
typedef struct KREUZBERGDbfMetadata KREUZBERGDbfMetadata;
/**
* MIME type detection response.
*/
typedef struct KREUZBERGDetectResponse KREUZBERGDetectResponse;
/**
* Page-level detection result containing all detections and page metadata.
*/
typedef struct KREUZBERGDetectionResult KREUZBERGDetectionResult;
/**
* A single contiguous hunk in a unified diff.
*/
typedef struct KREUZBERGDiffHunk KREUZBERGDiffHunk;
/**
* A single line in a unified-diff hunk.
*
* Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
* reference it unconditionally, without requiring the `diff` Cargo feature.
* `crate::diff` re-exports this type verbatim.
*/
typedef struct KREUZBERGDiffLine KREUZBERGDiffLine;
/**
* Options controlling how two `ExtractionResult` values are compared.
*/
typedef struct KREUZBERGDiffOptions KREUZBERGDiffOptions;
/**
* Comprehensive Djot document structure with semantic preservation.
*
* This type captures the full richness of Djot markup, including:
* - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
* - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
* - Attributes (classes, IDs, key-value pairs)
* - Links, images, footnotes
* - Math expressions (inline and display)
* - Tables with full structure
*
* Available when the `djot` feature is enabled.
*/
typedef struct KREUZBERGDjotContent KREUZBERGDjotContent;
/**
* Image element in Djot.
*/
typedef struct KREUZBERGDjotImage KREUZBERGDjotImage;
/**
* Link element in Djot.
*/
typedef struct KREUZBERGDjotLink KREUZBERGDjotLink;
/**
* Trait for document extractor plugins.
*
* Implement this trait to add support for new document formats or to override
* built-in extraction behavior with custom logic.
*
* # Return Type
*
* Extractors return `InternalDocument`, a flat intermediate representation.
* The pipeline converts this into the public `ExtractionResult` via the
* derivation step.
*
* # Priority System
*
* When multiple extractors support the same MIME type, the registry selects
* the extractor with the highest priority value. Use this to:
* - Override built-in extractors (priority > 50)
* - Provide fallback extractors (priority < 50)
* - Implement specialized extractors for specific use cases
*
* Default priority is 50.
*
* # Thread Safety
*
* Extractors must be thread-safe (`Send + Sync`) to support concurrent extraction.
* \\code
* use kreuzberg::plugins::{Plugin, DocumentExtractor};
* use kreuzberg::{Result, ExtractionConfig};
* use kreuzberg::types::internal::InternalDocument;
* use async_trait::async_trait;
* use std::path::Path;
*
* /// Custom PDF extractor with premium features
* struct PremiumPdfExtractor;
*
* impl Plugin for PremiumPdfExtractor {
* fn name(&self) -> &str { "premium-pdf" }
* fn version(&self) -> String { "2.0.0".to_string() }
* fn initialize(&self) -> Result<()> { Ok(()) }
* fn shutdown(&self) -> Result<()> { Ok(()) }
* }
*
* #[async_trait]
* impl DocumentExtractor for PremiumPdfExtractor {
* async fn extract_bytes(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
* -> Result<InternalDocument> {
* // Premium extraction logic with better accuracy
* let mut doc = InternalDocument::new("pdf");
* // ... populate doc.elements, doc.metadata, etc.
* Ok(doc)
* }
*
* fn supported_mime_types(&self) -> &[&str] {
* &["application/pdf"]
* }
*
* fn priority(&self) -> i32 {
* 100 // Higher than default (50) - will be preferred
* }
* }
* \\endcode
*/
typedef struct KREUZBERGDocumentExtractor KREUZBERGDocumentExtractor;
/**
* A single node in the document tree.
*
* Each node has deterministic `id`, typed `content`, optional `parent`/`children`
* for tree structure, and metadata like page number, bounding box, and content layer.
*/
typedef struct KREUZBERGDocumentNode KREUZBERGDocumentNode;
/**
* A resolved relationship between two nodes in the document tree.
*/
typedef struct KREUZBERGDocumentRelationship KREUZBERGDocumentRelationship;
/**
* A single tracked change embedded in a document.
*
* Populated by per-format extractors that understand change-tracking metadata
* (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
* extractor defaults to `ExtractionResult.revisions = None` until a
* format-specific implementation is added.
*/
typedef struct KREUZBERGDocumentRevision KREUZBERGDocumentRevision;
/**
* Top-level structured document representation.
*
* A flat array of nodes with index-based parent/child references forming a tree.
* Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
* to iterate over top-level content by layer.
*
* # Validation
*
* Call `validate()` after construction to verify all node indices are in bounds
* and parent-child relationships are bidirectionally consistent.
*/
typedef struct KREUZBERGDocumentStructure KREUZBERGDocumentStructure;
/**
* Application properties from docProps/app.xml for DOCX.
*
* Contains Word-specific document statistics and metadata.
*/
typedef struct KREUZBERGDocxAppProperties KREUZBERGDocxAppProperties;
/**
* Word document metadata.
*
* Extracted from DOCX files using shared Office Open XML metadata extraction.
* Integrates with `office_metadata` module for core/app/custom properties.
*/
typedef struct KREUZBERGDocxMetadata KREUZBERGDocxMetadata;
/**
* Semantic element extracted from document.
*
* Represents a logical unit of content with semantic classification,
* unique identifier, and metadata for tracking origin and position.
*/
typedef struct KREUZBERGElement KREUZBERGElement;
/**
* Metadata for a semantic element.
*/
typedef struct KREUZBERGElementMetadata KREUZBERGElementMetadata;
/**
* Semantic element type classification.
*
* Categorizes text content into semantic units for downstream processing.
* Supports the element types commonly found in Unstructured documents.
*/
typedef struct KREUZBERGElementType KREUZBERGElementType;
/**
* Email attachment representation.
*
* Contains metadata and optionally the content of an email attachment.
*/
typedef struct KREUZBERGEmailAttachment KREUZBERGEmailAttachment;
/**
* Configuration for email extraction.
*/
typedef struct KREUZBERGEmailConfig KREUZBERGEmailConfig;
/**
* Email extraction result.
*
* Complete representation of an extracted email message (.eml or .msg)
* including headers, body content, and attachments.
*/
typedef struct KREUZBERGEmailExtractionResult KREUZBERGEmailExtractionResult;
/**
* Email metadata extracted from .eml and .msg files.
*
* Includes sender/recipient information, message ID, and attachment list.
*/
typedef struct KREUZBERGEmailMetadata KREUZBERGEmailMetadata;
/**
* Changes to embedded archive children between two results.
*/
typedef struct KREUZBERGEmbeddedChanges KREUZBERGEmbeddedChanges;
/**
* Diff for a single embedded archive entry that appears in both results.
*/
typedef struct KREUZBERGEmbeddedDiff KREUZBERGEmbeddedDiff;
/**
* Embedded file descriptor extracted from the PDF name tree.
*/
typedef struct KREUZBERGEmbeddedFile KREUZBERGEmbeddedFile;
/**
* Trait for in-process embedding backend plugins.
*
* Async to match the convention used by `OcrBackend`,
* `DocumentExtractor`, and `PostProcessor`.
* Host-language bridges (PyO3, napi-rs, Rustler, extendr, magnus, ext-php-rs,
* C FFI, etc.) wrap their synchronous host callables in `spawn_blocking` or the
* equivalent to satisfy the async signature.
*
* # Thread safety
*
* Backends must be `Send + Sync + 'static`. They are stored in
* `Arc<dyn EmbeddingBackend>` and called concurrently from kreuzberg's chunking
* pipeline. If the backend's underlying model isn't thread-safe, the backend
* itself must serialize access internally (e.g. via `Mutex<Inner>`).
*
* # Contract
*
* - `embed(texts)` MUST return exactly `texts.len()` vectors, each of length
* `self.dimensions()`. The dispatcher in `embed_texts`
* validates this before returning to downstream consumers; a non-conforming
* backend surfaces as a `KreuzbergError::Validation`, not a panic.
* - `embed` may be called from any thread. Its future must be `Send`
* (enforced by `async_trait` when `#[async_trait]` is used on non-WASM targets).
* - `dimensions()` is called exactly once at registration, immediately after
* `initialize()` succeeds. The returned value is cached by the registry and
* used for all subsequent shape validation. Lazy-loading implementations can
* defer model loading into `initialize()` and report the real dimension
* afterwards. Later mutations of the backend's reported dimension are not
* observed by kreuzberg — implementations that need to change dimension
* must unregister and re-register.
* - `shutdown()` (inherited from `Plugin`) may be invoked
* concurrently with an in-flight `embed()` call. Implementations must
* tolerate this — e.g. by letting in-flight calls finish using resources
* held via the `Arc<dyn EmbeddingBackend>` reference, and only releasing
* shared state that isn't needed by `embed`.
*
* # Runtime
*
* The synchronous `embed_texts` entry uses
* `tokio::task::block_in_place` to await the trait's async `embed`, which
* requires a multi-thread tokio runtime. Callers running inside a
* `current_thread` runtime (e.g. `#[tokio::test]` without `flavor = "multi_thread"`,
* or `tokio::runtime::Builder::new_current_thread()`) must use
* `embed_texts_async` instead, which awaits directly without
* `block_in_place`.
*/
typedef struct KREUZBERGEmbeddingBackend KREUZBERGEmbeddingBackend;
/**
* Embedding configuration for text chunks.
*
* Configures embedding generation using ONNX models via the vendored embedding engine.
* Requires the `embeddings` feature to be enabled.
*/
typedef struct KREUZBERGEmbeddingConfig KREUZBERGEmbeddingConfig;
/**
* Embedding model types supported by Kreuzberg.
*/
typedef struct KREUZBERGEmbeddingModelType KREUZBERGEmbeddingModelType;
/**
* Preset configurations for common RAG use cases.
*
* Each preset combines chunk size, overlap, and embedding model
* to provide an optimized configuration for specific scenarios.
*
* All string fields are owned `String` for FFI compatibility — instances
* are safe to clone and pass across language boundaries.
*/
typedef struct KREUZBERGEmbeddingPreset KREUZBERGEmbeddingPreset;
/**
* EPUB metadata (Dublin Core extensions).
*/
typedef struct KREUZBERGEpubMetadata KREUZBERGEpubMetadata;
/**
* Error metadata (for batch operations).
*/
typedef struct KREUZBERGErrorMetadata KREUZBERGErrorMetadata;
/**
* Excel/spreadsheet format metadata.
*
* Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
* discriminant. Sheet count and sheet names are stored inside this struct.
*/
typedef struct KREUZBERGExcelMetadata KREUZBERGExcelMetadata;
/**
* Single Excel worksheet.
*
* Represents one sheet from an Excel workbook with its content
* converted to Markdown format and dimensional statistics.
*/
typedef struct KREUZBERGExcelSheet KREUZBERGExcelSheet;
/**
* Excel workbook representation.
*
* Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
* extracted content and metadata.
*/
typedef struct KREUZBERGExcelWorkbook KREUZBERGExcelWorkbook;
/**
* ONNX Runtime execution provider type.
*
* Determines which hardware backend is used for model inference.
* `Auto` (default) selects the best available provider per platform.
*/
typedef struct KREUZBERGExecutionProviderType KREUZBERGExecutionProviderType;
/**
* Extracted image from a document.
*
* Contains raw image data, metadata, and optional nested OCR results.
* Raw bytes allow cross-language compatibility - users can convert to
* PIL.Image (Python), Sharp (Node.js), or other formats as needed.
*/
typedef struct KREUZBERGExtractedImage KREUZBERGExtractedImage;
/**
* A URI extracted from a document.
*
* Represents any link, reference, or resource pointer found during extraction.
* The `kind` field classifies the URI semantically, while `label` carries
* optional human-readable display text.
*/
typedef struct KREUZBERGExtractedUri KREUZBERGExtractedUri;
/**
* Main extraction configuration.
*
* This struct contains all configuration options for the extraction process.
* It can be loaded from TOML, YAML, or JSON files, or created programmatically.
* \\code
* use kreuzberg::core::config::ExtractionConfig;
*
* // Create with defaults
* let config = ExtractionConfig::default();
*
* // Load from TOML file
* // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
* \\endcode
*/
typedef struct KREUZBERGExtractionConfig KREUZBERGExtractionConfig;
/**
* The complete diff between two `ExtractionResult` values.
*/
typedef struct KREUZBERGExtractionDiff KREUZBERGExtractionDiff;
/**
* How the extracted text was produced.
*/
typedef struct KREUZBERGExtractionMethod KREUZBERGExtractionMethod;
/**
* General extraction result used by the core extraction API.
*
* This is the main result type returned by all extraction functions.
*/
typedef struct KREUZBERGExtractionResult KREUZBERGExtractionResult;
/**
* FictionBook (FB2) metadata.
*/
typedef struct KREUZBERGFictionBookMetadata KREUZBERGFictionBookMetadata;
/**
* Per-file extraction configuration overrides for batch processing.
*
* All fields are `Option<T>` — `None` means "use the batch-level default."
* This type is used with `batch_extract_files` and
* `batch_extract_bytes` to allow heterogeneous
* extraction settings within a single batch.
*
* # Excluded Fields
*
* The following `ExtractionConfig` fields are batch-level only and
* cannot be overridden per file:
* - `max_concurrent_extractions` — controls batch parallelism
* - `use_cache` — global caching policy
* - `acceleration` — shared ONNX execution provider
* - `security_limits` — global archive security policy
* \\code
* use kreuzberg::FileExtractionConfig;
*
* // Override just OCR forcing for a specific file
* let config = FileExtractionConfig {
* force_ocr: Some(true),
* ..Default::default()
* };
* \\endcode
*/
typedef struct KREUZBERGFileExtractionConfig KREUZBERGFileExtractionConfig;
/**
* Footnote in Djot.
*/
typedef struct KREUZBERGFootnote KREUZBERGFootnote;
/**
* Format-specific metadata (discriminated union).
*
* Only one format type can exist per extraction result. This provides
* type-safe, clean metadata without nested optionals.
*/
typedef struct KREUZBERGFormatMetadata KREUZBERGFormatMetadata;
/**
* Block-level element in a Djot document.
*
* Represents structural elements like headings, paragraphs, lists, code blocks, etc.
*/
typedef struct KREUZBERGFormattedBlock KREUZBERGFormattedBlock;
/**
* Individual grid cell with position and span metadata.
*/
typedef struct KREUZBERGGridCell KREUZBERGGridCell;
/**
* Header/heading element metadata.
*/
typedef struct KREUZBERGHeaderMetadata KREUZBERGHeaderMetadata;
/**
* Heading context for a chunk within a Markdown document.
*
* Contains the heading hierarchy from document root to this chunk's section.
*/
typedef struct KREUZBERGHeadingContext KREUZBERGHeadingContext;
/**
* A single heading in the hierarchy.
*/
typedef struct KREUZBERGHeadingLevel KREUZBERGHeadingLevel;
/**
* A text block with hierarchy level assignment.
*
* Represents a block of text with semantic heading information extracted from
* font size clustering and hierarchical analysis.
*/
typedef struct KREUZBERGHierarchicalBlock KREUZBERGHierarchicalBlock;
/**
* Hierarchy extraction configuration for PDF text structure analysis.
*
* Enables extraction of document hierarchy levels (H1-H6) based on font size
* clustering and semantic analysis. When enabled, hierarchical blocks are
* included in page content.
*/
typedef struct KREUZBERGHierarchyConfig KREUZBERGHierarchyConfig;
/**
* HTML metadata extracted from HTML documents.
*
* Includes document-level metadata, Open Graph data, Twitter Card metadata,
* and extracted structural elements (headers, links, images, structured data).
*/
typedef struct KREUZBERGHtmlMetadata KREUZBERGHtmlMetadata;
/**
* Configuration for styled HTML output.
*
* When set on `ExtractionConfig::html_output` alongside
* `output_format = OutputFormat::Html`, the pipeline builds a
* `StyledHtmlRenderer` (`crate::rendering::StyledHtmlRenderer`) instead of
* the plain comrak-based renderer.
* \\code
* use kreuzberg::core::config::{HtmlOutputConfig, HtmlTheme};
*
* let config = HtmlOutputConfig {
* theme: HtmlTheme::GitHub,
* css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
* ..Default::default()
* };
* \\endcode
*/
typedef struct KREUZBERGHtmlOutputConfig KREUZBERGHtmlOutputConfig;
/**
* Built-in HTML theme selection.
*/
typedef struct KREUZBERGHtmlTheme KREUZBERGHtmlTheme;
/**
* Image extraction configuration.
*/
typedef struct KREUZBERGImageExtractionConfig KREUZBERGImageExtractionConfig;
/**
* Heuristic classification of what an image likely depicts.
*/
typedef struct KREUZBERGImageKind KREUZBERGImageKind;
/**
* Image metadata extracted from image files.
*
* Includes dimensions, format, and EXIF data.
*/
typedef struct KREUZBERGImageMetadata KREUZBERGImageMetadata;
/**
* Image element metadata.
*/
typedef struct KREUZBERGImageMetadataType KREUZBERGImageMetadataType;
/**
* Image preprocessing configuration for OCR.
*
* These settings control how images are preprocessed before OCR to improve
* text recognition quality. Different preprocessing strategies work better
* for different document types.
*/
typedef struct KREUZBERGImagePreprocessingConfig KREUZBERGImagePreprocessingConfig;
/**
* Image preprocessing metadata.
*
* Tracks the transformations applied to an image during OCR preprocessing,
* including DPI normalization, resizing, and resampling.
*/
typedef struct KREUZBERGImagePreprocessingMetadata KREUZBERGImagePreprocessingMetadata;
/**
* Image type classification.
*/
typedef struct KREUZBERGImageType KREUZBERGImageType;
/**
* Inline element within a block.
*
* Represents text with formatting, links, images, etc.
*/
typedef struct KREUZBERGInlineElement KREUZBERGInlineElement;
/**
* Types of inline elements in Djot.
*/
typedef struct KREUZBERGInlineType KREUZBERGInlineType;
/**
* JATS (Journal Article Tag Suite) metadata.
*/
typedef struct KREUZBERGJatsMetadata KREUZBERGJatsMetadata;
/**
* Extracted keyword with metadata.
*/
typedef struct KREUZBERGKeyword KREUZBERGKeyword;
/**
* Keyword algorithm selection.
*/
typedef struct KREUZBERGKeywordAlgorithm KREUZBERGKeywordAlgorithm;
/**
* Keyword extraction configuration.
*/
typedef struct KREUZBERGKeywordConfig KREUZBERGKeywordConfig;
/**
* Language detection configuration.
*/
typedef struct KREUZBERGLanguageDetectionConfig KREUZBERGLanguageDetectionConfig;
/**
* The 17 canonical document layout classes.
*
* All model backends (RT-DETR, YOLO, etc.) map their native class IDs
* to this shared set. Models with fewer classes (DocLayNet: 11, PubLayNet: 5)
* map to the closest equivalent.
*
* Wire format is snake_case in all serializers (JSON, TOML, YAML).
*/
typedef struct KREUZBERGLayoutClass KREUZBERGLayoutClass;
/**
* A single layout detection result.
*/
typedef struct KREUZBERGLayoutDetection KREUZBERGLayoutDetection;
/**
* Layout detection configuration.
*
* Controls layout detection behavior in the extraction pipeline.
* When set on `ExtractionConfig` (`super::ExtractionConfig`), layout detection
* is enabled for PDF extraction.
*/
typedef struct KREUZBERGLayoutDetectionConfig KREUZBERGLayoutDetectionConfig;
/**
* A detected layout region on a page.
*
* When layout detection is enabled, each page may have layout regions
* identifying different content types (text, pictures, tables, etc.)
* with confidence scores and spatial positions.
*/
typedef struct KREUZBERGLayoutRegion KREUZBERGLayoutRegion;
/**
* Link element metadata.
*/
typedef struct KREUZBERGLinkMetadata KREUZBERGLinkMetadata;
/**
* Link type classification.
*/
typedef struct KREUZBERGLinkType KREUZBERGLinkType;
/**
* Type of list detection.
*/
typedef struct KREUZBERGListType KREUZBERGListType;
/**
* Configuration for an LLM provider/model via liter-llm.
*
* Each feature (VLM OCR, VLM embeddings, structured extraction) carries
* its own `LlmConfig`, allowing different providers per feature.
* \\code
* [structured_extraction.llm]
* model = "openai/gpt-4o"
* api_key = "sk-..." # or use KREUZBERG_LLM_API_KEY env var
* \\endcode
*/
typedef struct KREUZBERGLlmConfig KREUZBERGLlmConfig;
/**
* Token usage and cost data for a single LLM call made during extraction.
*
* Populated when VLM OCR, structured extraction, or LLM-based embeddings
* are used. Multiple entries may be present when multiple LLM calls occur
* within one extraction (e.g. VLM OCR + structured extraction).
*/
typedef struct KREUZBERGLlmUsage KREUZBERGLlmUsage;
/**
* Extraction result metadata.
*
* Contains common fields applicable to all formats, format-specific metadata
* via a discriminated union, and additional custom fields from postprocessors.
*/
typedef struct KREUZBERGMetadata KREUZBERGMetadata;
/**
* Combined paths to all models needed for OCR (backward compatibility).
*/
typedef struct KREUZBERGModelPaths KREUZBERGModelPaths;
/**
* Tagged enum for node content. Each variant carries only type-specific data.
*
* Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
* Go/Java/TypeScript bindings.
*/
typedef struct KREUZBERGNodeContent KREUZBERGNodeContent;
/**
* Trait for OCR backend plugins.
*
* Implement this trait to add custom OCR capabilities. OCR backends can be:
* - Native Rust implementations (like Tesseract)
* - FFI bridges to Python libraries (like EasyOCR, PaddleOCR)
* - Cloud-based OCR services (Google Vision, AWS Textract, etc.)
*
* # Thread Safety
*
* OCR backends must be thread-safe (`Send + Sync`) to support concurrent processing.
* \\code
* use kreuzberg::plugins::{Plugin, OcrBackend, OcrBackendType};
* use kreuzberg::{Result, OcrConfig};
* use async_trait::async_trait;
* use std::borrow::Cow;
* use std::path::Path;
* use kreuzberg::types::{ExtractionResult, Metadata};
*
* struct CustomOcrBackend;
*
* impl Plugin for CustomOcrBackend {
* fn name(&self) -> &str { "custom-ocr" }
* fn version(&self) -> String { "1.0.0".to_string() }
* fn initialize(&self) -> Result<()> { Ok(()) }
* fn shutdown(&self) -> Result<()> { Ok(()) }
* }
*
* #[async_trait]
* impl OcrBackend for CustomOcrBackend {
* async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
* // Implement OCR logic here
* Ok(ExtractionResult {
* content: "Extracted text".to_string(),
* mime_type: Cow::Borrowed("text/plain"),
* ..Default::default()
* })
* }
*
* async fn process_image_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
* let bytes = std::fs::read(path)?;
* self.process_image(&bytes, config).await
* }
*
* fn supports_language(&self, lang: &str) -> bool {
* matches!(lang, "eng" | "deu" | "fra")
* }
*
* fn backend_type(&self) -> OcrBackendType {
* OcrBackendType::Custom
* }
* }
* \\endcode
*/
typedef struct KREUZBERGOcrBackend KREUZBERGOcrBackend;
/**
* OCR backend types.
*/
typedef struct KREUZBERGOcrBackendType KREUZBERGOcrBackendType;
/**
* Bounding geometry for an OCR element.
*
* Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
* (from PaddleOCR and rotated text detection).
*/
typedef struct KREUZBERGOcrBoundingGeometry KREUZBERGOcrBoundingGeometry;
/**
* Confidence scores for an OCR element.
*
* Separates detection confidence (how confident that text exists at this location)
* from recognition confidence (how confident about the actual text content).
*/
typedef struct KREUZBERGOcrConfidence KREUZBERGOcrConfidence;
/**
* OCR configuration.
*/
typedef struct KREUZBERGOcrConfig KREUZBERGOcrConfig;
/**
* A unified OCR element representing detected text with full metadata.
*
* This is the primary type for structured OCR output, preserving all information
* from both Tesseract and PaddleOCR backends.
*/
typedef struct KREUZBERGOcrElement KREUZBERGOcrElement;
/**
* Configuration for OCR element extraction.
*
* Controls how OCR elements are extracted and filtered.
*/
typedef struct KREUZBERGOcrElementConfig KREUZBERGOcrElementConfig;
/**
* Hierarchical level of an OCR element.
*
* Maps to Tesseract's page segmentation hierarchy and provides
* equivalent semantics for PaddleOCR.
*/
typedef struct KREUZBERGOcrElementLevel KREUZBERGOcrElementLevel;
/**
* OCR extraction result.
*
* Result of performing OCR on an image or scanned document,
* including recognized text and detected tables.
*/
typedef struct KREUZBERGOcrExtractionResult KREUZBERGOcrExtractionResult;
/**
* OCR processing metadata.
*
* Captures information about OCR processing configuration and results.
*/
typedef struct KREUZBERGOcrMetadata KREUZBERGOcrMetadata;
/**
* Multi-backend OCR pipeline with quality-based fallback.
*
* Backends are tried in priority order (highest first). After each backend
* produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
* the result is accepted. Otherwise the next backend is tried.
*/
typedef struct KREUZBERGOcrPipelineConfig KREUZBERGOcrPipelineConfig;
/**
* A single backend stage in the OCR pipeline.
*/
typedef struct KREUZBERGOcrPipelineStage KREUZBERGOcrPipelineStage;
/**
* Quality thresholds for OCR fallback decisions and pipeline quality gating.
*
* All fields default to the values that match the previous hardcoded behavior,
* so `OcrQualityThresholds::default()` preserves existing semantics exactly.
*/
typedef struct KREUZBERGOcrQualityThresholds KREUZBERGOcrQualityThresholds;
/**
* Rotation information for an OCR element.
*/
typedef struct KREUZBERGOcrRotation KREUZBERGOcrRotation;
/**
* Table detected via OCR.
*
* Represents a table structure recognized during OCR processing.
*/
typedef struct KREUZBERGOcrTable KREUZBERGOcrTable;
/**
* Bounding box for an OCR-detected table in pixel coordinates.
*/
typedef struct KREUZBERGOcrTableBoundingBox KREUZBERGOcrTableBoundingBox;
/**
* Document orientation detection result.
*/
typedef struct KREUZBERGOrientationResult KREUZBERGOrientationResult;
/**
* Output format for extraction results.
*
* Controls the format of the `content` field in `ExtractionResult`.
* When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
* `Plain` returns the raw extracted text.
* `Structured` returns JSON with full OCR element data including bounding
* boxes and confidence scores.
*/
typedef struct KREUZBERGOutputFormat KREUZBERGOutputFormat;
/**
* Page Segmentation Mode for Tesseract OCR.
*/
typedef struct KREUZBERGPSMMode KREUZBERGPSMMode;
/**
* Supported languages in PaddleOCR.
*
* Maps user-friendly language codes to paddle-ocr-rs language identifiers.
*/
typedef struct KREUZBERGPaddleLanguage KREUZBERGPaddleLanguage;
/**
* Configuration for PaddleOCR backend.
*
* Configures PaddleOCR text detection and recognition with multi-language support.
* Uses a builder pattern for convenient configuration.
* \\code
* use kreuzberg::PaddleOcrConfig;
*
* // Create with default English configuration
* let config = PaddleOcrConfig::new("en");
*
* // Create with custom cache directory
* let config = PaddleOcrConfig::new("ch")
*     .with_cache_dir("/path/to/cache".into());
*
* // Enable table detection
* let config = PaddleOcrConfig::new("en")
*     .with_table_detection(true);
* \\endcode
*/
typedef struct KREUZBERGPaddleOcrConfig KREUZBERGPaddleOcrConfig;
/**
* Byte offset boundary for a page.
*
* Tracks where a specific page's content starts and ends in the main content string,
* enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
* at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
*/
typedef struct KREUZBERGPageBoundary KREUZBERGPageBoundary;
/**
* Page extraction and tracking configuration.
*
* Controls how pages are extracted, tracked, and represented in the extraction results.
* When `None`, page tracking is disabled.
*
* Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
* when page boundaries are available and chunking is configured.
*/
typedef struct KREUZBERGPageConfig KREUZBERGPageConfig;
/**
* Content for a single page/slide.
*
* When page extraction is enabled, documents are split into per-page content
* with associated tables and images mapped to each page.
*
* # Performance
*
* Uses Arc-wrapped tables and images for memory efficiency:
* - `Vec<Arc<Table>>` enables zero-copy sharing of table data
* - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
* - Maintains exact JSON compatibility via custom Serialize/Deserialize
*
* This reduces memory overhead for documents with shared tables/images
* by avoiding redundant copies during serialization.
*/
typedef struct KREUZBERGPageContent KREUZBERGPageContent;
/**
* Page hierarchy structure containing heading levels and block information.
*
* Used when PDF text hierarchy extraction is enabled. Contains hierarchical
* blocks with heading levels (H1-H6) for semantic document structure.
*/
typedef struct KREUZBERGPageHierarchy KREUZBERGPageHierarchy;
/**
* Metadata for individual page/slide/sheet.
*
* Captures per-page information including dimensions, content counts,
* and visibility state (for presentations).
*/
typedef struct KREUZBERGPageInfo KREUZBERGPageInfo;
/**
* Unified page structure for documents.
*
* Supports different page types (PDF pages, PPTX slides, Excel sheets)
* with character offset boundaries for chunk-to-page mapping.
*/
typedef struct KREUZBERGPageStructure KREUZBERGPageStructure;
/**
* Type of paginated unit in a document.
*
* Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet
* sheets).
*/
typedef struct KREUZBERGPageUnitType KREUZBERGPageUnitType;
/**
* A PDF annotation extracted from a document page.
*/
typedef struct KREUZBERGPdfAnnotation KREUZBERGPdfAnnotation;
/**
* Type of PDF annotation.
*/
typedef struct KREUZBERGPdfAnnotationType KREUZBERGPdfAnnotationType;
/**
* PDF-specific configuration.
*/
typedef struct KREUZBERGPdfConfig KREUZBERGPdfConfig;
/**
* PDF-specific metadata.
*
* Contains metadata fields specific to PDF documents that are not in the common
* `Metadata` structure. Common fields like title, authors, keywords, and dates
* are at the `Metadata` level.
*/
typedef struct KREUZBERGPdfMetadata KREUZBERGPdfMetadata;
/**
* Base trait that all plugins must implement.
*
* This trait provides common functionality for plugin lifecycle management,
* identification, and metadata.
*
* # Thread Safety
*
* All plugins must be `Send + Sync` to support concurrent usage across threads.
* \\code
* use kreuzberg::plugins::Plugin;
* use kreuzberg::Result;
* use std::sync::atomic::{AtomicBool, Ordering};
*
* struct MyPlugin {
*     initialized: AtomicBool,
* }
*
* impl Plugin for MyPlugin {
*     fn name(&self) -> &str {
*         "my-plugin"
*     }
*
*     fn version(&self) -> String {
*         "1.0.0".to_string()
*     }
*
*     fn initialize(&self) -> Result<()> {
*         self.initialized.store(true, Ordering::Release);
*         println!("Plugin initialized!");
*         Ok(())
*     }
*
*     fn shutdown(&self) -> Result<()> {
*         self.initialized.store(false, Ordering::Release);
*         println!("Plugin shutdown!");
*         Ok(())
*     }
* }
* \\endcode
*/
typedef struct KREUZBERGPlugin KREUZBERGPlugin;
/**
* Trait for post-processor plugins.
*
* Post-processors transform or enrich extraction results after the initial
* extraction is complete. They can:
* - Clean and normalize text
* - Add metadata (language, keywords, entities)
* - Split content into chunks
* - Score quality
* - Apply custom transformations
*
* # Processing Order
*
* Post-processors are executed in stage order:
* 1. **Early** - Language detection, entity extraction
* 2. **Middle** - Keyword extraction, token reduction
* 3. **Late** - Custom hooks, final validation
*
* Within each stage, processors are executed in registration order.
*
* # Error Handling
*
* Post-processor errors are non-fatal by default - they're captured in metadata
* and execution continues. To make errors fatal, return an error from `process()`.
*
* # Thread Safety
*
* Post-processors must be thread-safe (`Send + Sync`).
* \\code
* use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
* use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
* use async_trait::async_trait;
*
* /// Add word count metadata to extraction results
* struct WordCountProcessor;
*
* impl Plugin for WordCountProcessor {
*     fn name(&self) -> &str { "word-count" }
*     fn version(&self) -> String { "1.0.0".to_string() }
*     fn initialize(&self) -> Result<()> { Ok(()) }
*     fn shutdown(&self) -> Result<()> { Ok(()) }
* }
*
* #[async_trait]
* impl PostProcessor for WordCountProcessor {
*     async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
*         -> Result<()> {
*         // Count words
*         let word_count = result.content.split_whitespace().count();
*
*         // Add to metadata
*         result.metadata.additional.insert("word_count".to_string().into(), serde_json::json!(word_count));
*
*         Ok(())
*     }
*
*     fn processing_stage(&self) -> ProcessingStage {
*         ProcessingStage::Early
*     }
* }
* \\endcode
*/
typedef struct KREUZBERGPostProcessor KREUZBERGPostProcessor;
/**
* Post-processor configuration.
*/
typedef struct KREUZBERGPostProcessorConfig KREUZBERGPostProcessorConfig;
/**
* Application properties from docProps/app.xml for PPTX.
*
* Contains PowerPoint-specific document metadata.
*/
typedef struct KREUZBERGPptxAppProperties KREUZBERGPptxAppProperties;
/**
* PowerPoint (PPTX) extraction result.
*
* Contains extracted slide content, metadata, and embedded images/tables.
*/
typedef struct KREUZBERGPptxExtractionResult KREUZBERGPptxExtractionResult;
/**
* PowerPoint presentation metadata.
*
* Extracted from PPTX files containing slide counts and presentation details.
*/
typedef struct KREUZBERGPptxMetadata KREUZBERGPptxMetadata;
/**
* Processing stages for post-processors.
*
* Post-processors are executed in stage order (Early → Middle → Late).
* Use stages to control the order of post-processing operations.
*/
typedef struct KREUZBERGProcessingStage KREUZBERGProcessingStage;
/**
* A non-fatal warning from a processing pipeline stage.
*
* Captures errors from optional features that don't prevent extraction
* but may indicate degraded results.
*/
typedef struct KREUZBERGProcessingWarning KREUZBERGProcessingWarning;
/**
* Outlook PST archive metadata.
*/
typedef struct KREUZBERGPstMetadata KREUZBERGPstMetadata;
/**
* RAKE-specific parameters.
*/
typedef struct KREUZBERGRakeParams KREUZBERGRakeParams;
/**
* Pre-computed table markdown for a table detection region.
*
* Produced by the TATR-based table structure recognizer and surfaced as part of
* layout-aware OCR results. The struct lives here (under `layout-types`, pure-Rust)
* so that consumers who do not enable `layout-detection` (ORT) can still reference
* the type in their own code.
*/
typedef struct KREUZBERGRecognizedTable KREUZBERGRecognizedTable;
typedef struct KREUZBERGReductionLevel KREUZBERGReductionLevel;
/**
* Semantic kind of a relationship between document elements.
*/
typedef struct KREUZBERGRelationshipKind KREUZBERGRelationshipKind;
/**
* Trait for document renderers that convert [`InternalDocument`] to output strings.
*
* Renderers are typically stateless converters that transform the internal
* document representation into a specific output format (Markdown, HTML,
* Djot, plain text, etc.). They participate in the standard [`Plugin`]
* lifecycle so custom renderers can be registered from any supported binding
* language.
*
* The format name is exposed via [`Plugin::name`]. For stateless renderers
* the [`Plugin`] lifecycle methods (`version`, `initialize`, `shutdown`) all
* take no-op defaults and need not be overridden.
*
* # Thread Safety
*
* Renderers must be `Send + Sync` (inherited from [`Plugin`]).
* \\code
* use kreuzberg::plugins::{Plugin, Renderer};
* use kreuzberg::types::internal::InternalDocument;
* use kreuzberg::Result;
*
* struct CustomRenderer;
*
* impl Plugin for CustomRenderer {
*     fn name(&self) -> &str { "custom" }
* }
*
* impl Renderer for CustomRenderer {
*     fn render(&self, doc: &InternalDocument) -> Result<String> {
*         Ok(format!("Custom output with {} elements", doc.elements.len()))
*     }
* }
* \\endcode
*/
typedef struct KREUZBERGRenderer KREUZBERGRenderer;
/**
* Result-shape selection for extraction results.
*
* Distinct from `OutputFormat` (which controls rendering — Plain, Markdown,
* HTML, etc.). `ResultFormat` controls the *shape* of the result: a unified content
* blob vs. an element-based decomposition.
*/
typedef struct KREUZBERGResultFormat KREUZBERGResultFormat;
/**
* Best-effort document location for a revision.
*/
typedef struct KREUZBERGRevisionAnchor KREUZBERGRevisionAnchor;
/**
* The content changes that make up a single revision.
*
* For insertions and deletions the `content` field carries the added/removed
* lines as `DiffLine::Added` / `DiffLine::Removed` entries. For format
* changes, `content` is empty — the property diff is left as a TODO for a
* later enrichment pass.
*/
typedef struct KREUZBERGRevisionDelta KREUZBERGRevisionDelta;
/**
* Semantic classification of a tracked change.
*/
typedef struct KREUZBERGRevisionKind KREUZBERGRevisionKind;
/**
* Configuration for security limits across extractors.
*
* All limits are intentionally conservative to prevent DoS attacks
* while still supporting legitimate documents.
*/
typedef struct KREUZBERGSecurityLimits KREUZBERGSecurityLimits;
/**
* API server configuration.
*
* This struct holds all configuration options for the Kreuzberg API server,
* including host/port settings, CORS configuration, and upload limits.
*
* # Defaults
*
* - `host`: "127.0.0.1" (localhost only)
* - `port`: 8000
* - `cors_origins`: empty vector (allows all origins)
* - `max_request_body_bytes`: 104_857_600 (100 MB)
* - `max_multipart_field_bytes`: 104_857_600 (100 MB)
*/
typedef struct KREUZBERGServerConfig KREUZBERGServerConfig;
/**
* Structured data (Schema.org, microdata, RDFa) block.
*/
typedef struct KREUZBERGStructuredData KREUZBERGStructuredData;
typedef struct KREUZBERGStructuredDataResult KREUZBERGStructuredDataResult;
/**
* Structured data type classification.
*/
typedef struct KREUZBERGStructuredDataType KREUZBERGStructuredDataType;
/**
* Configuration for LLM-based structured data extraction.
*
* Sends extracted document content to an LLM with a JSON schema,
* returning structured data that conforms to the schema.
* \\code
* [structured_extraction]
* schema_name = "invoice_data"
* strict = true
*
* [structured_extraction.schema]
* type = "object"
* properties.vendor = { type = "string" }
* properties.total = { type = "number" }
* required = ["vendor", "total"]
*
* [structured_extraction.llm]
* model = "openai/gpt-4o"
* \\endcode
*/
typedef struct KREUZBERGStructuredExtractionConfig KREUZBERGStructuredExtractionConfig;
/**
* A supported document format entry.
*
* Represents a file extension and its corresponding MIME type that Kreuzberg can process.
*/
typedef struct KREUZBERGSupportedFormat KREUZBERGSupportedFormat;
/**
* Extracted table structure.
*
* Represents a table detected and extracted from a document (PDF, image, etc.).
* Tables are converted to both structured cell data and Markdown format.
*/
typedef struct KREUZBERGTable KREUZBERGTable;
/**
* Individual table cell with content and optional styling.
*
* Future extension point for rich table support with cell-level metadata.
*/
typedef struct KREUZBERGTableCell KREUZBERGTableCell;
/**
* Cell-level changes for a pair of tables that share the same index.
*/
typedef struct KREUZBERGTableDiff KREUZBERGTableDiff;
/**
* Structured table grid with cell-level metadata.
*
* Stores row/column dimensions and a flat list of cells with position info.
*/
typedef struct KREUZBERGTableGrid KREUZBERGTableGrid;
/**
* Which table structure recognition model to use.
*
* Controls the model used for table cell detection within layout-detected
* table regions. Wire format is snake_case in all serializers (JSON, TOML,
* YAML).
*/
typedef struct KREUZBERGTableModel KREUZBERGTableModel;
/**
* Tesseract OCR configuration.
*
* Provides fine-grained control over Tesseract OCR engine parameters.
* Most users can use the defaults, but these settings allow optimization
* for specific document types (invoices, handwriting, etc.).
*/
typedef struct KREUZBERGTesseractConfig KREUZBERGTesseractConfig;
/**
* Inline text annotation — byte-range based formatting and links.
*
* Annotations reference byte offsets into the node's text content,
* enabling precise identification of formatted regions.
*/
typedef struct KREUZBERGTextAnnotation KREUZBERGTextAnnotation;
/**
* Text direction enumeration for HTML documents.
*/
typedef struct KREUZBERGTextDirection KREUZBERGTextDirection;
/**
* Plain text and Markdown extraction result.
*
* Contains the extracted text along with statistics and,
* for Markdown files, structural elements like headers and links.
*/
typedef struct KREUZBERGTextExtractionResult KREUZBERGTextExtractionResult;
/**
* Text/Markdown metadata.
*
* Extracted from plain text and Markdown files. Includes word counts and,
* for Markdown, structural elements like headers and links.
*/
typedef struct KREUZBERGTextMetadata KREUZBERGTextMetadata;
typedef struct KREUZBERGTokenReductionConfig KREUZBERGTokenReductionConfig;
/**
* Token reduction configuration.
*/
typedef struct KREUZBERGTokenReductionOptions KREUZBERGTokenReductionOptions;
/**
* Configuration for tree-sitter language pack integration.
*
* Controls grammar download behavior and code analysis options.
*
* # Example (TOML)
*
* ```toml
* [tree_sitter]
* languages = ["python", "rust"]
* groups = ["web"]
*
* [tree_sitter.process]
* structure = true
* comments = true
* docstrings = true
* ```
*/
typedef struct KREUZBERGTreeSitterConfig KREUZBERGTreeSitterConfig;
/**
* Processing options for tree-sitter code analysis.
*
* Controls which analysis features are enabled when extracting code files.
*/
typedef struct KREUZBERGTreeSitterProcessConfig KREUZBERGTreeSitterProcessConfig;
/**
* Semantic classification of an extracted URI.
*/
typedef struct KREUZBERGUriKind KREUZBERGUriKind;
/**
* Trait for validator plugins.
*
* Validators check extraction results for quality, completeness, or correctness.
* Unlike post-processors, validator errors **fail fast** - if a validator returns
* an error, the extraction fails immediately.
*
* # Use Cases
*
* - **Quality Gates**: Ensure extracted content meets minimum quality standards
* - **Compliance**: Verify content meets regulatory requirements
* - **Content Filtering**: Reject documents containing unwanted content
* - **Format Validation**: Verify extracted content structure
* - **Security Checks**: Scan for malicious content
*
* # Error Handling
*
* Validator errors are **fatal** - they cause the extraction to fail and bubble up
* to the caller. Use validators for hard requirements that must be met.
*
* For non-fatal checks, use post-processors instead.
*
* # Thread Safety
*
* Validators must be thread-safe (`Send + Sync`).
* \\code
* use kreuzberg::plugins::{Plugin, Validator};
* use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
* use async_trait::async_trait;
*
* /// Validate that extracted content has minimum length
* struct MinimumLengthValidator {
*     min_length: usize,
* }
*
* impl Plugin for MinimumLengthValidator {
*     fn name(&self) -> &str { "min-length-validator" }
*     fn version(&self) -> String { "1.0.0".to_string() }
*     fn initialize(&self) -> Result<()> { Ok(()) }
*     fn shutdown(&self) -> Result<()> { Ok(()) }
* }
*
* #[async_trait]
* impl Validator for MinimumLengthValidator {
*     async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
*         -> Result<()> {
*         if result.content.len() < self.min_length {
*             return Err(KreuzbergError::validation(format!(
*                 "Content too short: {} < {} characters",
*                 result.content.len(),
*                 self.min_length
*             )));
*         }
*         Ok(())
*     }
* }
* \\endcode
*/
typedef struct KREUZBERGValidator KREUZBERGValidator;
/**
* Application properties from docProps/app.xml for XLSX.
*
* Contains Excel-specific document metadata.
*/
typedef struct KREUZBERGXlsxAppProperties KREUZBERGXlsxAppProperties;
/**
* XML extraction result.
*
* Contains extracted text content from XML files along with
* structural statistics about the XML document.
*/
typedef struct KREUZBERGXmlExtractionResult KREUZBERGXmlExtractionResult;
/**
* XML metadata extracted during XML parsing.
*
* Provides statistics about XML document structure.
*/
typedef struct KREUZBERGXmlMetadata KREUZBERGXmlMetadata;
/**
* YAKE-specific parameters.
*/
typedef struct KREUZBERGYakeParams KREUZBERGYakeParams;
/**
* Year range for bibliographic metadata.
*/
typedef struct KREUZBERGYearRange KREUZBERGYearRange;
"""
# Maps Rust `cfg` conditions to C preprocessor macros. Per the cbindgen
# documentation, items gated on the cfg named by the key are emitted inside a
# check for the macro named by the value, so C consumers building for that
# target are expected to define the macro themselves.
# NOTE(review): `SKIF_WINDOWS` does not follow the `KREUZBERG` naming used by
# `include_guard` and `[export].prefix` in this file — confirm it is intentional.
[defines]
"target_os = windows" = "SKIF_WINDOWS"
# Controls which items cbindgen exports and how they are named.
[export]
# Prepended to every exported item name; this is why the opaque typedefs in
# `after_includes` read e.g. `KREUZBERGOcrQualityThresholds` for the Rust type
# `OcrQualityThresholds`.
prefix = "KREUZBERG"
# Empty lists: no items are force-included or excluded by name here.
include = []
exclude = []
# Formatting of generated function declarations.
[fn]
# "vertical" is one of cbindgen's argument layouts ("auto", "horizontal",
# "vertical"); it places each function argument on its own line.
args = "vertical"