fil/crates/kreuzberg-ffi/cbindgen.toml

# This file is auto-generated by alef. DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
language = "C"
include_guard = "KREUZBERG_H"
pragma_once = true
autogen_warning = "/* This file is auto-generated by alef. DO NOT EDIT. */"
after_includes = """
/* Opaque type forward declarations */
/**
 * Hardware acceleration configuration for ONNX Runtime models.
 *
 * Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
 * for inference in layout detection and embedding generation.
 * \\code
 * use kreuzberg::AccelerationConfig;
 *
 * // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
 * let config = AccelerationConfig::default();
 *
 * // Force CPU only
 * let config = AccelerationConfig {
 *     provider: kreuzberg::ExecutionProviderType::Cpu,
 *     ..Default::default()
 * };
 * \\endcode
 */
typedef struct KREUZBERGAccelerationConfig KREUZBERGAccelerationConfig;
/**
 * Types of inline text annotations.
 */
typedef struct KREUZBERGAnnotationKind KREUZBERGAnnotationKind;
/**
 * A single file extracted from an archive.
 *
 * When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
 * enabled, each processable file produces its own full `ExtractionResult`.
 */
typedef struct KREUZBERGArchiveEntry KREUZBERGArchiveEntry;
/**
 * Archive (ZIP/TAR/7Z) metadata.
 *
 * Extracted from compressed archive files containing file lists and size information.
 */
typedef struct KREUZBERGArchiveMetadata KREUZBERGArchiveMetadata;
/**
 * Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right.
 */
typedef struct KREUZBERGBBox KREUZBERGBBox;
/**
 * Batch item for byte array extraction.
 *
 * Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
 * to represent a single item in a batch extraction job.
 */
typedef struct KREUZBERGBatchBytesItem KREUZBERGBatchBytesItem;
/**
 * Batch item for file extraction.
 *
 * Used with `batch_extract_files` and `batch_extract_files_sync`
 * to represent a single file in a batch extraction job.
 */
typedef struct KREUZBERGBatchFileItem KREUZBERGBatchFileItem;
/**
 * BibTeX bibliography metadata.
 */
typedef struct KREUZBERGBibtexMetadata KREUZBERGBibtexMetadata;
/**
 * Types of block-level elements in Djot.
 */
typedef struct KREUZBERGBlockType KREUZBERGBlockType;
/**
 * Bounding box coordinates for element positioning.
 */
typedef struct KREUZBERGBoundingBox KREUZBERGBoundingBox;
typedef struct KREUZBERGCacheStats KREUZBERGCacheStats;
/**
 * A single changed cell within a table.
 *
 * Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate::diff` re-exports this type verbatim.
 */
typedef struct KREUZBERGCellChange KREUZBERGCellChange;
/**
 * A text chunk with optional embedding and metadata.
 *
 * Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
 * contains the text content, optional embedding vector (if embedding generation
 * is configured), and metadata about its position in the document.
 */
typedef struct KREUZBERGChunk KREUZBERGChunk;
/**
 * Metadata about a chunk's position in the original document.
 */
typedef struct KREUZBERGChunkMetadata KREUZBERGChunkMetadata;
/**
 * How chunk size is measured.
 *
 * Defaults to `Characters` (Unicode character count). When using token-based sizing,
 * chunks are sized by token count according to the specified tokenizer.
 *
 * Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
 * available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
 * (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 */
typedef struct KREUZBERGChunkSizing KREUZBERGChunkSizing;
/**
 * Semantic structural classification of a text chunk.
 *
 * Assigned by the heuristic classifier in `chunking::classifier`.
 * Defaults to `Unknown` when no rule matches.
 * Designed to be extended in future versions without breaking changes.
 */
typedef struct KREUZBERGChunkType KREUZBERGChunkType;
/**
 * Type of text chunker to use.
 *
 * # Variants
 *
 * * `Text` - Generic text splitter, splits on whitespace and punctuation
 * * `Markdown` - Markdown-aware splitter, preserves formatting and structure
 * * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
 * * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
 *   embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
 *   lower = more splits). Without an embedding, falls back to a
 *   structural-boundary heuristic (ALL-CAPS headers, numbered sections,
 *   blank-line paragraphs) and merges groups into chunks capped at
 *   `max_characters` (default 1000). `topic_threshold` has no effect in the
 *   fallback path. For best results, pair with an embedding model.
 */
typedef struct KREUZBERGChunkerType KREUZBERGChunkerType;
/**
 * Chunking configuration.
 *
 * Configures text chunking for document content, including chunk size,
 * overlap, trimming behavior, and optional embeddings.
 *
 * Use `..Default::default()` when constructing to allow for future field additions:
 * ```rust
 * let config = ChunkingConfig {
 *     max_characters: 500,
 *     ..Default::default()
 * };
 * ```
 */
typedef struct KREUZBERGChunkingConfig KREUZBERGChunkingConfig;
/**
 * Citation file metadata (RIS, PubMed, EndNote).
 */
typedef struct KREUZBERGCitationMetadata KREUZBERGCitationMetadata;
/**
 * Content rendering mode for code extraction.
 *
 * Controls how extracted code content is represented in the `content` field
 * of `ExtractionResult`.
 */
typedef struct KREUZBERGCodeContentMode KREUZBERGCodeContentMode;
/**
 * Cross-extractor content filtering configuration.
 *
 * Controls whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is included in or stripped from extraction
 * results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
 * with format-specific implementation.
 *
 * When `None` on `ExtractionConfig`, each extractor uses its current
 * default behavior unchanged.
 */
typedef struct KREUZBERGContentFilterConfig KREUZBERGContentFilterConfig;
/**
 * Content layer classification for document nodes.
 *
 * Replaces separate body/furniture arrays with per-node granularity.
 */
typedef struct KREUZBERGContentLayer KREUZBERGContentLayer;
/**
 * JATS contributor with role.
 */
typedef struct KREUZBERGContributorRole KREUZBERGContributorRole;
/**
 * Dublin Core metadata from docProps/core.xml
 *
 * Contains standard metadata fields defined by the Dublin Core standard
 * and Office-specific extensions.
 */
typedef struct KREUZBERGCoreProperties KREUZBERGCoreProperties;
/**
 * CSV/TSV file metadata.
 */
typedef struct KREUZBERGCsvMetadata KREUZBERGCsvMetadata;
/**
 * dBASE field information.
 */
typedef struct KREUZBERGDbfFieldInfo KREUZBERGDbfFieldInfo;
/**
 * dBASE (DBF) file metadata.
 */
typedef struct KREUZBERGDbfMetadata KREUZBERGDbfMetadata;
/**
 * MIME type detection response.
 */
typedef struct KREUZBERGDetectResponse KREUZBERGDetectResponse;
/**
 * Page-level detection result containing all detections and page metadata.
 */
typedef struct KREUZBERGDetectionResult KREUZBERGDetectionResult;
/**
 * A single contiguous hunk in a unified diff.
 */
typedef struct KREUZBERGDiffHunk KREUZBERGDiffHunk;
/**
 * A single line in a unified-diff hunk.
 *
 * Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate::diff` re-exports this type verbatim.
 */
typedef struct KREUZBERGDiffLine KREUZBERGDiffLine;
/**
 * Options controlling how two `ExtractionResult` values are compared.
 */
typedef struct KREUZBERGDiffOptions KREUZBERGDiffOptions;
/**
 * Comprehensive Djot document structure with semantic preservation.
 *
 * This type captures the full richness of Djot markup, including:
 * - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - Attributes (classes, IDs, key-value pairs)
 * - Links, images, footnotes
 * - Math expressions (inline and display)
 * - Tables with full structure
 *
 * Available when the `djot` feature is enabled.
 */
typedef struct KREUZBERGDjotContent KREUZBERGDjotContent;
/**
 * Image element in Djot.
 */
typedef struct KREUZBERGDjotImage KREUZBERGDjotImage;
/**
 * Link element in Djot.
 */
typedef struct KREUZBERGDjotLink KREUZBERGDjotLink;
/**
 * Trait for document extractor plugins.
 *
 * Implement this trait to add support for new document formats or to override
 * built-in extraction behavior with custom logic.
 *
 * # Return Type
 *
 * Extractors return `InternalDocument`, a flat intermediate representation.
 * The pipeline converts this into the public `ExtractionResult` via the
 * derivation step.
 *
 * # Priority System
 *
 * When multiple extractors support the same MIME type, the registry selects
 * the extractor with the highest priority value. Use this to:
 * - Override built-in extractors (priority > 50)
 * - Provide fallback extractors (priority < 50)
 * - Implement specialized extractors for specific use cases
 *
 * Default priority is 50.
 *
 * # Thread Safety
 *
 * Extractors must be thread-safe (`Send + Sync`) to support concurrent extraction.
 * \\code
 * use kreuzberg::plugins::{Plugin, DocumentExtractor};
 * use kreuzberg::{Result, ExtractionConfig};
 * use kreuzberg::types::internal::InternalDocument;
 * use async_trait::async_trait;
 * use std::path::Path;
 *
 * /// Custom PDF extractor with premium features
 * struct PremiumPdfExtractor;
 *
 * impl Plugin for PremiumPdfExtractor {
 *     fn name(&self) -> &str { "premium-pdf" }
 *     fn version(&self) -> String { "2.0.0".to_string() }
 *     fn initialize(&self) -> Result<()> { Ok(()) }
 *     fn shutdown(&self) -> Result<()> { Ok(()) }
 * }
 *
 * #[async_trait]
 * impl DocumentExtractor for PremiumPdfExtractor {
 *     async fn extract_bytes(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
 *         -> Result<InternalDocument> {
 *         // Premium extraction logic with better accuracy
 *         let mut doc = InternalDocument::new("pdf");
 *         // ... populate doc.elements, doc.metadata, etc.
 *         Ok(doc)
 *     }
 *
 *     fn supported_mime_types(&self) -> &[&str] {
 *         &["application/pdf"]
 *     }
 *
 *     fn priority(&self) -> i32 {
 *         100  // Higher than default (50) - will be preferred
 *     }
 * }
 * \\endcode
 */
typedef struct KREUZBERGDocumentExtractor KREUZBERGDocumentExtractor;
/**
 * A single node in the document tree.
 *
 * Each node has deterministic `id`, typed `content`, optional `parent`/`children`
 * for tree structure, and metadata like page number, bounding box, and content layer.
 */
typedef struct KREUZBERGDocumentNode KREUZBERGDocumentNode;
/**
 * A resolved relationship between two nodes in the document tree.
 */
typedef struct KREUZBERGDocumentRelationship KREUZBERGDocumentRelationship;
/**
 * A single tracked change embedded in a document.
 *
 * Populated by per-format extractors that understand change-tracking metadata
 * (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
 * extractor defaults to `ExtractionResult.revisions = None` until a
 * format-specific implementation is added.
 */
typedef struct KREUZBERGDocumentRevision KREUZBERGDocumentRevision;
/**
 * Top-level structured document representation.
 *
 * A flat array of nodes with index-based parent/child references forming a tree.
 * Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
 * to iterate over top-level content by layer.
 *
 * # Validation
 *
 * Call `validate()` after construction to verify all node indices are in bounds
 * and parent-child relationships are bidirectionally consistent.
 */
typedef struct KREUZBERGDocumentStructure KREUZBERGDocumentStructure;
/**
 * Application properties from docProps/app.xml for DOCX
 *
 * Contains Word-specific document statistics and metadata.
 */
typedef struct KREUZBERGDocxAppProperties KREUZBERGDocxAppProperties;
/**
 * Word document metadata.
 *
 * Extracted from DOCX files using shared Office Open XML metadata extraction.
 * Integrates with `office_metadata` module for core/app/custom properties.
 */
typedef struct KREUZBERGDocxMetadata KREUZBERGDocxMetadata;
/**
 * Semantic element extracted from document.
 *
 * Represents a logical unit of content with semantic classification,
 * unique identifier, and metadata for tracking origin and position.
 */
typedef struct KREUZBERGElement KREUZBERGElement;
/**
 * Metadata for a semantic element.
 */
typedef struct KREUZBERGElementMetadata KREUZBERGElementMetadata;
/**
 * Semantic element type classification.
 *
 * Categorizes text content into semantic units for downstream processing.
 * Supports the element types commonly found in Unstructured documents.
 */
typedef struct KREUZBERGElementType KREUZBERGElementType;
/**
 * Email attachment representation.
 *
 * Contains metadata and optionally the content of an email attachment.
 */
typedef struct KREUZBERGEmailAttachment KREUZBERGEmailAttachment;
/**
 * Configuration for email extraction.
 */
typedef struct KREUZBERGEmailConfig KREUZBERGEmailConfig;
/**
 * Email extraction result.
 *
 * Complete representation of an extracted email message (.eml or .msg)
 * including headers, body content, and attachments.
 */
typedef struct KREUZBERGEmailExtractionResult KREUZBERGEmailExtractionResult;
/**
 * Email metadata extracted from .eml and .msg files.
 *
 * Includes sender/recipient information, message ID, and attachment list.
 */
typedef struct KREUZBERGEmailMetadata KREUZBERGEmailMetadata;
/**
 * Changes to embedded archive children between two results.
 */
typedef struct KREUZBERGEmbeddedChanges KREUZBERGEmbeddedChanges;
/**
 * Diff for a single embedded archive entry that appears in both results.
 */
typedef struct KREUZBERGEmbeddedDiff KREUZBERGEmbeddedDiff;
/**
 * Embedded file descriptor extracted from the PDF name tree.
 */
typedef struct KREUZBERGEmbeddedFile KREUZBERGEmbeddedFile;
/**
 * Trait for in-process embedding backend plugins.
 *
 * Async to match the convention used by `OcrBackend`,
 * `DocumentExtractor`, and `PostProcessor`.
 * Host-language bridges (PyO3, napi-rs, Rustler, extendr, magnus, ext-php-rs,
 * C FFI, etc.) wrap their synchronous host callables in `spawn_blocking` or the
 * equivalent to satisfy the async signature.
 *
 * # Thread safety
 *
 * Backends must be `Send + Sync + 'static`. They are stored in
 * `Arc<dyn EmbeddingBackend>` and called concurrently from kreuzberg's chunking
 * pipeline. If the backend's underlying model isn't thread-safe, the backend
 * itself must serialize access internally (e.g. via `Mutex<Inner>`).
 *
 * # Contract
 *
 * - `embed(texts)` MUST return exactly `texts.len()` vectors, each of length
 *   `self.dimensions()`. The dispatcher in `embed_texts`
 *   validates this before returning to downstream consumers; a non-conforming
 *   backend surfaces as a `KreuzbergError::Validation`, not a panic.
 * - `embed` may be called from any thread. Its future must be `Send`
 *   (enforced by `async_trait` when `#[async_trait]` is used on non-WASM targets).
 * - `dimensions()` is called exactly once at registration, immediately after
 *   `initialize()` succeeds. The returned value is cached by the registry and
 *   used for all subsequent shape validation. Lazy-loading implementations can
 *   defer model loading into `initialize()` and report the real dimension
 *   afterwards. Later mutations of the backend's reported dimension are not
 *   observed by kreuzberg — implementations that need to change dimension
 *   must unregister and re-register.
 * - `shutdown()` (inherited from `Plugin`) may be invoked
 *   concurrently with an in-flight `embed()` call. Implementations must
 *   tolerate this — e.g. by letting in-flight calls finish using resources
 *   held via the `Arc<dyn EmbeddingBackend>` reference, and only releasing
 *   shared state that isn't needed by `embed`.
 *
 * # Runtime
 *
 * The synchronous `embed_texts` entry uses
 * [`tokio::task::block_in_place`] to await the trait's async `embed`, which
 * requires a multi-thread tokio runtime. Callers running inside a
 * `current_thread` runtime (e.g. `#[tokio::test]` without `flavor = "multi_thread"`,
 * or `tokio::runtime::Builder::new_current_thread()`) must use
 * `embed_texts_async` instead, which awaits directly without
 * `block_in_place`.
 */
typedef struct KREUZBERGEmbeddingBackend KREUZBERGEmbeddingBackend;
/**
 * Embedding configuration for text chunks.
 *
 * Configures embedding generation using ONNX models via the vendored embedding engine.
 * Requires the `embeddings` feature to be enabled.
 */
typedef struct KREUZBERGEmbeddingConfig KREUZBERGEmbeddingConfig;
/**
 * Embedding model types supported by Kreuzberg.
 */
typedef struct KREUZBERGEmbeddingModelType KREUZBERGEmbeddingModelType;
/**
 * Preset configurations for common RAG use cases.
 *
 * Each preset combines chunk size, overlap, and embedding model
 * to provide an optimized configuration for specific scenarios.
 *
 * All string fields are owned `String` for FFI compatibility — instances
 * are safe to clone and pass across language boundaries.
 */
typedef struct KREUZBERGEmbeddingPreset KREUZBERGEmbeddingPreset;
/**
 * EPUB metadata (Dublin Core extensions).
 */
typedef struct KREUZBERGEpubMetadata KREUZBERGEpubMetadata;
/**
 * Error metadata (for batch operations).
 */
typedef struct KREUZBERGErrorMetadata KREUZBERGErrorMetadata;
/**
 * Excel/spreadsheet format metadata.
 *
 * Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
 * discriminant. Sheet count and sheet names are stored inside this struct.
 */
typedef struct KREUZBERGExcelMetadata KREUZBERGExcelMetadata;
/**
 * Single Excel worksheet.
 *
 * Represents one sheet from an Excel workbook with its content
 * converted to Markdown format and dimensional statistics.
 */
typedef struct KREUZBERGExcelSheet KREUZBERGExcelSheet;
/**
 * Excel workbook representation.
 *
 * Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
 * extracted content and metadata.
 */
typedef struct KREUZBERGExcelWorkbook KREUZBERGExcelWorkbook;
/**
 * ONNX Runtime execution provider type.
 *
 * Determines which hardware backend is used for model inference.
 * `Auto` (default) selects the best available provider per platform.
 */
typedef struct KREUZBERGExecutionProviderType KREUZBERGExecutionProviderType;
/**
 * Extracted image from a document.
 *
 * Contains raw image data, metadata, and optional nested OCR results.
 * Raw bytes allow cross-language compatibility - users can convert to
 * PIL.Image (Python), Sharp (Node.js), or other formats as needed.
 */
typedef struct KREUZBERGExtractedImage KREUZBERGExtractedImage;
/**
 * A URI extracted from a document.
 *
 * Represents any link, reference, or resource pointer found during extraction.
 * The `kind` field classifies the URI semantically, while `label` carries
 * optional human-readable display text.
 */
typedef struct KREUZBERGExtractedUri KREUZBERGExtractedUri;
/**
 * Main extraction configuration.
 *
 * This struct contains all configuration options for the extraction process.
 * It can be loaded from TOML, YAML, or JSON files, or created programmatically.
 * \\code
 * use kreuzberg::core::config::ExtractionConfig;
 *
 * // Create with defaults
 * let config = ExtractionConfig::default();
 *
 * // Load from TOML file
 * // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
 * \\endcode
 */
typedef struct KREUZBERGExtractionConfig KREUZBERGExtractionConfig;
/**
 * The complete diff between two `ExtractionResult` values.
 */
typedef struct KREUZBERGExtractionDiff KREUZBERGExtractionDiff;
/**
 * How the extracted text was produced.
 */
typedef struct KREUZBERGExtractionMethod KREUZBERGExtractionMethod;
/**
 * General extraction result used by the core extraction API.
 *
 * This is the main result type returned by all extraction functions.
 */
typedef struct KREUZBERGExtractionResult KREUZBERGExtractionResult;
/**
 * FictionBook (FB2) metadata.
 */
typedef struct KREUZBERGFictionBookMetadata KREUZBERGFictionBookMetadata;
/**
 * Per-file extraction configuration overrides for batch processing.
 *
 * All fields are `Option<T>` — `None` means "use the batch-level default."
 * This type is used with `batch_extract_files` and
 * `batch_extract_bytes` to allow heterogeneous
 * extraction settings within a single batch.
 *
 * # Excluded Fields
 *
 * The following `ExtractionConfig` fields are batch-level only and
 * cannot be overridden per file:
 * - `max_concurrent_extractions` — controls batch parallelism
 * - `use_cache` — global caching policy
 * - `acceleration` — shared ONNX execution provider
 * - `security_limits` — global archive security policy
 * \\code
 * use kreuzberg::FileExtractionConfig;
 *
 * // Override just OCR forcing for a specific file
 * let config = FileExtractionConfig {
 *     force_ocr: Some(true),
 *     ..Default::default()
 * };
 * \\endcode
 */
typedef struct KREUZBERGFileExtractionConfig KREUZBERGFileExtractionConfig;
/**
 * Footnote in Djot.
 */
typedef struct KREUZBERGFootnote KREUZBERGFootnote;
/**
 * Format-specific metadata (discriminated union).
 *
 * Only one format type can exist per extraction result. This provides
 * type-safe, clean metadata without nested optionals.
 */
typedef struct KREUZBERGFormatMetadata KREUZBERGFormatMetadata;
/**
 * Block-level element in a Djot document.
 *
 * Represents structural elements like headings, paragraphs, lists, code blocks, etc.
 */
typedef struct KREUZBERGFormattedBlock KREUZBERGFormattedBlock;
/**
 * Individual grid cell with position and span metadata.
 */
typedef struct KREUZBERGGridCell KREUZBERGGridCell;
/**
 * Header/heading element metadata.
 */
typedef struct KREUZBERGHeaderMetadata KREUZBERGHeaderMetadata;
/**
 * Heading context for a chunk within a Markdown document.
 *
 * Contains the heading hierarchy from document root to this chunk's section.
 */
typedef struct KREUZBERGHeadingContext KREUZBERGHeadingContext;
/**
 * A single heading in the hierarchy.
 */
typedef struct KREUZBERGHeadingLevel KREUZBERGHeadingLevel;
/**
 * A text block with hierarchy level assignment.
 *
 * Represents a block of text with semantic heading information extracted from
 * font size clustering and hierarchical analysis.
 */
typedef struct KREUZBERGHierarchicalBlock KREUZBERGHierarchicalBlock;
/**
 * Hierarchy extraction configuration for PDF text structure analysis.
 *
 * Enables extraction of document hierarchy levels (H1-H6) based on font size
 * clustering and semantic analysis. When enabled, hierarchical blocks are
 * included in page content.
 */
typedef struct KREUZBERGHierarchyConfig KREUZBERGHierarchyConfig;
/**
 * HTML metadata extracted from HTML documents.
 *
 * Includes document-level metadata, Open Graph data, Twitter Card metadata,
 * and extracted structural elements (headers, links, images, structured data).
 */
typedef struct KREUZBERGHtmlMetadata KREUZBERGHtmlMetadata;
/**
 * Configuration for styled HTML output.
 *
 * When set on [`ExtractionConfig::html_output`] alongside
 * `output_format = OutputFormat::Html`, the pipeline builds a
 * `StyledHtmlRenderer` (crate::rendering::StyledHtmlRenderer) instead of
 * the plain comrak-based renderer.
 * \\code
 * use kreuzberg::core::config::{HtmlOutputConfig, HtmlTheme};
 *
 * let config = HtmlOutputConfig {
 *     theme: HtmlTheme::GitHub,
 *     css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
 *     ..Default::default()
 * };
 * \\endcode
 */
typedef struct KREUZBERGHtmlOutputConfig KREUZBERGHtmlOutputConfig;
/**
 * Built-in HTML theme selection.
 */
typedef struct KREUZBERGHtmlTheme KREUZBERGHtmlTheme;
/**
 * Image extraction configuration.
 */
typedef struct KREUZBERGImageExtractionConfig KREUZBERGImageExtractionConfig;
/**
 * Heuristic classification of what an image likely depicts.
 */
typedef struct KREUZBERGImageKind KREUZBERGImageKind;
/**
 * Image metadata extracted from image files.
 *
 * Includes dimensions, format, and EXIF data.
 */
typedef struct KREUZBERGImageMetadata KREUZBERGImageMetadata;
/**
 * Image element metadata.
 */
typedef struct KREUZBERGImageMetadataType KREUZBERGImageMetadataType;
/**
 * Image preprocessing configuration for OCR.
 *
 * These settings control how images are preprocessed before OCR to improve
 * text recognition quality. Different preprocessing strategies work better
 * for different document types.
 */
typedef struct KREUZBERGImagePreprocessingConfig KREUZBERGImagePreprocessingConfig;
/**
 * Image preprocessing metadata.
 *
 * Tracks the transformations applied to an image during OCR preprocessing,
 * including DPI normalization, resizing, and resampling.
 */
typedef struct KREUZBERGImagePreprocessingMetadata KREUZBERGImagePreprocessingMetadata;
/**
 * Image type classification.
 */
typedef struct KREUZBERGImageType KREUZBERGImageType;
/**
 * Inline element within a block.
 *
 * Represents text with formatting, links, images, etc.
 */
typedef struct KREUZBERGInlineElement KREUZBERGInlineElement;
/**
 * Types of inline elements in Djot.
 */
typedef struct KREUZBERGInlineType KREUZBERGInlineType;
/**
 * JATS (Journal Article Tag Suite) metadata.
 */
typedef struct KREUZBERGJatsMetadata KREUZBERGJatsMetadata;
/**
 * Extracted keyword with metadata.
 */
typedef struct KREUZBERGKeyword KREUZBERGKeyword;
/**
 * Keyword algorithm selection.
 */
typedef struct KREUZBERGKeywordAlgorithm KREUZBERGKeywordAlgorithm;
/**
 * Keyword extraction configuration.
 */
typedef struct KREUZBERGKeywordConfig KREUZBERGKeywordConfig;
/**
 * Language detection configuration.
 */
typedef struct KREUZBERGLanguageDetectionConfig KREUZBERGLanguageDetectionConfig;
/**
 * The 17 canonical document layout classes.
 *
 * All model backends (RT-DETR, YOLO, etc.) map their native class IDs
 * to this shared set. Models with fewer classes (DocLayNet: 11, PubLayNet: 5)
 * map to the closest equivalent.
 *
 * Wire format is snake_case in all serializers (JSON, TOML, YAML).
 */
typedef struct KREUZBERGLayoutClass KREUZBERGLayoutClass;
/**
 * A single layout detection result.
 */
typedef struct KREUZBERGLayoutDetection KREUZBERGLayoutDetection;
/**
 * Layout detection configuration.
 *
 * Controls layout detection behavior in the extraction pipeline.
 * When set on `ExtractionConfig` (super::ExtractionConfig), layout detection
 * is enabled for PDF extraction.
 */
typedef struct KREUZBERGLayoutDetectionConfig KREUZBERGLayoutDetectionConfig;
/**
 * A detected layout region on a page.
 *
 * When layout detection is enabled, each page may have layout regions
 * identifying different content types (text, pictures, tables, etc.)
 * with confidence scores and spatial positions.
 */
typedef struct KREUZBERGLayoutRegion KREUZBERGLayoutRegion;
/**
 * Link element metadata.
 */
typedef struct KREUZBERGLinkMetadata KREUZBERGLinkMetadata;
/**
 * Link type classification.
 */
typedef struct KREUZBERGLinkType KREUZBERGLinkType;
/**
 * Type of list detection.
 */
typedef struct KREUZBERGListType KREUZBERGListType;
/**
 * Configuration for an LLM provider/model via liter-llm.
 *
 * Each feature (VLM OCR, VLM embeddings, structured extraction) carries
 * its own `LlmConfig`, allowing different providers per feature.
 * \\code
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * api_key = "sk-..."  # or use KREUZBERG_LLM_API_KEY env var
 * \\endcode
 */
typedef struct KREUZBERGLlmConfig KREUZBERGLlmConfig;
/**
 * Token usage and cost data for a single LLM call made during extraction.
 *
 * Populated when VLM OCR, structured extraction, or LLM-based embeddings
 * are used. Multiple entries may be present when multiple LLM calls occur
 * within one extraction (e.g. VLM OCR + structured extraction).
 */
typedef struct KREUZBERGLlmUsage KREUZBERGLlmUsage;
/**
 * Extraction result metadata.
 *
 * Contains common fields applicable to all formats, format-specific metadata
 * via a discriminated union, and additional custom fields from postprocessors.
 */
typedef struct KREUZBERGMetadata KREUZBERGMetadata;
/**
 * Combined paths to all models needed for OCR (backward compatibility).
 */
typedef struct KREUZBERGModelPaths KREUZBERGModelPaths;
/**
 * Tagged enum for node content. Each variant carries only type-specific data.
 *
 * Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
 * Go/Java/TypeScript bindings.
 */
typedef struct KREUZBERGNodeContent KREUZBERGNodeContent;
/**
 * Trait for OCR backend plugins.
 *
 * Implement this trait to add custom OCR capabilities. OCR backends can be:
 * - Native Rust implementations (like Tesseract)
 * - FFI bridges to Python libraries (like EasyOCR, PaddleOCR)
 * - Cloud-based OCR services (Google Vision, AWS Textract, etc.)
 *
 * # Thread Safety
 *
 * OCR backends must be thread-safe (`Send + Sync`) to support concurrent processing.
 * \\code
 * use kreuzberg::plugins::{Plugin, OcrBackend, OcrBackendType};
 * use kreuzberg::{Result, OcrConfig};
 * use async_trait::async_trait;
 * use std::borrow::Cow;
 * use std::path::Path;
 * use kreuzberg::types::{ExtractionResult, Metadata};
 *
 * struct CustomOcrBackend;
 *
 * impl Plugin for CustomOcrBackend {
 *     fn name(&self) -> &str { "custom-ocr" }
 *     fn version(&self) -> String { "1.0.0".to_string() }
 *     fn initialize(&self) -> Result<()> { Ok(()) }
 *     fn shutdown(&self) -> Result<()> { Ok(()) }
 * }
 *
 * #[async_trait]
 * impl OcrBackend for CustomOcrBackend {
 *     async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
 *         // Implement OCR logic here
 *         Ok(ExtractionResult {
 *             content: "Extracted text".to_string(),
 *             mime_type: Cow::Borrowed("text/plain"),
 *             ..Default::default()
 *         })
 *     }
 *
 *     async fn process_image_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
 *         let bytes = std::fs::read(path)?;
 *         self.process_image(&bytes, config).await
 *     }
 *
 *     fn supports_language(&self, lang: &str) -> bool {
 *         matches!(lang, "eng" | "deu" | "fra")
 *     }
 *
 *     fn backend_type(&self) -> OcrBackendType {
 *         OcrBackendType::Custom
 *     }
 * }
 * \\endcode
 */
typedef struct KREUZBERGOcrBackend KREUZBERGOcrBackend;
/**
 * OCR backend types.
 */
typedef struct KREUZBERGOcrBackendType KREUZBERGOcrBackendType;
/**
 * Bounding geometry for an OCR element.
 *
 * Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
 * (from PaddleOCR and rotated text detection).
 */
typedef struct KREUZBERGOcrBoundingGeometry KREUZBERGOcrBoundingGeometry;
/**
 * Confidence scores for an OCR element.
 *
 * Separates detection confidence (how confident that text exists at this location)
 * from recognition confidence (how confident about the actual text content).
 */
typedef struct KREUZBERGOcrConfidence KREUZBERGOcrConfidence;
/**
 * OCR configuration.
 */
typedef struct KREUZBERGOcrConfig KREUZBERGOcrConfig;
/**
 * A unified OCR element representing detected text with full metadata.
 *
 * This is the primary type for structured OCR output, preserving all information
 * from both Tesseract and PaddleOCR backends.
 */
typedef struct KREUZBERGOcrElement KREUZBERGOcrElement;
/**
 * Configuration for OCR element extraction.
 *
 * Controls how OCR elements are extracted and filtered.
 */
typedef struct KREUZBERGOcrElementConfig KREUZBERGOcrElementConfig;
/**
 * Hierarchical level of an OCR element.
 *
 * Maps to Tesseract's page segmentation hierarchy and provides
 * equivalent semantics for PaddleOCR.
 */
typedef struct KREUZBERGOcrElementLevel KREUZBERGOcrElementLevel;
/**
 * OCR extraction result.
 *
 * Result of performing OCR on an image or scanned document,
 * including recognized text and detected tables.
 */
typedef struct KREUZBERGOcrExtractionResult KREUZBERGOcrExtractionResult;
/**
 * OCR processing metadata.
 *
 * Captures information about OCR processing configuration and results.
 */
typedef struct KREUZBERGOcrMetadata KREUZBERGOcrMetadata;
/**
 * Multi-backend OCR pipeline with quality-based fallback.
 *
 * Backends are tried in priority order (highest first). After each backend
 * produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
 * the result is accepted. Otherwise the next backend is tried.
 */
typedef struct KREUZBERGOcrPipelineConfig KREUZBERGOcrPipelineConfig;
/**
 * A single backend stage in the OCR pipeline.
 */
typedef struct KREUZBERGOcrPipelineStage KREUZBERGOcrPipelineStage;
/**
 * Quality thresholds for OCR fallback decisions and pipeline quality gating.
 *
 * All fields default to the values that match the previous hardcoded behavior,
 * so `OcrQualityThresholds::default()` preserves existing semantics exactly.
 */
typedef struct KREUZBERGOcrQualityThresholds KREUZBERGOcrQualityThresholds;
/**
 * Rotation information for an OCR element.
 */
typedef struct KREUZBERGOcrRotation KREUZBERGOcrRotation;
/**
 * Table detected via OCR.
 *
 * Represents a table structure recognized during OCR processing.
 */
typedef struct KREUZBERGOcrTable KREUZBERGOcrTable;
/**
 * Bounding box for an OCR-detected table in pixel coordinates.
 */
typedef struct KREUZBERGOcrTableBoundingBox KREUZBERGOcrTableBoundingBox;
/**
 * Document orientation detection result.
 */
typedef struct KREUZBERGOrientationResult KREUZBERGOrientationResult;
/**
 * Output format for extraction results.
 *
 * Controls the format of the `content` field in `ExtractionResult`.
 * When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
 * `Plain` returns the raw extracted text.
 * `Structured` returns JSON with full OCR element data including bounding
 * boxes and confidence scores.
 */
typedef struct KREUZBERGOutputFormat KREUZBERGOutputFormat;
/**
 * Page Segmentation Mode for Tesseract OCR
 */
typedef struct KREUZBERGPSMMode KREUZBERGPSMMode;
/**
 * Supported languages in PaddleOCR.
 *
 * Maps user-friendly language codes to paddle-ocr-rs language identifiers.
 */
typedef struct KREUZBERGPaddleLanguage KREUZBERGPaddleLanguage;
/**
 * Configuration for PaddleOCR backend.
 *
 * Configures PaddleOCR text detection and recognition with multi-language support.
 * Uses a builder pattern for convenient configuration.
 * \\code
 * use kreuzberg::PaddleOcrConfig;
 *
 * // Create with default English configuration
 * let config = PaddleOcrConfig::new("en");
 *
 * // Create with custom cache directory
 * let config = PaddleOcrConfig::new("ch")
 *     .with_cache_dir("/path/to/cache".into());
 *
 * // Enable table detection
 * let config = PaddleOcrConfig::new("en")
 *     .with_table_detection(true);
 * \\endcode
 */
typedef struct KREUZBERGPaddleOcrConfig KREUZBERGPaddleOcrConfig;
/**
 * Byte offset boundary for a page.
 *
 * Tracks where a specific page's content starts and ends in the main content string,
 * enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
 * at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
 */
typedef struct KREUZBERGPageBoundary KREUZBERGPageBoundary;
/**
 * Page extraction and tracking configuration.
 *
 * Controls how pages are extracted, tracked, and represented in the extraction results.
 * When `None`, page tracking is disabled.
 *
 * Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
 * when page boundaries are available and chunking is configured.
 */
typedef struct KREUZBERGPageConfig KREUZBERGPageConfig;
/**
 * Content for a single page/slide.
 *
 * When page extraction is enabled, documents are split into per-page content
 * with associated tables and images mapped to each page.
 *
 * # Performance
 *
 * Uses Arc-wrapped tables and images for memory efficiency:
 * - `Vec<Arc<Table>>` enables zero-copy sharing of table data
 * - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
 * - Maintains exact JSON compatibility via custom Serialize/Deserialize
 *
 * This reduces memory overhead for documents with shared tables/images
 * by avoiding redundant copies during serialization.
 */
typedef struct KREUZBERGPageContent KREUZBERGPageContent;
/**
 * Page hierarchy structure containing heading levels and block information.
 *
 * Used when PDF text hierarchy extraction is enabled. Contains hierarchical
 * blocks with heading levels (H1-H6) for semantic document structure.
 */
typedef struct KREUZBERGPageHierarchy KREUZBERGPageHierarchy;
/**
 * Metadata for individual page/slide/sheet.
 *
 * Captures per-page information including dimensions, content counts,
 * and visibility state (for presentations).
 */
typedef struct KREUZBERGPageInfo KREUZBERGPageInfo;
/**
 * Unified page structure for documents.
 *
 * Supports different page types (PDF pages, PPTX slides, Excel sheets)
 * with character offset boundaries for chunk-to-page mapping.
 */
typedef struct KREUZBERGPageStructure KREUZBERGPageStructure;
/**
 * Type of paginated unit in a document.
 *
 * Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet
 * sheets).
 */
typedef struct KREUZBERGPageUnitType KREUZBERGPageUnitType;
/**
 * A PDF annotation extracted from a document page.
 */
typedef struct KREUZBERGPdfAnnotation KREUZBERGPdfAnnotation;
/**
 * Type of PDF annotation.
 */
typedef struct KREUZBERGPdfAnnotationType KREUZBERGPdfAnnotationType;
/**
 * PDF-specific configuration.
 */
typedef struct KREUZBERGPdfConfig KREUZBERGPdfConfig;
/**
 * PDF-specific metadata.
 *
 * Contains metadata fields specific to PDF documents that are not in the common
 * `Metadata` structure. Common fields like title, authors, keywords, and dates
 * are at the `Metadata` level.
 */
typedef struct KREUZBERGPdfMetadata KREUZBERGPdfMetadata;
/**
 * Base trait that all plugins must implement.
 *
 * This trait provides common functionality for plugin lifecycle management,
 * identification, and metadata.
 *
 * # Thread Safety
 *
 * All plugins must be `Send + Sync` to support concurrent usage across threads.
 * \\code
 * use kreuzberg::plugins::Plugin;
 * use kreuzberg::Result;
 * use std::sync::atomic::{AtomicBool, Ordering};
 *
 * struct MyPlugin {
 *     initialized: AtomicBool,
 * }
 *
 * impl Plugin for MyPlugin {
 *     fn name(&self) -> &str {
 *         "my-plugin"
 *     }
 *
 *     fn version(&self) -> String {
 *         "1.0.0".to_string()
 *     }
 *
 *     fn initialize(&self) -> Result<()> {
 *         self.initialized.store(true, Ordering::Release);
 *         println!("Plugin initialized!");
 *         Ok(())
 *     }
 *
 *     fn shutdown(&self) -> Result<()> {
 *         self.initialized.store(false, Ordering::Release);
 *         println!("Plugin shutdown!");
 *         Ok(())
 *     }
 * }
 * \\endcode
 */
typedef struct KREUZBERGPlugin KREUZBERGPlugin;
/**
 * Trait for post-processor plugins.
 *
 * Post-processors transform or enrich extraction results after the initial
 * extraction is complete. They can:
 * - Clean and normalize text
 * - Add metadata (language, keywords, entities)
 * - Split content into chunks
 * - Score quality
 * - Apply custom transformations
 *
 * # Processing Order
 *
 * Post-processors are executed in stage order:
 * 1. **Early** - Language detection, entity extraction
 * 2. **Middle** - Keyword extraction, token reduction
 * 3. **Late** - Custom hooks, final validation
 *
 * Within each stage, processors are executed in registration order.
 *
 * # Error Handling
 *
 * Post-processor errors are non-fatal by default - they're captured in metadata
 * and execution continues. To make errors fatal, return an error from `process()`.
 *
 * # Thread Safety
 *
 * Post-processors must be thread-safe (`Send + Sync`).
 * \\code
 * use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
 * use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
 * use async_trait::async_trait;
 *
 * /// Add word count metadata to extraction results
 * struct WordCountProcessor;
 *
 * impl Plugin for WordCountProcessor {
 *     fn name(&self) -> &str { "word-count" }
 *     fn version(&self) -> String { "1.0.0".to_string() }
 *     fn initialize(&self) -> Result<()> { Ok(()) }
 *     fn shutdown(&self) -> Result<()> { Ok(()) }
 * }
 *
 * #[async_trait]
 * impl PostProcessor for WordCountProcessor {
 *     async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig)
 *         -> Result<()> {
 *         // Count words
 *         let word_count = result.content.split_whitespace().count();
 *
 *         // Add to metadata
 *         result.metadata.additional.insert("word_count".to_string().into(), serde_json::json!(word_count));
 *
 *         Ok(())
 *     }
 *
 *     fn processing_stage(&self) -> ProcessingStage {
 *         ProcessingStage::Early
 *     }
 * }
 * \\endcode
 */
typedef struct KREUZBERGPostProcessor KREUZBERGPostProcessor;
/**
 * Post-processor configuration.
 */
typedef struct KREUZBERGPostProcessorConfig KREUZBERGPostProcessorConfig;
/**
 * Application properties from docProps/app.xml for PPTX
 *
 * Contains PowerPoint-specific document metadata.
 */
typedef struct KREUZBERGPptxAppProperties KREUZBERGPptxAppProperties;
/**
 * PowerPoint (PPTX) extraction result.
 *
 * Contains extracted slide content, metadata, and embedded images/tables.
 */
typedef struct KREUZBERGPptxExtractionResult KREUZBERGPptxExtractionResult;
/**
 * PowerPoint presentation metadata.
 *
 * Extracted from PPTX files containing slide counts and presentation details.
 */
typedef struct KREUZBERGPptxMetadata KREUZBERGPptxMetadata;
/**
 * Processing stages for post-processors.
 *
 * Post-processors are executed in stage order (Early → Middle → Late).
 * Use stages to control the order of post-processing operations.
 */
typedef struct KREUZBERGProcessingStage KREUZBERGProcessingStage;
/**
 * A non-fatal warning from a processing pipeline stage.
 *
 * Captures errors from optional features that don't prevent extraction
 * but may indicate degraded results.
 */
typedef struct KREUZBERGProcessingWarning KREUZBERGProcessingWarning;
/**
 * Outlook PST archive metadata.
 */
typedef struct KREUZBERGPstMetadata KREUZBERGPstMetadata;
/**
 * RAKE-specific parameters.
 */
typedef struct KREUZBERGRakeParams KREUZBERGRakeParams;
/**
 * Pre-computed table markdown for a table detection region.
 *
 * Produced by the TATR-based table structure recognizer and surfaced as part of
 * layout-aware OCR results.  The struct lives here (under `layout-types`, pure-Rust)
 * so that consumers who do not enable `layout-detection` (ORT) can still reference
 * the type in their own code.
 */
typedef struct KREUZBERGRecognizedTable KREUZBERGRecognizedTable;
typedef struct KREUZBERGReductionLevel KREUZBERGReductionLevel;
/**
 * Semantic kind of a relationship between document elements.
 */
typedef struct KREUZBERGRelationshipKind KREUZBERGRelationshipKind;
/**
 * Trait for document renderers that convert [`InternalDocument`] to output strings.
 *
 * Renderers are typically stateless converters that transform the internal
 * document representation into a specific output format (Markdown, HTML,
 * Djot, plain text, etc.). They participate in the standard [`Plugin`]
 * lifecycle so custom renderers can be registered from any supported binding
 * language.
 *
 * The format name is exposed via [`Plugin::name`]. For stateless renderers
 * the [`Plugin`] lifecycle methods (`version`, `initialize`, `shutdown`) all
 * take no-op defaults and need not be overridden.
 *
 * # Thread Safety
 *
 * Renderers must be `Send + Sync` (inherited from [`Plugin`]).
 * \\code
 * use kreuzberg::plugins::{Plugin, Renderer};
 * use kreuzberg::types::internal::InternalDocument;
 * use kreuzberg::Result;
 *
 * struct CustomRenderer;
 *
 * impl Plugin for CustomRenderer {
 *     fn name(&self) -> &str { "custom" }
 * }
 *
 * impl Renderer for CustomRenderer {
 *     fn render(&self, doc: &InternalDocument) -> Result<String> {
 *         Ok(format!("Custom output with {} elements", doc.elements.len()))
 *     }
 * }
 * \\endcode
 */
typedef struct KREUZBERGRenderer KREUZBERGRenderer;
/**
 * Result-shape selection for extraction results.
 *
 * Distinct from `OutputFormat` (which controls rendering — Plain, Markdown,
 * HTML, etc.). `ResultFormat` controls the *shape* of the result: a unified content
 * blob vs. an element-based decomposition.
 */
typedef struct KREUZBERGResultFormat KREUZBERGResultFormat;
/**
 * Best-effort document location for a revision.
 */
typedef struct KREUZBERGRevisionAnchor KREUZBERGRevisionAnchor;
/**
 * The content changes that make up a single revision.
 *
 * For insertions and deletions the `content` field carries the added/removed
 * lines as `DiffLine::Added` / `DiffLine::Removed` entries. For format
 * changes, `content` is empty — the property diff is left as a TODO for a
 * later enrichment pass.
 */
typedef struct KREUZBERGRevisionDelta KREUZBERGRevisionDelta;
/**
 * Semantic classification of a tracked change.
 */
typedef struct KREUZBERGRevisionKind KREUZBERGRevisionKind;
/**
 * Configuration for security limits across extractors.
 *
 * All limits are intentionally conservative to prevent DoS attacks
 * while still supporting legitimate documents.
 */
typedef struct KREUZBERGSecurityLimits KREUZBERGSecurityLimits;
/**
 * API server configuration.
 *
 * This struct holds all configuration options for the Kreuzberg API server,
 * including host/port settings, CORS configuration, and upload limits.
 *
 * # Defaults
 *
 * - `host`: "127.0.0.1" (localhost only)
 * - `port`: 8000
 * - `cors_origins`: empty vector (allows all origins)
 * - `max_request_body_bytes`: 104_857_600 (100 MB)
 * - `max_multipart_field_bytes`: 104_857_600 (100 MB)
 */
typedef struct KREUZBERGServerConfig KREUZBERGServerConfig;
/**
 * Structured data (Schema.org, microdata, RDFa) block.
 */
typedef struct KREUZBERGStructuredData KREUZBERGStructuredData;
typedef struct KREUZBERGStructuredDataResult KREUZBERGStructuredDataResult;
/**
 * Structured data type classification.
 */
typedef struct KREUZBERGStructuredDataType KREUZBERGStructuredDataType;
/**
 * Configuration for LLM-based structured data extraction.
 *
 * Sends extracted document content to a VLM with a JSON schema,
 * returning structured data that conforms to the schema.
 * \\code
 * [structured_extraction]
 * schema_name = "invoice_data"
 * strict = true
 *
 * [structured_extraction.schema]
 * type = "object"
 * properties.vendor = { type = "string" }
 * properties.total = { type = "number" }
 * required = ["vendor", "total"]
 *
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * \\endcode
 */
typedef struct KREUZBERGStructuredExtractionConfig KREUZBERGStructuredExtractionConfig;
/**
 * A supported document format entry.
 *
 * Represents a file extension and its corresponding MIME type that Kreuzberg can process.
 */
typedef struct KREUZBERGSupportedFormat KREUZBERGSupportedFormat;
/**
 * Extracted table structure.
 *
 * Represents a table detected and extracted from a document (PDF, image, etc.).
 * Tables are converted to both structured cell data and Markdown format.
 */
typedef struct KREUZBERGTable KREUZBERGTable;
/**
 * Individual table cell with content and optional styling.
 *
 * Future extension point for rich table support with cell-level metadata.
 */
typedef struct KREUZBERGTableCell KREUZBERGTableCell;
/**
 * Cell-level changes for a pair of tables that share the same index.
 */
typedef struct KREUZBERGTableDiff KREUZBERGTableDiff;
/**
 * Structured table grid with cell-level metadata.
 *
 * Stores row/column dimensions and a flat list of cells with position info.
 */
typedef struct KREUZBERGTableGrid KREUZBERGTableGrid;
/**
 * Which table structure recognition model to use.
 *
 * Controls the model used for table cell detection within layout-detected
 * table regions. Wire format is snake_case in all serializers (JSON, TOML,
 * YAML).
 */
typedef struct KREUZBERGTableModel KREUZBERGTableModel;
/**
 * Tesseract OCR configuration.
 *
 * Provides fine-grained control over Tesseract OCR engine parameters.
 * Most users can use the defaults, but these settings allow optimization
 * for specific document types (invoices, handwriting, etc.).
 */
typedef struct KREUZBERGTesseractConfig KREUZBERGTesseractConfig;
/**
 * Inline text annotation — byte-range based formatting and links.
 *
 * Annotations reference byte offsets into the node's text content,
 * enabling precise identification of formatted regions.
 */
typedef struct KREUZBERGTextAnnotation KREUZBERGTextAnnotation;
/**
 * Text direction enumeration for HTML documents.
 */
typedef struct KREUZBERGTextDirection KREUZBERGTextDirection;
/**
 * Plain text and Markdown extraction result.
 *
 * Contains the extracted text along with statistics and,
 * for Markdown files, structural elements like headers and links.
 */
typedef struct KREUZBERGTextExtractionResult KREUZBERGTextExtractionResult;
/**
 * Text/Markdown metadata.
 *
 * Extracted from plain text and Markdown files. Includes word counts and,
 * for Markdown, structural elements like headers and links.
 */
typedef struct KREUZBERGTextMetadata KREUZBERGTextMetadata;
typedef struct KREUZBERGTokenReductionConfig KREUZBERGTokenReductionConfig;
/**
 * Token reduction configuration.
 */
typedef struct KREUZBERGTokenReductionOptions KREUZBERGTokenReductionOptions;
/**
 * Configuration for tree-sitter language pack integration.
 *
 * Controls grammar download behavior and code analysis options.
 *
 * # Example (TOML)
 *
 * ```toml
 * [tree_sitter]
 * languages = ["python", "rust"]
 * groups = ["web"]
 *
 * [tree_sitter.process]
 * structure = true
 * comments = true
 * docstrings = true
 * ```
 */
typedef struct KREUZBERGTreeSitterConfig KREUZBERGTreeSitterConfig;
/**
 * Processing options for tree-sitter code analysis.
 *
 * Controls which analysis features are enabled when extracting code files.
 */
typedef struct KREUZBERGTreeSitterProcessConfig KREUZBERGTreeSitterProcessConfig;
/**
 * Semantic classification of an extracted URI.
 */
typedef struct KREUZBERGUriKind KREUZBERGUriKind;
/**
 * Trait for validator plugins.
 *
 * Validators check extraction results for quality, completeness, or correctness.
 * Unlike post-processors, validator errors **fail fast** - if a validator returns
 * an error, the extraction fails immediately.
 *
 * # Use Cases
 *
 * - **Quality Gates**: Ensure extracted content meets minimum quality standards
 * - **Compliance**: Verify content meets regulatory requirements
 * - **Content Filtering**: Reject documents containing unwanted content
 * - **Format Validation**: Verify extracted content structure
 * - **Security Checks**: Scan for malicious content
 *
 * # Error Handling
 *
 * Validator errors are **fatal** - they cause the extraction to fail and bubble up
 * to the caller. Use validators for hard requirements that must be met.
 *
 * For non-fatal checks, use post-processors instead.
 *
 * # Thread Safety
 *
 * Validators must be thread-safe (`Send + Sync`).
 * \\code
 * use kreuzberg::plugins::{Plugin, Validator};
 * use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
 * use async_trait::async_trait;
 *
 * /// Validate that extracted content has minimum length
 * struct MinimumLengthValidator {
 *     min_length: usize,
 * }
 *
 * impl Plugin for MinimumLengthValidator {
 *     fn name(&self) -> &str { "min-length-validator" }
 *     fn version(&self) -> String { "1.0.0".to_string() }
 *     fn initialize(&self) -> Result<()> { Ok(()) }
 *     fn shutdown(&self) -> Result<()> { Ok(()) }
 * }
 *
 * #[async_trait]
 * impl Validator for MinimumLengthValidator {
 *     async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
 *         -> Result<()> {
 *         if result.content.len() < self.min_length {
 *             return Err(KreuzbergError::validation(format!(
 *                 "Content too short: {} < {} characters",
 *                 result.content.len(),
 *                 self.min_length
 *             )));
 *         }
 *         Ok(())
 *     }
 * }
 * \\endcode
 */
typedef struct KREUZBERGValidator KREUZBERGValidator;
/**
 * Application properties from docProps/app.xml for XLSX
 *
 * Contains Excel-specific document metadata.
 */
typedef struct KREUZBERGXlsxAppProperties KREUZBERGXlsxAppProperties;
/**
 * XML extraction result.
 *
 * Contains extracted text content from XML files along with
 * structural statistics about the XML document.
 */
typedef struct KREUZBERGXmlExtractionResult KREUZBERGXmlExtractionResult;
/**
 * XML metadata extracted during XML parsing.
 *
 * Provides statistics about XML document structure.
 */
typedef struct KREUZBERGXmlMetadata KREUZBERGXmlMetadata;
/**
 * YAKE-specific parameters.
 */
typedef struct KREUZBERGYakeParams KREUZBERGYakeParams;
/**
 * Year range for bibliographic metadata.
 */
typedef struct KREUZBERGYearRange KREUZBERGYearRange;
"""
# Maps Rust `cfg` predicates to C preprocessor macros: cbindgen wraps items
# gated on `target_os = "windows"` in `#if defined(SKIF_WINDOWS)` in the header.
# NOTE(review): `SKIF_WINDOWS` does not follow the `KREUZBERG` naming used by
# `include_guard` and `[export].prefix` in this file. Confirm it is intentional
# before renaming, since C consumers may already define this macro.
[defines]
"target_os = windows" = "SKIF_WINDOWS"

# Controls how exported items are named and which items are selected.
[export]
# Prepended to the name of every exported item in the generated header
# (the forward declarations in `after_includes` use the same prefix).
prefix = "KREUZBERG"
# Empty lists: nothing is force-included or excluded beyond cbindgen's defaults.
include = []
exclude = []

# Formatting of generated function declarations.
[fn]
# Place each function argument on its own line.
args = "vertical"