packages/php/stubs/kreuzberg_extension.php

<?php

// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
// Type stubs for the native PHP extension — declares classes
// provided at runtime by the compiled Rust extension (.so/.dll).
// Include this in phpstan.neon scanFiles for static analysis.

declare(strict_types=1);

namespace Kreuzberg {
/**
 * Exception type declared by this stub on behalf of the native extension.
 *
 * Extends \RuntimeException. The method body below is a placeholder that only
 * executes if this stub file is loaded in place of the compiled extension.
 */
class KreuzbergException extends \RuntimeException
{
    /**
     * Declared to return an integer error code for this exception.
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public function getErrorCode(): int
    {
        // Use the same placeholder message as the other stub methods in this file
        // so an accidentally loaded stub explains where the real code lives.
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Read-only snapshot of cache statistics.
 *
 * All five values are passed to the constructor and exposed as public readonly
 * properties. Units are implied by the property names (megabytes, days) —
 * TODO confirm against the native extension.
 */
final class CacheStats
{
    public function __construct(
        /** File count (presumably the number of cached files — TODO confirm). */
        public readonly int $totalFiles,
        /** Total size; the name suggests megabytes. */
        public readonly float $totalSizeMb,
        /** Available space; the name suggests megabytes. */
        public readonly float $availableSpaceMb,
        /** Age of the oldest file; the name suggests days. */
        public readonly float $oldestFileAgeDays,
        /** Age of the newest file; the name suggests days. */
        public readonly float $newestFileAgeDays,
    ) {
    }
}

/**
 * Hardware acceleration settings for ONNX Runtime models.
 *
 * Selects the execution provider (CPU, CoreML, CUDA, TensorRT) used for
 * inference in layout detection and embedding generation.
 */
final class AccelerationConfig
{
    public function __construct(
        /** Execution provider to use for ONNX inference. */
        public readonly ExecutionProviderType $provider,
        /** GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto. */
        public readonly int $deviceId,
    ) {
    }
}

/**
 * Cross-extractor content filtering configuration.
 *
 * Decides whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is kept in or stripped from extraction results.
 * The setting applies to all extractors (PDF, DOCX, RTF, ODT, HTML, etc.),
 * each with a format-specific implementation.
 *
 * When `ExtractionConfig::$contentFilter` is `null`, each extractor keeps its
 * current default behavior unchanged.
 */
final class ContentFilterConfig
{
    public function __construct(
        /**
         * Include running headers in extraction output.
         *
         * - PDF: disables top-margin furniture stripping and prevents the layout
         *   model from treating `PageHeader`-classified regions as furniture.
         * - DOCX: includes document headers in text output.
         * - RTF/ODT: headers are already included; this is a no-op when true.
         * - HTML/EPUB: keeps `<header>` element content.
         *
         * Default: `false` (headers are stripped or excluded).
         */
        public readonly bool $includeHeaders,
        /**
         * Include running footers in extraction output.
         *
         * - PDF: disables bottom-margin furniture stripping and prevents the layout
         *   model from treating `PageFooter`-classified regions as furniture.
         * - DOCX: includes document footers in text output.
         * - RTF/ODT: footers are already included; this is a no-op when true.
         * - HTML/EPUB: keeps `<footer>` element content.
         *
         * Default: `false` (footers are stripped or excluded).
         */
        public readonly bool $includeFooters,
        /**
         * Enable the heuristic cross-page repeating text detector.
         *
         * When `true` (default), text that repeats verbatim across a supermajority
         * of pages is classified as furniture and stripped. Turn this off if brand
         * names or repeated headings are being removed incorrectly by the heuristic.
         *
         * Note: when a layout-detection model is active, the model may independently
         * classify page-header / page-footer regions as furniture on a per-page
         * basis. To preserve those regions, set `$includeHeaders`, `$includeFooters`,
         * or both to `true` in addition to disabling this flag.
         *
         * Primarily affects PDF extraction.
         *
         * Default: `true`.
         */
        public readonly bool $stripRepeatingText,
        /**
         * Include watermark text in extraction output.
         *
         * - PDF: keeps watermark artifacts and arXiv identifiers.
         * - Other formats: no effect currently.
         *
         * Default: `false` (watermarks are stripped).
         */
        public readonly bool $includeWatermarks,
    ) {
    }

    /**
     * Stub for the native factory (presumably yields the documented defaults —
     * TODO confirm).
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): ContentFilterConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings for email extraction.
 */
final class EmailConfig
{
    public function __construct(
        /**
         * Windows codepage number used when an MSG file carries no codepage
         * property. Defaults to `null`, which falls back to windows-1252.
         *
         * An unrecognized or invalid codepage number (including 0) silently falls
         * back to windows-1252 — the same outcome as when the MSG file itself
         * contains an unrecognized codepage. No error or warning is emitted, so
         * verify the output when supplying unusual values.
         *
         * Common values:
         * - 1250: Central European (Polish, Czech, Hungarian, etc.)
         * - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
         * - 1252: Western European (default)
         * - 1253: Greek
         * - 1254: Turkish
         * - 1255: Hebrew
         * - 1256: Arabic
         * - 932:  Japanese (Shift-JIS)
         * - 936:  Simplified Chinese (GBK)
         */
        public readonly ?int $msgFallbackCodepage = null,
    ) {
    }
}

/**
 * Main extraction configuration.
 *
 * Holds every option that controls the extraction process. A configuration can
 * be loaded from TOML, YAML, or JSON files, or created programmatically.
 * Optional sections are nullable; what `null` means is noted on each property.
 */
final class ExtractionConfig
{
    public function __construct(
        /** Enable caching of extraction results. */
        public readonly bool $useCache,
        /** Enable quality post-processing. */
        public readonly bool $enableQualityProcessing,
        /** Force OCR even for searchable PDFs. */
        public readonly bool $forceOcr,
        /**
         * Disable OCR entirely, even for images.
         *
         * When `true`, OCR is skipped for all document types. Images return
         * metadata only (dimensions, format, EXIF) without text extraction, and
         * PDFs use only native text extraction without OCR fallback.
         *
         * Cannot be `true` at the same time as `$forceOcr`.
         *
         * Added in v4.7.0.
         */
        public readonly bool $disableOcr,
        /**
         * Result structure format.
         *
         * Chooses between the unified format (default), with all content in the
         * `content` field, and the element-based format with semantic elements
         * (for Unstructured-compatible output).
         */
        public readonly ResultFormat $resultFormat,
        /**
         * Content text format (default: Plain).
         *
         * Selects the format of the extracted content:
         * - `Plain`: raw extracted text (default)
         * - `Markdown`: Markdown formatted output
         * - `Djot`: Djot markup format (requires the djot feature)
         * - `Html`: HTML formatted output
         *
         * With a structured format, extraction results include formatted output.
         * The `formatted_content` field may be populated when format conversion
         * is applied.
         */
        public readonly OutputFormat $outputFormat,
        /**
         * Run layout detection on the non-OCR PDF markdown path.
         *
         * When `true` and `$layout` is not `null`, layout regions inform heading,
         * table, list, and figure detection in the structure pipeline, which would
         * otherwise rely on font-clustering heuristics alone. This significantly
         * improves SF1 (structural F1) at the cost of inference latency
         * (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
         * Requires the `layout-detection` feature.
         */
        public readonly bool $useLayoutForMarkdown,
        /**
         * Enable structured document tree output.
         *
         * When true, the `document` field on `ExtractionResult` is populated with
         * a hierarchical `DocumentStructure` containing heading-driven section
         * nesting, table grids, content layer classification, and inline
         * annotations.
         *
         * Independent of `$resultFormat` — can be combined with Unified or
         * ElementBased.
         */
        public readonly bool $includeDocumentStructure,
        /**
         * Maximum recursion depth for archive extraction (default: 3).
         * Set to 0 to disable recursive extraction (legacy behavior).
         */
        public readonly int $maxArchiveDepth,
        /** OCR configuration (`null` = OCR disabled). */
        public readonly ?OcrConfig $ocr = null,
        /**
         * Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
         *
         * When set, only the listed pages are OCR'd regardless of text layer
         * quality; unlisted pages use native text extraction. Ignored when
         * `$forceOcr` is `true`. Only applies to PDF documents. Duplicates are
         * automatically deduplicated. An `$ocr` config is recommended for
         * backend/language selection; defaults are used if it is absent.
         *
         * @var array<int>|null
         */
        public readonly ?array $forceOcrPages = null,
        /** Text chunking configuration (`null` = chunking disabled). */
        public readonly ?ChunkingConfig $chunking = null,
        /**
         * Content filtering configuration (`null` = use extractor defaults).
         *
         * Decides whether document "furniture" (headers, footers, watermarks,
         * repeating text) is kept in or stripped from extraction results.
         * See `ContentFilterConfig` for per-property documentation.
         */
        public readonly ?ContentFilterConfig $contentFilter = null,
        /** Image extraction configuration (`null` = no image extraction). */
        public readonly ?ImageExtractionConfig $images = null,
        /** PDF-specific options (`null` = use defaults). */
        public readonly ?PdfConfig $pdfOptions = null,
        /** Token reduction configuration (`null` = no token reduction). */
        public readonly ?TokenReductionOptions $tokenReduction = null,
        /** Language detection configuration (`null` = no language detection). */
        public readonly ?LanguageDetectionConfig $languageDetection = null,
        /** Page extraction configuration (`null` = no page tracking). */
        public readonly ?PageConfig $pages = null,
        /** Keyword extraction configuration (`null` = no keyword extraction). */
        public readonly ?KeywordConfig $keywords = null,
        /** Post-processor configuration (`null` = use defaults). */
        public readonly ?PostProcessorConfig $postprocessor = null,
        /**
         * HTML to Markdown conversion options (`null` = use defaults).
         *
         * Configures how HTML documents are converted to Markdown, including
         * heading styles, list formatting, code block styles, and preprocessing
         * options.
         */
        public readonly ?string $htmlOptions = null,
        /**
         * Styled HTML output configuration.
         *
         * When set while `$outputFormat` selects HTML, the extraction pipeline
         * uses `StyledHtmlRenderer`, which emits stable `kb-*` CSS class hooks on
         * every structural element and optionally embeds theme CSS or
         * user-supplied CSS in a `<style>` block.
         *
         * When `null`, the existing plain comrak-based HTML renderer is used.
         */
        public readonly ?HtmlOutputConfig $htmlOutput = null,
        /**
         * Default per-file timeout in seconds for batch extraction.
         *
         * When set, each file in a batch is canceled after this duration unless
         * overridden by `FileExtractionConfig::$timeoutSecs`.
         *
         * The underlying library documents a default of 60 seconds, chosen to stop
         * pathological files (e.g. deeply nested archives, documents with millions
         * of cells) from running indefinitely and exhausting caller resources.
         * `null` disables the timeout for trusted input or long-running workloads.
         *
         * NOTE(review): the constructor default declared here is `null`; confirm
         * whether omitting the argument yields 60 seconds or no timeout.
         */
        public readonly ?int $extractionTimeoutSecs = null,
        /**
         * Maximum concurrent extractions in batch operations
         * (`null` = (num_cpus × 1.5).ceil()).
         *
         * Limits parallelism to prevent resource exhaustion when processing large
         * batches.
         */
        public readonly ?int $maxConcurrentExtractions = null,
        /**
         * Security limits for archive extraction.
         *
         * Controls maximum archive size, compression ratio, file count, and other
         * security thresholds to prevent decompression bomb attacks. Also caps
         * nesting depth, iteration count, entity / token length, total content
         * size, and table cell count for every extraction path that ingests
         * user-controlled bytes.
         * When `null`, default limits are used.
         */
        public readonly ?SecurityLimits $securityLimits = null,
        /**
         * Maximum uncompressed size in bytes for a single embedded file before
         * recursive extraction is attempted (default: 50 MiB).
         *
         * Applies to embedded objects inside OOXML containers (DOCX, PPTX) and to
         * email attachments processed via recursive extraction. Files over the
         * limit are skipped with a `ProcessingWarning` instead of being passed to
         * the extraction pipeline, so a single oversized embedded object cannot
         * consume unbounded memory or time.
         *
         * `null` disables the per-embedded-file cap (the `max_archive_size` limit
         * of `$securityLimits` is then the only guard).
         */
        public readonly ?int $maxEmbeddedFileBytes = null,
        /**
         * Layout detection configuration (`null` = layout detection disabled).
         *
         * When set, PDF pages and images are analyzed for document structure
         * (headings, code, formulas, tables, figures, etc.) using RT-DETR models
         * via ONNX Runtime. For PDFs, layout hints override paragraph
         * classification in the markdown pipeline. For images, per-region OCR is
         * performed with markdown formatting based on detected layout classes.
         * Running inference requires the `layout-detection` feature; the field is
         * present whenever the `layout-types` feature is active (which includes
         * `layout-detection` as well as the no-ORT target groups).
         */
        public readonly ?LayoutDetectionConfig $layout = null,
        /**
         * Hardware acceleration configuration for ONNX Runtime models.
         *
         * Controls execution provider selection for layout detection and embedding
         * models. When `null`, platform defaults are used (CoreML on macOS, CUDA
         * on Linux, CPU on Windows).
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Cache namespace for tenant isolation.
         *
         * When set, cache entries are stored under `{cache_dir}/{namespace}/`.
         * Must contain only alphanumerics, hyphens, or underscores (max 64 chars).
         * Different namespaces have isolated cache spaces on the same filesystem.
         */
        public readonly ?string $cacheNamespace = null,
        /**
         * Per-request cache TTL in seconds.
         *
         * Overrides the global `max_age_days` for this specific extraction.
         * When `0`, caching is skipped completely (no read or write).
         * When `null`, the global TTL applies.
         */
        public readonly ?int $cacheTtlSecs = null,
        /**
         * Email extraction configuration (`null` = use defaults).
         *
         * Currently supports configuring the fallback codepage for MSG files that
         * do not specify one. See `EmailConfig` for details.
         */
        public readonly ?EmailConfig $email = null,
        /**
         * Concurrency limits for constrained environments (`null` = use defaults).
         *
         * Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
         * (when `$maxConcurrentExtractions` is unset) the batch concurrency
         * semaphore. See `ConcurrencyConfig` for details.
         *
         * NOTE(review): declared as `?string` in this stub; the expected string
         * form is not visible here — confirm.
         */
        public readonly ?string $concurrency = null,
        /**
         * Tree-sitter language pack configuration (`null` = tree-sitter disabled).
         *
         * When set, enables code file extraction using tree-sitter parsers.
         * Controls grammar download behavior and code analysis options.
         */
        public readonly ?TreeSitterConfig $treeSitter = null,
        /**
         * Structured extraction via LLM (`null` = disabled).
         *
         * When set, the extracted document content is sent to an LLM with the
         * provided JSON schema. The structured response is stored in
         * `ExtractionResult::structured_output`.
         */
        public readonly ?StructuredExtractionConfig $structuredExtraction = null,
        /**
         * Cancellation token for this extraction (`null` = no external
         * cancellation).
         *
         * Upstream documentation: pass a `CancellationToken` clone here and call
         * `CancellationToken::cancel` from another thread / task to abort the
         * extraction in progress. The extractor checks the token at safe
         * checkpoints (before lock acquisition, between pages, between batch
         * items) and returns `KreuzbergError::Cancelled` when set.
         *
         * The field is excluded from serialization because `CancellationToken` is
         * a runtime handle, not a configuration value.
         *
         * NOTE(review): declared as `?string` in this stub; how a token is handed
         * over from PHP is not visible here — confirm.
         */
        public readonly ?string $cancelToken = null,
    ) {
    }

    /**
     * Stub for the native factory (presumably yields a configuration with
     * default values — TODO confirm).
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): ExtractionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub for a native predicate; the name suggests it reports whether this
     * configuration requires image processing — TODO confirm.
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public function needsImageProcessing(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Per-file extraction configuration overrides for batch processing.
 *
 * Every property is nullable — `null` means "use the batch-level default."
 * This type is used with `batch_extract_files` and `batch_extract_bytes` to
 * allow heterogeneous extraction settings within a single batch.
 *
 * Excluded settings: the following `ExtractionConfig` properties are
 * batch-level only and cannot be overridden per file:
 * - `$maxConcurrentExtractions` — controls batch parallelism
 * - `$useCache` — global caching policy
 * - `$acceleration` — shared ONNX execution provider
 * - `$securityLimits` — global archive security policy
 */
final class FileExtractionConfig
{
    public function __construct(
        /** Override quality post-processing for this file. */
        public readonly ?bool $enableQualityProcessing = null,
        /** Override OCR configuration for this file (`null` = use batch default). */
        public readonly ?OcrConfig $ocr = null,
        /** Override force OCR for this file. */
        public readonly ?bool $forceOcr = null,
        /**
         * Override force OCR pages for this file (1-indexed page numbers).
         *
         * @var array<int>|null
         */
        public readonly ?array $forceOcrPages = null,
        /** Override disable OCR for this file. */
        public readonly ?bool $disableOcr = null,
        /** Override chunking configuration for this file. */
        public readonly ?ChunkingConfig $chunking = null,
        /** Override content filtering configuration for this file. */
        public readonly ?ContentFilterConfig $contentFilter = null,
        /** Override image extraction configuration for this file. */
        public readonly ?ImageExtractionConfig $images = null,
        /** Override PDF options for this file. */
        public readonly ?PdfConfig $pdfOptions = null,
        /** Override token reduction for this file. */
        public readonly ?TokenReductionOptions $tokenReduction = null,
        /** Override language detection for this file. */
        public readonly ?LanguageDetectionConfig $languageDetection = null,
        /** Override page extraction for this file. */
        public readonly ?PageConfig $pages = null,
        /** Override keyword extraction for this file. */
        public readonly ?KeywordConfig $keywords = null,
        /** Override post-processor for this file. */
        public readonly ?PostProcessorConfig $postprocessor = null,
        /** Override HTML conversion options for this file. */
        public readonly ?string $htmlOptions = null,
        /** Override result format for this file. */
        public readonly ?ResultFormat $resultFormat = null,
        /** Override output content format for this file. */
        public readonly ?OutputFormat $outputFormat = null,
        /** Override document structure output for this file. */
        public readonly ?bool $includeDocumentStructure = null,
        /** Override layout detection for this file. */
        public readonly ?LayoutDetectionConfig $layout = null,
        /**
         * Override per-file extraction timeout in seconds.
         *
         * When set, the extraction for this file is canceled after the specified
         * duration. A timed-out file produces an error result without affecting
         * other files in the batch.
         */
        public readonly ?int $timeoutSecs = null,
        /** Override tree-sitter configuration for this file. */
        public readonly ?TreeSitterConfig $treeSitter = null,
        /**
         * Override structured extraction configuration for this file.
         *
         * When set, enables LLM-based structured extraction with a JSON schema for
         * this specific file. The extracted content is sent to a VLM/LLM and the
         * response is parsed according to the provided schema.
         */
        public readonly ?StructuredExtractionConfig $structuredExtraction = null,
    ) {
    }
}

/**
 * One in-memory item of a batch extraction job.
 *
 * Used with `batch_extract_bytes` and `batch_extract_bytes_sync` to represent
 * a single byte-array entry in the batch.
 */
final class BatchBytesItem
{
    public function __construct(
        /** The content bytes to extract from. */
        public readonly string $content,
        /** MIME type of the content (e.g., "application/pdf", "text/html"). */
        public readonly string $mimeType,
        /** Per-item configuration overrides (`null` uses batch-level defaults). */
        public readonly ?FileExtractionConfig $config = null,
    ) {
    }
}

/**
 * One file of a batch extraction job.
 *
 * Used with `batch_extract_files` and `batch_extract_files_sync` to represent
 * a single file entry in the batch.
 */
final class BatchFileItem
{
    public function __construct(
        /** Path to the file to extract from. */
        public readonly string $path,
        /** Per-file configuration overrides (`null` uses batch-level defaults). */
        public readonly ?FileExtractionConfig $config = null,
    ) {
    }
}

/**
 * Settings for extracting images from documents.
 */
final class ImageExtractionConfig
{
    public function __construct(
        /** Extract images from documents. */
        public readonly bool $extractImages,
        /** Target DPI for image normalization. */
        public readonly int $targetDpi,
        /** Maximum dimension for images (width or height). */
        public readonly int $maxImageDimension,
        /**
         * Whether to inject image reference placeholders into markdown output.
         *
         * When `true` (default), image references like
         * `![Image 1](embedded:p1_i0)` are appended to the markdown. Set to
         * `false` to extract images as data without polluting the markdown output.
         */
        public readonly bool $injectPlaceholders,
        /** Automatically adjust DPI based on image content. */
        public readonly bool $autoAdjustDpi,
        /** Minimum DPI threshold. */
        public readonly int $minDpi,
        /** Maximum DPI threshold. */
        public readonly int $maxDpi,
        /**
         * When `true` (default), extracted images are classified by kind and
         * grouped into clusters where they appear to belong to one figure.
         */
        public readonly bool $classify,
        /**
         * When `true`, full-page renders produced during OCR preprocessing are
         * captured and returned as `ImageKind::PageRaster` entries in
         * `ExtractionResult.images`.
         *
         * **PDF + OCR only.** No rasters are captured for non-PDF inputs or when
         * the document-level OCR bypass is active (whole-document backend). When
         * OCR is enabled and this flag is set but the active backend skips
         * per-page rendering, a `ProcessingWarning` is emitted in
         * `ExtractionResult.processing_warnings`.
         *
         * Defaults to `false`. Enable it when downstream consumers need page
         * thumbnails (e.g. citation previews, visual grounding).
         */
        public readonly bool $includePageRasters,
        /**
         * Run OCR on extracted images and include the recognized text in the
         * document content.
         *
         * When `true` (default) and `ExtractionConfig::$ocr` is configured,
         * extracted images are processed with the configured OCR backend. Set to
         * `false` to extract images without OCR processing, even when OCR is
         * enabled.
         */
        public readonly bool $runOcrOnImages,
        /**
         * When `true`, image OCR results are rendered as plain text without the
         * `![...](...)` markdown placeholder. Only takes effect when
         * `$runOcrOnImages` is also `true`.
         */
        public readonly bool $ocrTextOnly,
        /**
         * When `true` and `$ocrTextOnly` is `false`, the OCR text is appended
         * after the image placeholder in the rendered output.
         */
        public readonly bool $appendOcrText,
        /**
         * Maximum number of image objects to extract per PDF page.
         *
         * Some PDFs (e.g. technical diagrams stored as thousands of raster
         * fragments) can trigger extremely long or indefinite extraction times
         * when every image object on a dense page is decoded individually via the
         * PDF extractor. With this limit set, kreuzberg stops collecting
         * individual images once the per-page count reaches the cap and emits a
         * warning instead.
         *
         * `null` (default) means no limit — all images are extracted.
         */
        public readonly ?int $maxImagesPerPage = null,
    ) {
    }

    /**
     * Stub for the native factory (presumably yields the documented defaults —
     * TODO confirm).
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): ImageExtractionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings for token reduction.
 */
final class TokenReductionOptions
{
    public function __construct(
        /** Reduction mode: "off", "light", "moderate", "aggressive", "maximum". */
        public readonly string $mode,
        /** Preserve important words (capitalized, technical terms). */
        public readonly bool $preserveImportantWords,
    ) {
    }

    /**
     * Stub for the native factory (presumably yields default options —
     * TODO confirm).
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): TokenReductionOptions
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings for language detection.
 */
final class LanguageDetectionConfig
{
    public function __construct(
        /** Enable language detection. */
        public readonly bool $enabled,
        /** Minimum confidence threshold (0.0-1.0). */
        public readonly float $minConfidence,
        /** Detect multiple languages in the document. */
        public readonly bool $detectMultiple,
    ) {
    }

    /**
     * Stub for the native factory (presumably yields a default configuration —
     * TODO confirm).
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): LanguageDetectionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Configuration for styled HTML output.
 *
 * When set on `ExtractionConfig::$htmlOutput` while
 * `ExtractionConfig::$outputFormat` selects HTML, the pipeline builds a
 * `StyledHtmlRenderer` instead of the plain comrak-based renderer.
 */
final class HtmlOutputConfig
{
    public function __construct(
        /** Built-in colour/typography theme. Default: `HtmlTheme::Unstyled`. */
        public readonly HtmlTheme $theme,
        /**
         * CSS class prefix applied to every emitted class name.
         *
         * Default: `"kb-"`. Change it if your host application already uses
         * classes that start with `kb-`.
         */
        public readonly string $classPrefix,
        /**
         * When `true` (default), the resolved CSS is written into a `<style>`
         * block immediately after the opening `<div class="{prefix}doc">`.
         *
         * Set to `false` to emit only the structural markup and wire up your own
         * stylesheet targeting the `kb-*` class names.
         */
        public readonly bool $embedCss,
        /**
         * Inline CSS string injected into the output after the theme stylesheet.
         * Concatenated after the `$cssFile` content when both are set.
         */
        public readonly ?string $css = null,
        /**
         * Path to a CSS file loaded once at renderer construction time.
         * Concatenated before `$css` when both are set.
         */
        public readonly ?string $cssFile = null,
    ) {
    }

    /**
     * Stub for the native factory (presumably yields the documented defaults —
     * TODO confirm).
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): HtmlOutputConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings for layout detection.
 *
 * Governs how layout detection behaves inside the extraction pipeline.
 * Setting it on `ExtractionConfig` (super.ExtractionConfig) turns layout
 * detection on for PDF extraction.
 */
final class LayoutDetectionConfig
{
    public function __construct(
        /** @var bool Apply the postprocessing heuristics or not (default: true). */
        public readonly bool $applyHeuristics,
        /**
         * Model used for table structure recognition.
         *
         * Selects the model that detects table cells inside table regions found
         * by layout detection. Default: `TableModel::Tatr`.
         *
         * @var TableModel
         */
        public readonly TableModel $tableModel,
        /** @var ?float Overrides the confidence threshold (`null` = keep the model default). */
        public readonly ?float $confidenceThreshold = null,
        /**
         * Hardware acceleration for the ONNX models (layout detection + table structure).
         *
         * When provided, it selects the execution provider (CPU, CUDA, CoreML,
         * TensorRT) used for inference. The default is `null`, meaning the
         * provider is auto-selected per platform.
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): LayoutDetectionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Describes one LLM provider/model reached through liter-llm.
 *
 * Every feature (VLM OCR, VLM embeddings, structured extraction) holds its
 * own `LlmConfig`, so each feature can use a different provider.
 *
 * # Example
 *
 * ```toml
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * api_key = "sk-..."  # or use KREUZBERG_LLM_API_KEY env var
 * ```
 */
final class LlmConfig
{
    public function __construct(
        /**
         * Provider/model identifier in the liter-llm routing format.
         *
         * For instance `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`
         * or `"groq/llama-3.1-70b-versatile"`.
         *
         * @var string
         */
        public readonly string $model,
        /**
         * Provider API key. With `null`, liter-llm falls back to the provider's
         * standard environment variable (e.g., `OPENAI_API_KEY`).
         *
         * @var ?string
         */
        public readonly ?string $apiKey = null,
        /** @var ?string Overrides the base URL of the provider endpoint. */
        public readonly ?string $baseUrl = null,
        /** @var ?int Timeout of a request, in seconds (default: 60). */
        public readonly ?int $timeoutSecs = null,
        /** @var ?int Upper bound on retry attempts (default: 3). */
        public readonly ?int $maxRetries = null,
        /** @var ?float Sampling temperature used for generation tasks. */
        public readonly ?float $temperature = null,
        /** @var ?int Upper bound on the number of generated tokens. */
        public readonly ?int $maxTokens = null,
    ) {
    }
}

/**
 * Settings for LLM-driven structured data extraction.
 *
 * The extracted document content is sent to a VLM together with a JSON
 * schema, and the structured data that comes back conforms to that schema.
 *
 * # Example
 *
 * ```toml
 * [structured_extraction]
 * schema_name = "invoice_data"
 * strict = true
 *
 * [structured_extraction.schema]
 * type = "object"
 * properties.vendor = { type = "string" }
 * properties.total = { type = "number" }
 * required = ["vendor", "total"]
 *
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * ```
 */
final class StructuredExtractionConfig
{
    public function __construct(
        /** @var string JSON Schema that describes the wanted output structure. */
        public readonly string $schema,
        /** @var string Name of the schema, handed to the LLM's structured output mode. */
        public readonly string $schemaName,
        /** @var bool Strict mode switch — when on, output must exactly match the schema. */
        public readonly bool $strict,
        /** @var LlmConfig LLM settings used for the extraction. */
        public readonly LlmConfig $llm,
        /** @var ?string Description of the schema for the LLM (optional). */
        public readonly ?string $schemaDescription = null,
        /**
         * Custom Jinja2 template for the extraction prompt. With `null`, a
         * default template is used.
         *
         * Variables the template can use:
         * - `{{ content }}` — The extracted document text.
         * - `{{ schema }}` — The JSON schema as a formatted string.
         * - `{{ schema_name }}` — The schema name.
         * - `{{ schema_description }}` — The schema description (may be empty).
         *
         * @var ?string
         */
        public readonly ?string $prompt = null,
    ) {
    }
}

/**
 * Thresholds used to decide on OCR fallback and to gate pipeline quality.
 *
 * Each field defaults to the value matching the previously hardcoded behavior,
 * which means `OcrQualityThresholds::default()` keeps the existing semantics
 * exactly.
 */
final class OcrQualityThresholds
{
    public function __construct(
        /** @var int Fewest total non-whitespace characters for text to count as substantive. */
        public readonly int $minTotalNonWhitespace,
        /** @var float Fewest non-whitespace characters per page, on average. */
        public readonly float $minNonWhitespacePerPage,
        /** @var int Fewest characters a word needs to be "meaningful". */
        public readonly int $minMeaningfulWordLen,
        /** @var int Number of meaningful words required before text is accepted. */
        public readonly int $minMeaningfulWords,
        /** @var float Lowest alphanumeric ratio (non-whitespace chars that are alphanumeric). */
        public readonly float $minAlnumRatio,
        /** @var int Fewest Unicode replacement characters (U+FFFD) that trigger OCR fallback. */
        public readonly int $minGarbageChars,
        /** @var float Largest fraction of short (1-2 char) words before text counts as fragmented. */
        public readonly float $maxFragmentedWordRatio,
        /**
         * Critical fragmentation threshold — OCR is triggered no matter how many
         * meaningful words exist. Normal English text has ~20-30% short words.
         * 80%+ is definitive garbage.
         *
         * @var float
         */
        public readonly float $criticalFragmentedWordRatio,
        /** @var float Lowest average word length. Falling below it with enough words indicates garbled extraction. */
        public readonly float $minAvgWordLength,
        /** @var int Word count required before the average word length check applies. */
        public readonly int $minWordsForAvgLengthCheck,
        /** @var float Lowest consecutive word repetition ratio that detects column scrambling. */
        public readonly float $minConsecutiveRepeatRatio,
        /** @var int Word count required before the consecutive repetition check is applied. */
        public readonly int $minWordsForRepeatCheck,
        /** @var int Fewest characters for the "substantive markdown" OCR skip gate. */
        public readonly int $substantiveMinChars,
        /** @var int Fewest characters for the "non-text content" OCR skip gate. */
        public readonly int $nonTextMinChars,
        /** @var float Threshold on the alphanumeric+whitespace ratio for skip decisions. */
        public readonly float $alnumWsRatioThreshold,
        /**
         * Lowest quality score (0.0-1.0) at which a pipeline stage result is accepted.
         * A backend result scoring below it makes the pipeline try the next backend.
         *
         * @var float
         */
        public readonly float $pipelineMinQuality,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): OcrQualityThresholds
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * One backend stage of the OCR pipeline.
 */
final class OcrPipelineStage
{
    public function __construct(
        /** @var string Name of the backend: "tesseract", "paddleocr", "easyocr", or a custom registered name. */
        public readonly string $backend,
        /** @var int Priority weight (higher = tried first). Stages get sorted by priority, descending. */
        public readonly int $priority,
        /** @var ?string Language override for the stage (`null` = use parent OcrConfig.language). */
        public readonly ?string $language = null,
        /** @var ?TesseractConfig Override of the Tesseract-specific config for the stage. */
        public readonly ?TesseractConfig $tesseractConfig = null,
        /** @var ?string PaddleOCR-specific config for the stage. */
        public readonly ?string $paddleOcrConfig = null,
        /** @var ?LlmConfig Override of the VLM config for the pipeline stage. */
        public readonly ?LlmConfig $vlmConfig = null,
        /**
         * Free-form per-call options handed to the backend unchanged.
         *
         * A backend that supports runtime tuning (mode switching, preprocessing
         * flags, inference parameters, etc.) reads this value and deserializes
         * only the keys it cares about. Keys a backend does not know are
         * silently ignored, which lets options for different backends coexist
         * in one config without conflict.
         *
         * Example (custom backend):
         * ```json
         * { "mode": "fast", "enable_layout": true }
         * ```
         *
         * @var ?string
         */
        public readonly ?string $backendOptions = null,
    ) {
    }
}

/**
 * OCR pipeline spanning several backends, with quality-based fallback.
 *
 * Backends run in priority order, highest first. The quality of each
 * backend's output is evaluated once it is produced; a result that meets
 * `quality_thresholds.pipeline_min_quality` is accepted, otherwise the next
 * backend is tried.
 */
final class OcrPipelineConfig
{
    public function __construct(
        /** @var array<OcrPipelineStage> Backends to try, in order. Sorted by priority (descending) at runtime. */
        public readonly array $stages,
        /** @var OcrQualityThresholds Thresholds that decide between accepting a result and trying the next backend. */
        public readonly OcrQualityThresholds $qualityThresholds,
    ) {
    }
}

/**
 * Settings for OCR.
 */
final class OcrConfig
{
    public function __construct(
        /**
         * Master switch for OCR.
         *
         * `enabled: false` is shorthand for `disable_ocr: true` on the parent
         * `ExtractionConfig` (crate::core::config::ExtractionConfig): images
         * then return metadata only, and PDFs use native text extraction with
         * no OCR fallback.
         *
         * The default is `true`. With `false`, every other OCR setting is ignored.
         *
         * @var bool
         */
        public readonly bool $enabled,
        /** @var string Which OCR backend to use: tesseract, easyocr, paddleocr */
        public readonly string $backend,
        /** @var string Code of the language (e.g., "eng", "deu") */
        public readonly string $language,
        /**
         * Turns on automatic page rotation driven by orientation detection.
         *
         * With this on, Tesseract's `DetectOrientationScript()` determines the
         * page orientation (0/90/180/270 degrees) ahead of OCR. A page found
         * to be rotated with high confidence has its image corrected before
         * recognition. This is critical for handling rotated scanned documents.
         *
         * @var bool
         */
        public readonly bool $autoRotate,
        /** @var ?TesseractConfig Configuration specific to Tesseract (optional) */
        public readonly ?TesseractConfig $tesseractConfig = null,
        /** @var ?OutputFormat Format of the OCR results (optional, for format conversion) */
        public readonly ?OutputFormat $outputFormat = null,
        /** @var ?string Configuration specific to PaddleOCR (optional, JSON passthrough) */
        public readonly ?string $paddleOcrConfig = null,
        /**
         * Free-form per-call options handed to the backend unchanged.
         *
         * Custom OCR backends, and built-in backends that support runtime
         * tuning, can read this value and deserialize the keys they care
         * about. Keys a backend does not know are silently ignored.
         *
         * It is the recommended extension point for per-call parameters the
         * typed fields above do not cover (e.g. mode switching, preprocessing
         * flags, inference batch size).
         *
         * **Scope:** while `pipeline` is `null`, the value is propagated to
         * the primary stage of the auto-constructed pipeline. Once `pipeline`
         * is set explicitly, this field has **no effect** — the caller has to
         * set `OcrPipelineStage.backend_options` directly on the relevant
         * stage(s) instead.
         *
         * Example:
         * ```json
         * { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
         * ```
         *
         * @var ?string
         */
        public readonly ?string $backendOptions = null,
        /** @var ?OcrElementConfig Configuration of OCR element extraction */
        public readonly ?OcrElementConfig $elementConfig = null,
        /**
         * Thresholds for deciding when native text falls back to OCR.
         * With `null`, the compiled defaults apply (matching previous hardcoded behavior).
         *
         * @var ?OcrQualityThresholds
         */
        public readonly ?OcrQualityThresholds $qualityThresholds = null,
        /**
         * Configuration of the multi-backend OCR pipeline. Setting it enables
         * weighted fallback across several OCR backends based on output quality.
         * With `null`, the single `backend` field is used (same as today).
         *
         * @var ?OcrPipelineConfig
         */
        public readonly ?OcrPipelineConfig $pipeline = null,
        /**
         * Configuration for VLM (Vision Language Model) OCR.
         *
         * Mandatory when `backend` is `"vlm"`. Page images are sent through
         * liter-llm to a vision model for text extraction.
         *
         * @var ?LlmConfig
         */
        public readonly ?LlmConfig $vlmConfig = null,
        /**
         * Custom Jinja2 template for the VLM OCR prompt.
         *
         * With `null`, the default template is used. Variables available:
         * - `{{ language }}` — The document language code (e.g., "eng", "deu").
         *
         * @var ?string
         */
        public readonly ?string $vlmPrompt = null,
        /**
         * Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
         *
         * Cannot be configured by users through config files — it is injected
         * at runtime from `ExtractionConfig::acceleration` ahead of every
         * `process_image` call.
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Tesseract `traineddata` bytes supplied by the caller, keyed by language code.
         *
         * The main use case is the WASM build, which lacks a filesystem and
         * cannot download tessdata at runtime. Native builds typically rely on
         * `TessdataManager` and ignore this field. If present, the WASM
         * Tesseract backend prefers these bytes to its compile-time-bundled
         * English data.
         *
         * serde skips this field to keep config files small — pass it through
         * the typed API at runtime.
         *
         * @var ?array<string, string>
         */
        public readonly ?array $tessdataBytes = null,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): OcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings for page extraction and tracking.
 *
 * Determines how pages are extracted, tracked, and represented in the
 * extraction results. When the page configuration is `None` (not provided),
 * page tracking is disabled.
 *
 * Tracking of page ranges in chunk metadata (first_page/last_page) is switched
 * on automatically whenever page boundaries are available and chunking is
 * configured.
 */
final class PageConfig
{
    public function __construct(
        /** @var bool Return the pages as a separate array (ExtractionResult.pages) */
        public readonly bool $extractPages,
        /** @var bool Put page markers into the main content string */
        public readonly bool $insertPageMarkers,
        /**
         * Format of the page marker (use the {page_num} placeholder)
         * Default: "\n\n<!-- PAGE {page_num} -->\n\n"
         *
         * @var string
         */
        public readonly string $markerFormat,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): PageConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings specific to PDF extraction.
 */
final class PdfConfig
{
    public function __construct(
        /** @var bool Pull images out of the PDF */
        public readonly bool $extractImages,
        /**
         * Pull tables out of the PDF.
         *
         * With `true` (default), pdf_oxide's native grid detector runs first;
         * if it finds nothing, the heuristic text-layer reconstruction in
         * `pdf::oxide::table::extract_tables_heuristic` is used as a fallback.
         * `false` skips both passes — `tables` is then empty in the result.
         *
         * @var bool
         */
        public readonly bool $extractTables,
        /** @var bool Pull metadata out of the PDF */
        public readonly bool $extractMetadata,
        /**
         * Pull PDF annotations (text notes, highlights, links, stamps).
         * Default: false
         *
         * @var bool
         */
        public readonly bool $extractAnnotations,
        /**
         * Permit single-column pseudo tables in the extraction results.
         *
         * Tables with fewer than 2 columns (layout-guided) or 3 columns
         * (heuristic) are rejected by default. With `true`, the minimum column
         * count is relaxed to 1, so single-column structured data (glossaries,
         * itemized lists) can be emitted as tables. The other quality filters
         * (density, sparsity, prose detection) continue to apply.
         *
         * @var bool
         */
        public readonly bool $allowSingleColumnTables,
        /**
         * Run OCR on inline images extracted from PDF pages and attach the
         * recognized text to each `ExtractedImage.ocr_result`. Tesseract must
         * be available; when `ExtractionConfig.ocr` is `None`, the extractor
         * falls back to `TesseractConfig::default()`. A failure on a single
         * image degrades gracefully (that image is returned without OCR text
         * instead of failing the whole extraction). Default: `false`.
         *
         * @var bool
         */
        public readonly bool $ocrInlineImages,
        /** @var ?array<string> Passwords to try when opening encrypted PDFs */
        public readonly ?array $passwords = null,
        /** @var ?HierarchyConfig Configuration of hierarchy extraction (`null` = hierarchy extraction disabled) */
        public readonly ?HierarchyConfig $hierarchy = null,
        /**
         * Fraction (0.0–1.0) of the page height at the top that is excluded as headers/running heads.
         * Default: 0.06 (6%)
         *
         * @var ?float
         */
        public readonly ?float $topMarginFraction = null,
        /**
         * Fraction (0.0–1.0) of the page height at the bottom that is excluded as footers/page numbers.
         * Default: 0.05 (5%)
         *
         * @var ?float
         */
        public readonly ?float $bottomMarginFraction = null,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): PdfConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings for hierarchy extraction during PDF text structure analysis.
 *
 * Turns on extraction of document hierarchy levels (H1-H6) derived from font
 * size clustering and semantic analysis. With it enabled, hierarchical blocks
 * are included in page content.
 */
final class HierarchyConfig
{
    public function __construct(
        /** @var bool Turns hierarchy extraction on or off */
        public readonly bool $enabled,
        /**
         * How many font size clusters are used for the hierarchy levels (1-7)
         *
         * Default: 6, which yields H1-H6 heading levels with body text.
         * Larger values produce more fine-grained hierarchy levels.
         *
         * @var int
         */
        public readonly int $kClusters,
        /** @var bool Add bounding box information to hierarchy blocks */
        public readonly bool $includeBbox,
        /**
         * Coverage threshold for smart OCR triggering (0.0-1.0)
         *
         * Decides when OCR is triggered, based on text block coverage: OCR
         * runs when text blocks cover less than this fraction of the page.
         * Default: 0.5 (trigger OCR if less than 50% of page has text)
         *
         * @var ?float
         */
        public readonly ?float $ocrCoverageThreshold = null,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): HierarchyConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings for post-processors.
 */
final class PostProcessorConfig
{
    public function __construct(
        /** @var bool Turns post-processors on or off */
        public readonly bool $enabled,
        /** @var ?array<string> Names of the processors allowed to run (`null` = all enabled) */
        public readonly ?array $enabledProcessors = null,
        /** @var ?array<string> Names of the processors to skip (`null` = none disabled) */
        public readonly ?array $disabledProcessors = null,
        /** @var ?array<string> AHashSet computed ahead of time for O(1) enabled processor lookup */
        public readonly ?array $enabledSet = null,
        /** @var ?array<string> AHashSet computed ahead of time for O(1) disabled processor lookup */
        public readonly ?array $disabledSet = null,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): PostProcessorConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings for chunking.
 *
 * Covers how document content is split into text chunks: chunk size, overlap,
 * trimming behavior, and optional embeddings.
 *
 * Note: the upstream Rust documentation recommends constructing on top of the
 * defaults (`..Default::default()`) so that future field additions are allowed for.
 */
final class ChunkingConfig
{
    public function __construct(
        /**
         * Upper bound on the size of a chunk (unit is determined by `sizing`).
         *
         * With `sizing` set to `Characters` (default), it is the max character
         * count. With token-based sizing, it is the max token count.
         *
         * Default: 1000
         *
         * @var int
         */
        public readonly int $maxCharacters,
        /**
         * How much consecutive chunks overlap (unit is determined by `sizing`).
         *
         * Default: 200
         *
         * @var int
         */
        public readonly int $overlap,
        /**
         * Trim whitespace at chunk boundaries or not.
         *
         * Default: true
         *
         * @var bool
         */
        public readonly bool $trim,
        /**
         * Kind of chunker to use (Text or Markdown).
         *
         * Default: Text
         *
         * @var ChunkerType
         */
        public readonly ChunkerType $chunkerType,
        /**
         * The way chunk size is measured.
         *
         * Default: `Characters` (Unicode character count).
         * Token-based sizing requires the `chunking-tiktoken` or `chunking-tokenizers` features.
         *
         * @var ChunkSizing
         */
        public readonly ChunkSizing $sizing,
        /**
         * If `true` while `chunker_type` is `Markdown`, the heading hierarchy
         * path (e.g. `"# Title > ## Section\n\n"`) is prepended to the content
         * string of every chunk.
         *
         * Helpful for RAG pipelines in which each chunk needs self-contained
         * context about where it sits in the document structure.
         *
         * Default: `false`
         *
         * @var bool
         */
        public readonly bool $prependHeadingContext,
        /** @var ?EmbeddingConfig Embedding configuration for chunk embeddings (optional). */
        public readonly ?EmbeddingConfig $embedding = null,
        /** @var ?string Name of a preset configuration (overrides individual settings if provided). */
        public readonly ?string $preset = null,
        /**
         * Cosine similarity threshold for detecting semantic topic boundaries (optional).
         *
         * Used only when `chunker_type` is `Semantic` and an `EmbeddingConfig`
         * is provided. You almost never need to set this. If omitted, it
         * defaults to `0.75`, which works well for most documents. Lower
         * values detect more topic boundaries (more, smaller chunks); higher
         * values detect fewer. Range: `0.0..=1.0`.
         *
         * @var ?float
         */
        public readonly ?float $topicThreshold = null,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): ChunkingConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings for generating embeddings of text chunks.
 *
 * Embeddings are produced with ONNX models through the vendored embedding
 * engine. The `embeddings` feature must be enabled.
 */
final class EmbeddingConfig
{
    public function __construct(
        /** @var EmbeddingModelType Embedding model to use (defaults to "balanced" preset if not specified) */
        public readonly EmbeddingModelType $model,
        /** @var bool Normalize the embedding vectors or not (recommended for cosine similarity) */
        public readonly bool $normalize,
        /** @var int Size of a batch during embedding generation */
        public readonly int $batchSize,
        /** @var bool Display progress while the model downloads */
        public readonly bool $showDownloadProgress,
        /**
         * Custom directory in which model files are cached
         *
         * Falls back to `~/.cache/kreuzberg/embeddings/` when not specified.
         * Gives full control over where models are downloaded to.
         *
         * @var ?string
         */
        public readonly ?string $cacheDir = null,
        /**
         * Hardware acceleration for the embedding ONNX model.
         *
         * When provided, it selects the execution provider (CPU, CUDA, CoreML,
         * TensorRT) used for inference. The default is `null`, meaning the
         * provider is auto-selected per platform.
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Longest wall-clock duration (in seconds) allowed for one `embed()`
         * call when `EmbeddingModelType::Plugin` is in use.
         *
         * Only the in-process plugin path is affected — it guards against hung
         * host-language backends (e.g. a Python callback deadlocked on the GIL,
         * a model stuck on CUDA OOM retries, etc.). When the timeout elapses,
         * the dispatcher returns `Plugin` (presumably the plugin error
         * variant — TODO confirm) rather than blocking forever.
         *
         * `null` turns the timeout off. The default (60 seconds) is
         * conservative for common in-process inference; raise it for large
         * batches on slow hardware.
         *
         * @var ?int
         */
        public readonly ?int $maxEmbedDurationSecs = null,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): EmbeddingConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Settings for the tree-sitter language pack integration.
 *
 * Governs how grammars are downloaded and which code analysis options apply.
 *
 * # Example (TOML)
 *
 * ```toml
 * [tree_sitter]
 * languages = ["python", "rust"]
 * groups = ["web"]
 *
 * [tree_sitter.process]
 * structure = true
 * comments = true
 * docstrings = true
 * ```
 */
final class TreeSitterConfig
{
    public function __construct(
        /**
         * Turns code intelligence processing on or off (default: true).
         *
         * With `false`, tree-sitter analysis is skipped entirely, even when
         * the config section is present.
         *
         * @var bool
         */
        public readonly bool $enabled,
        /** @var TreeSitterProcessConfig Options that steer code analysis processing. */
        public readonly TreeSitterProcessConfig $process,
        /**
         * Custom directory in which downloaded grammars are cached.
         *
         * With `null`, the default is used: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
         *
         * @var ?string
         */
        public readonly ?string $cacheDir = null,
        /** @var ?array<string> Languages that are pre-downloaded on init (e.g., `["python", "rust"]`). */
        public readonly ?array $languages = null,
        /** @var ?array<string> Language groups that are pre-downloaded (e.g., `["web", "systems", "scripting"]`). */
        public readonly ?array $groups = null,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): TreeSitterConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Options that steer tree-sitter code analysis.
 *
 * Selects the analysis features that are active while code files are extracted.
 */
final class TreeSitterProcessConfig
{
    public function __construct(
        /** @var bool Collect structural items (functions, classes, structs, etc.). Default: true. */
        public readonly bool $structure,
        /** @var bool Collect import statements. Default: true. */
        public readonly bool $imports,
        /** @var bool Collect export statements. Default: true. */
        public readonly bool $exports,
        /** @var bool Collect comments. Default: false. */
        public readonly bool $comments,
        /** @var bool Collect docstrings. Default: false. */
        public readonly bool $docstrings,
        /** @var bool Collect symbol definitions. Default: false. */
        public readonly bool $symbols,
        /** @var bool Add parse diagnostics to the output. Default: false. */
        public readonly bool $diagnostics,
        /** @var CodeContentMode Mode in which content is rendered for code extraction. */
        public readonly CodeContentMode $contentMode,
        /** @var ?int Upper bound on chunk size, in bytes. `null` disables chunking. */
        public readonly ?int $chunkMaxSize = null,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): TreeSitterProcessConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * One entry in the list of supported document formats.
 *
 * Pairs a file extension with the MIME type that Kreuzberg can process for it.
 */
final class SupportedFormat
{
    public function __construct(
        /** @var string Extension of the file, with no leading dot, e.g., "pdf", "docx" */
        public readonly string $extension,
        /** @var string The MIME type as a string, e.g., "application/pdf" */
        public readonly string $mimeType,
    ) {
    }
}

/**
 * Settings for the API server.
 *
 * Collects every configuration option of the Kreuzberg API server: host/port
 * settings, CORS configuration, and upload limits.
 *
 * # Defaults
 *
 * - `host`: "127.0.0.1" (localhost only)
 * - `port`: 8000
 * - `cors_origins`: empty vector (allows all origins)
 * - `max_request_body_bytes`: 104_857_600 (100 MB)
 * - `max_multipart_field_bytes`: 104_857_600 (100 MB)
 */
final class ServerConfig
{
    public function __construct(
        /** @var string Host address of the server (e.g., "127.0.0.1", "0.0.0.0") */
        public readonly string $host,
        /** @var int Port number of the server */
        public readonly int $port,
        /**
         * Origins allowed by CORS. An empty vector allows all origins.
         *
         * With an empty vector, the server accepts requests from any origin.
         * When specific origins are listed (e.g., `"https://example.com"`),
         * only those origins are allowed.
         *
         * @var array<string>
         */
        public readonly array $corsOrigins,
        /** @var int Largest request body accepted, in bytes (default: 100 MB) */
        public readonly int $maxRequestBodyBytes,
        /** @var int Largest multipart field accepted, in bytes (default: 100 MB) */
        public readonly int $maxMultipartFieldBytes,
    ) {
    }

    /**
     * Placeholder for the factory implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public static function default(): ServerConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Placeholder; presumably yields the address the server listens on — TODO confirm against the native implementation.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public function listenAddr(): string
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Placeholder; presumably reports whether CORS accepts every origin — TODO confirm against the native implementation.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public function corsAllowsAll(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Placeholder; presumably checks `$origin` against the CORS origins — TODO confirm against the native implementation.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public function isOriginAllowed(string $origin): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Placeholder; presumably the request body limit expressed in MB — TODO confirm against the native implementation.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public function maxRequestBodyMb(): int
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Placeholder; presumably the multipart field limit expressed in MB — TODO confirm against the native implementation.
     *
     * @throws \RuntimeException Always in this stub.
     */
    public function maxMultipartFieldMb(): int
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Stub for the native `StructuredDataResult` value object.
 *
 * All four fields are read-only and set through the constructor.
 */
final class StructuredDataResult
{
    public function __construct(
        public readonly string $content,
        public readonly string $format,
        /** @var array<string, string> */
        public readonly array $metadata,
        /** @var array<string> */
        public readonly array $textFields,
    ) {
    }
}

/**
 * DOCX application properties read from docProps/app.xml.
 *
 * Holds Word-specific document statistics and metadata. Every field is
 * optional and defaults to `null`.
 */
final class DocxAppProperties
{
    public function __construct(
        /** Name of the producing application, e.g. "Microsoft Office Word". */
        public readonly ?string $application = null,
        /** Version of the producing application. */
        public readonly ?string $appVersion = null,
        /** Filename of the template. */
        public readonly ?string $template = null,
        /** Total editing time, in minutes. */
        public readonly ?int $totalTime = null,
        /** Page count. */
        public readonly ?int $pages = null,
        /** Word count. */
        public readonly ?int $words = null,
        /** Character count, excluding spaces. */
        public readonly ?int $characters = null,
        /** Character count, including spaces. */
        public readonly ?int $charactersWithSpaces = null,
        /** Line count. */
        public readonly ?int $lines = null,
        /** Paragraph count. */
        public readonly ?int $paragraphs = null,
        /** Company name. */
        public readonly ?string $company = null,
        /** Document security level. */
        public readonly ?int $docSecurity = null,
        /** Scale-crop flag. */
        public readonly ?bool $scaleCrop = null,
        /** Links-up-to-date flag. */
        public readonly ?bool $linksUpToDate = null,
        /** Shared-document flag. */
        public readonly ?bool $sharedDoc = null,
        /** Hyperlinks-changed flag. */
        public readonly ?bool $hyperlinksChanged = null,
    ) {
    }
}

/**
 * XLSX application properties read from docProps/app.xml.
 *
 * Holds Excel-specific document metadata. Only the worksheet names are
 * required; the remaining fields default to `null`.
 */
final class XlsxAppProperties
{
    public function __construct(
        /** @var array<string> Names of the worksheets. */
        public readonly array $worksheetNames,
        /** Name of the producing application, e.g. "Microsoft Excel". */
        public readonly ?string $application = null,
        /** Version of the producing application. */
        public readonly ?string $appVersion = null,
        /** Document security level. */
        public readonly ?int $docSecurity = null,
        /** Scale-crop flag. */
        public readonly ?bool $scaleCrop = null,
        /** Links-up-to-date flag. */
        public readonly ?bool $linksUpToDate = null,
        /** Shared-document flag. */
        public readonly ?bool $sharedDoc = null,
        /** Hyperlinks-changed flag. */
        public readonly ?bool $hyperlinksChanged = null,
        /** Company name. */
        public readonly ?string $company = null,
    ) {
    }
}

/**
 * PPTX application properties read from docProps/app.xml.
 *
 * Holds PowerPoint-specific document metadata. Only the slide titles are
 * required; the remaining fields default to `null`.
 */
final class PptxAppProperties
{
    public function __construct(
        /** @var array<string> Titles of the slides. */
        public readonly array $slideTitles,
        /** Name of the producing application, e.g. "Microsoft Office PowerPoint". */
        public readonly ?string $application = null,
        /** Version of the producing application. */
        public readonly ?string $appVersion = null,
        /** Total editing time, in minutes. */
        public readonly ?int $totalTime = null,
        /** Company name. */
        public readonly ?string $company = null,
        /** Document security level. */
        public readonly ?int $docSecurity = null,
        /** Scale-crop flag. */
        public readonly ?bool $scaleCrop = null,
        /** Links-up-to-date flag. */
        public readonly ?bool $linksUpToDate = null,
        /** Shared-document flag. */
        public readonly ?bool $sharedDoc = null,
        /** Hyperlinks-changed flag. */
        public readonly ?bool $hyperlinksChanged = null,
        /** Slide count. */
        public readonly ?int $slides = null,
        /** Notes count. */
        public readonly ?int $notes = null,
        /** Hidden-slide count. */
        public readonly ?int $hiddenSlides = null,
        /** Multimedia-clip count. */
        public readonly ?int $multimediaClips = null,
        /** Presentation format, e.g. "Widescreen" or "Standard". */
        public readonly ?string $presentationFormat = null,
    ) {
    }
}

/**
 * Dublin Core metadata read from docProps/core.xml.
 *
 * Covers the standard Dublin Core fields plus Office-specific extensions.
 * Every field is optional and defaults to `null`.
 */
final class CoreProperties
{
    public function __construct(
        /** Title of the document. */
        public readonly ?string $title = null,
        /** Subject or topic of the document. */
        public readonly ?string $subject = null,
        /** Creator / author of the document. */
        public readonly ?string $creator = null,
        /** Keywords or tags. */
        public readonly ?string $keywords = null,
        /** Description or abstract. */
        public readonly ?string $description = null,
        /** User who last modified the document. */
        public readonly ?string $lastModifiedBy = null,
        /** Revision number. */
        public readonly ?string $revision = null,
        /** Creation timestamp (ISO 8601). */
        public readonly ?string $created = null,
        /** Last-modification timestamp (ISO 8601). */
        public readonly ?string $modified = null,
        /** Category of the document. */
        public readonly ?string $category = null,
        /** Content status (Draft, Final, etc.). */
        public readonly ?string $contentStatus = null,
        /** Language of the document. */
        public readonly ?string $language = null,
        /** Unique identifier. */
        public readonly ?string $identifier = null,
        /** Version of the document. */
        public readonly ?string $version = null,
        /** Last-print timestamp (ISO 8601). */
        public readonly ?string $lastPrinted = null,
    ) {
    }
}

/**
 * Security limits applied across extractors.
 *
 * The limits are deliberately conservative so that DoS attacks are prevented
 * while legitimate documents are still supported. Values in parentheses are
 * the documented limits; the constructor itself requires every argument.
 */
final class SecurityLimits
{
    public function __construct(
        /** Largest allowed uncompressed archive size (500 MB). */
        public readonly int $maxArchiveSize,
        /** Compression ratio above which an archive is flagged as a potential bomb (100:1). */
        public readonly int $maxCompressionRatio,
        /** Largest allowed number of files in an archive (10,000). */
        public readonly int $maxFilesInArchive,
        /** Deepest allowed nesting for structures (100). */
        public readonly int $maxNestingDepth,
        /**
         * Longest allowed single XML entity / attribute / token (1 MiB).
         *
         * This caps each token individually, NOT the total: billion-laughs
         * style attacks in which one entity expands to hundreds of MB are
         * caught here, whereas ordinary long text (a paragraph, a CDATA block)
         * is bounded by `$maxContentSize` instead.
         */
        public readonly int $maxEntityLength,
        /** Largest allowed string growth per document (100 MB). */
        public readonly int $maxContentSize,
        /** Largest allowed number of iterations per operation. */
        public readonly int $maxIterations,
        /** Deepest allowed XML nesting (100 levels). */
        public readonly int $maxXmlDepth,
        /** Largest allowed number of cells per table (100,000). */
        public readonly int $maxTableCells,
    ) {
    }

    /** Stub for the native `default()` factory. */
    public static function default(): SecurityLimits
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Stub for the native `TokenReductionConfig` value object.
 *
 * The first eight arguments are required; `$languageHint`,
 * `$customStopwords` and `$targetReduction` default to `null`.
 */
final class TokenReductionConfig
{
    public function __construct(
        public readonly ReductionLevel $level,
        public readonly bool $preserveMarkdown,
        public readonly bool $preserveCode,
        public readonly float $semanticThreshold,
        public readonly bool $enableParallel,
        public readonly bool $useSimd,
        /** @var array<string> */
        public readonly array $preservePatterns,
        public readonly bool $enableSemanticClustering,
        public readonly ?string $languageHint = null,
        /** @var ?array<string, array<string>> */
        public readonly ?array $customStopwords = null,
        public readonly ?float $targetReduction = null,
    ) {
    }

    /** Stub for the native `default()` factory. */
    public static function default(): TokenReductionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * An annotation extracted from a page of a PDF document.
 */
final class PdfAnnotation
{
    public function __construct(
        /** Type of the annotation. */
        public readonly PdfAnnotationType $annotationType,
        /** 1-indexed number of the page on which the annotation appears. */
        public readonly int $pageNumber,
        /** Text content of the annotation, e.g. comment text or a link URL. */
        public readonly ?string $content = null,
        /** Bounding box of the annotation on the page. */
        public readonly ?BoundingBox $boundingBox = null,
    ) {
    }
}

/**
 * Full Djot document structure with its semantics preserved.
 *
 * Captures the richness of Djot markup, including:
 * - block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - attributes (classes, IDs, key-value pairs)
 * - links, images and footnotes
 * - math expressions (inline and display)
 * - tables with full structure
 *
 * Available when the `djot` feature is enabled.
 */
final class DjotContent
{
    public function __construct(
        /** Plain-text representation, kept for backwards compatibility. */
        public readonly string $plainText,
        /** @var array<FormattedBlock> Structured block-level content. */
        public readonly array $blocks,
        /** Metadata taken from the YAML frontmatter. */
        public readonly Metadata $metadata,
        /** @var array<Table> Tables extracted as structured data. */
        public readonly array $tables,
        /** @var array<DjotImage> Images extracted together with their metadata. */
        public readonly array $images,
        /** @var array<DjotLink> Links extracted together with their URLs. */
        public readonly array $links,
        /** @var array<Footnote> Footnote definitions. */
        public readonly array $footnotes,
        /**
         * Attributes mapped by element identifier (if present).
         *
         * NOTE(review): described as a mapping but typed as a list of strings
         * here — confirm against the native type.
         *
         * @var array<string>
         */
        public readonly array $attributes,
    ) {
    }
}

/**
 * A block-level element of a Djot document.
 *
 * Models structural elements such as headings, paragraphs, lists and code
 * blocks.
 */
final class FormattedBlock
{
    public function __construct(
        /** Kind of block element. */
        public readonly BlockType $blockType,
        /** @var array<InlineElement> Inline content contained in the block. */
        public readonly array $inlineContent,
        /** @var array<FormattedBlock> Nested blocks, for containers (blockquotes, list items, divs). */
        public readonly array $children,
        /** Heading level (1-6) for headings, or nesting level for lists. */
        public readonly ?int $level = null,
        /** Element attributes (classes, IDs, key-value pairs). */
        public readonly ?string $attributes = null,
        /** Language identifier, for code blocks. */
        public readonly ?string $language = null,
        /** Raw code content, for code blocks. */
        public readonly ?string $code = null,
    ) {
    }
}

/**
 * An inline element inside a block.
 *
 * Models text with formatting, links, images, etc.
 */
final class InlineElement
{
    public function __construct(
        /** Kind of inline element. */
        public readonly InlineType $elementType,
        /** Text content. */
        public readonly string $content,
        /** Element attributes. */
        public readonly ?string $attributes = null,
        /** @var ?array<string, string> Extra metadata, e.g. href for links or src/alt for images. */
        public readonly ?array $metadata = null,
    ) {
    }
}

/**
 * An image element in Djot.
 */
final class DjotImage
{
    public function __construct(
        /** Source URL or path of the image. */
        public readonly string $src,
        /** Alternative text. */
        public readonly string $alt,
        /** Title, when one is given. */
        public readonly ?string $title = null,
        /** Element attributes. */
        public readonly ?string $attributes = null,
    ) {
    }
}

/**
 * A link element in Djot.
 */
final class DjotLink
{
    public function __construct(
        /** URL the link points to. */
        public readonly string $url,
        /** Text content of the link. */
        public readonly string $text,
        /** Title, when one is given. */
        public readonly ?string $title = null,
        /** Element attributes. */
        public readonly ?string $attributes = null,
    ) {
    }
}

/**
 * A footnote in Djot.
 */
final class Footnote
{
    public function __construct(
        /** Label of the footnote. */
        public readonly string $label,
        /** @var array<FormattedBlock> Blocks that make up the footnote content. */
        public readonly array $content,
    ) {
    }
}

/**
 * Top-level structured document representation.
 *
 * A flat array of nodes with index-based parent/child references forming a
 * tree. Root-level nodes have a `null` parent.
 *
 * NOTE(review): the native documentation also refers to `body_roots()`,
 * `furniture_roots()` and `validate()` helpers; none of them is declared in
 * this stub — confirm whether the extension exposes them to PHP.
 */
final class DocumentStructure
{
    public function __construct(
        /** @var array<DocumentNode> All nodes in document/reading order. */
        public readonly array $nodes,
        /**
         * Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
         *
         * Populated during derivation from the internal document representation.
         * Empty when no relationships are detected.
         *
         * @var array<DocumentRelationship>
         */
        public readonly array $relationships,
        /**
         * Sorted, deduplicated list of node type names present in this document.
         *
         * Each value is the snake_case `node_type` tag of the corresponding
         * `NodeContent` variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
         *
         * Computed from `$nodes` via `finalizeNodeTypes()`. Empty until that
         * method is called (internal construction paths call it at the end of
         * derivation).
         *
         * @var array<string>
         */
        public readonly array $nodeTypes,
        /**
         * Origin format identifier (e.g. "docx", "pptx", "html", "pdf").
         *
         * Allows renderers to apply format-aware heuristics when converting
         * the document tree to output formats.
         */
        public readonly ?string $sourceFormat = null,
    ) {
    }

    /**
     * Stub for the native method that computes `$nodeTypes` from `$nodes`.
     *
     * Throws like every other stub method instead of silently doing nothing,
     * so a call that reaches this placeholder cannot be mistaken for a
     * successful finalisation.
     *
     * @throws \RuntimeException always; the real implementation lives in the native extension
     */
    public function finalizeNodeTypes(): void
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Stub; natively reports whether the structure is empty. */
    public function isEmpty(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Stub for the native `default()` factory. */
    public static function default(): DocumentStructure
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * A resolved relationship linking two nodes of the document tree.
 */
final class DocumentRelationship
{
    public function __construct(
        /** Index of the source node (the one doing the referencing). */
        public readonly int $source,
        /** Index of the target node (the one being referenced). */
        public readonly int $target,
        /** Semantic kind of the relationship. */
        public readonly RelationshipKind $kind,
    ) {
    }
}

/**
 * One node of the document tree.
 *
 * A node carries a deterministic `$id`, typed `$content`, optional
 * `$parent`/`$children` links that form the tree, and metadata such as page
 * number, bounding box and content layer.
 */
final class DocumentNode
{
    public function __construct(
        /** Deterministic identifier (hash of content + position). */
        public readonly string $id,
        /** Node content: a tagged enum holding type-specific data only. */
        public readonly NodeContent $content,
        /** @var array<int> Indices of the child nodes, in reading order. */
        public readonly array $children,
        /** Content layer classification. */
        public readonly ContentLayer $contentLayer,
        /**
         * Inline annotations (formatting, links) on the text content of this node.
         *
         * Only meaningful for text-carrying nodes; empty for containers.
         *
         * @var array<TextAnnotation>
         */
        public readonly array $annotations,
        /** Index of the parent node (`null` = root-level node). */
        public readonly ?int $parent = null,
        /** 1-indexed page number on which this node starts. */
        public readonly ?int $page = null,
        /** Page number on which this node ends (for multi-page tables/sections). */
        public readonly ?int $pageEnd = null,
        /** Bounding box in document coordinates. */
        public readonly ?BoundingBox $bbox = null,
        /**
         * Format-specific key-value attributes.
         *
         * An extensible bag for miscellaneous data that has no dedicated typed
         * field: CSS classes, LaTeX environment names, Excel cell formulas,
         * slide layout names, etc.
         *
         * @var ?array<string, string>
         */
        public readonly ?array $attributes = null,
    ) {
    }
}

/**
 * Structured table grid carrying cell-level metadata.
 *
 * Records the row/column dimensions together with a flat list of cells that
 * hold their own position information.
 */
final class TableGrid
{
    public function __construct(
        /** Row count of the table. */
        public readonly int $rows,
        /** Column count of the table. */
        public readonly int $cols,
        /** @var array<GridCell> Every cell, in row-major order. */
        public readonly array $cells,
    ) {
    }
}

/**
 * A single grid cell with its position and span metadata.
 */
final class GridCell
{
    public function __construct(
        /** Text content of the cell. */
        public readonly string $content,
        /** Zero-indexed row position. */
        public readonly int $row,
        /** Zero-indexed column position. */
        public readonly int $col,
        /** How many rows the cell spans. */
        public readonly int $rowSpan,
        /** How many columns the cell spans. */
        public readonly int $colSpan,
        /** Whether the cell is a header cell. */
        public readonly bool $isHeader,
        /** Bounding box of the cell, when available. */
        public readonly ?BoundingBox $bbox = null,
    ) {
    }
}

/**
 * Inline text annotation: formatting and links expressed as byte ranges.
 *
 * The offsets refer to bytes of the owning node's text content, which allows
 * formatted regions to be identified precisely.
 */
final class TextAnnotation
{
    public function __construct(
        /** Inclusive start byte offset into the node's text content. */
        public readonly int $start,
        /** Exclusive end byte offset into the node's text content. */
        public readonly int $end,
        /** Type of the annotation. */
        public readonly AnnotationKind $kind,
    ) {
    }
}

/**
 * General extraction result used by the core extraction API.
 *
 * This is the main result type returned by all extraction functions.
 */
final class ExtractionResult
{
    public function __construct(
        /** @var string */
        public readonly string $content,
        /** @var string */
        public readonly string $mimeType,
        /** @var Metadata */
        public readonly Metadata $metadata,
        /** @var array<Table> */
        public readonly array $tables,
        /**
         * Non-fatal warnings collected during processing pipeline stages.
         *
         * Captures errors from optional pipeline features (embedding, chunking,
         * language detection, output formatting) that don't prevent extraction
         * but may indicate degraded results.
         * Previously stored as individual keys in `metadata.additional`.
         *
         * @var array<ProcessingWarning>
         */
        public readonly array $processingWarnings,
        /**
         * Extraction strategy used to produce the returned text.
         *
         * Populated when the extractor can reliably distinguish native text extraction,
         * OCR-only extraction, or mixed native/OCR output.
         *
         * @var ?ExtractionMethod
         */
        public readonly ?ExtractionMethod $extractionMethod = null,
        /** @var ?array<string> */
        public readonly ?array $detectedLanguages = null,
        /**
         * Text chunks when chunking is enabled.
         *
         * When chunking configuration is provided, the content is split into
         * overlapping chunks for efficient processing. Each chunk contains the text,
         * optional embeddings (if enabled), and metadata about its position.
         *
         * @var ?array<Chunk>
         */
        public readonly ?array $chunks = null,
        /**
         * Extracted images from the document.
         *
         * When image extraction is enabled via `ImageExtractionConfig`, this field
         * contains all images found in the document with their raw data and metadata.
         * Each image may optionally contain a nested `ocr_result` if OCR was performed.
         *
         * @var ?array<ExtractedImage>
         */
        public readonly ?array $images = null,
        /**
         * Per-page content when page extraction is enabled.
         *
         * When page extraction is configured, the document is split into per-page content
         * with tables and images mapped to their respective pages.
         *
         * @var ?array<PageContent>
         */
        public readonly ?array $pages = null,
        /**
         * Semantic elements when element-based result format is enabled.
         *
         * When result_format is set to ElementBased, this field contains semantic
         * elements with type classification, unique identifiers, and metadata for
         * Unstructured-compatible element-based processing.
         *
         * @var ?array<Element>
         */
        public readonly ?array $elements = null,
        /**
         * Rich Djot content structure (when extracting Djot documents).
         *
         * When extracting Djot documents with structured extraction enabled,
         * this field contains the full semantic structure including:
         * - Block-level elements with nesting
         * - Inline formatting with attributes
         * - Links, images, footnotes
         * - Math expressions
         * - Complete attribute information
         *
         * The `content` field still contains plain text for backward compatibility.
         *
         * Always `None` for non-Djot documents.
         *
         * @var ?DjotContent
         */
        public readonly ?DjotContent $djotContent = null,
        /**
         * OCR elements with full spatial and confidence metadata.
         *
         * When OCR is performed with element extraction enabled, this field contains
         * the structured representation of detected text including:
         * - Bounding geometry (rectangles or quadrilaterals)
         * - Confidence scores (detection and recognition)
         * - Rotation information
         * - Hierarchical relationships (Tesseract only)
         *
         * This field preserves all metadata that would otherwise be lost when
         * converting to plain text or markdown output formats.
         *
         * Only populated when `OcrElementConfig.include_elements` is true.
         *
         * @var ?array<OcrElement>
         */
        public readonly ?array $ocrElements = null,
        /**
         * Structured document tree (when document structure extraction is enabled).
         *
         * When `include_document_structure` is true in `ExtractionConfig`, this field
         * contains the full hierarchical representation of the document including:
         * - Heading-driven section nesting
         * - Table grids with cell-level metadata
         * - Content layer classification (body, header, footer, footnote)
         * - Inline text annotations (formatting, links)
         * - Bounding boxes and page numbers
         *
         * Independent of `result_format` — can be combined with Unified or ElementBased.
         *
         * @var ?DocumentStructure
         */
        public readonly ?DocumentStructure $document = null,
        /**
         * Extracted keywords when keyword extraction is enabled.
         *
         * When keyword extraction (RAKE or YAKE) is configured, this field contains
         * the extracted keywords with scores, algorithm info, and position data.
         * Previously stored in `metadata.additional["keywords"]`.
         *
         * @var ?array<Keyword>
         */
        public readonly ?array $extractedKeywords = null,
        /**
         * Document quality score from quality analysis.
         *
         * A value between 0.0 and 1.0 indicating the overall text quality.
         * Previously stored in `metadata.additional["quality_score"]`.
         *
         * @var ?float
         */
        public readonly ?float $qualityScore = null,
        /**
         * PDF annotations extracted from the document.
         *
         * When annotation extraction is enabled via `PdfConfig::extract_annotations`,
         * this field contains text notes, highlights, links, stamps, and other
         * annotations found in PDF documents.
         *
         * @var ?array<PdfAnnotation>
         */
        public readonly ?array $annotations = null,
        /**
         * Nested extraction results from archive contents.
         *
         * When extracting archives, each processable file inside produces its own
         * full extraction result. Set to `None` for non-archive formats.
         * Use `max_archive_depth` in config to control recursion depth.
         *
         * @var ?array<ArchiveEntry>
         */
        public readonly ?array $children = null,
        /**
         * URIs/links discovered during document extraction.
         *
         * Contains hyperlinks, image references, citations, email addresses, and
         * other URI-like references found in the document. Always extracted when
         * present in the source document.
         *
         * @var ?array<ExtractedUri>
         */
        public readonly ?array $uris = null,
        /**
         * Tracked changes embedded in the source document.
         *
         * Populated by per-format extractors that understand change-tracking
         * metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
         * …). Every extractor defaults to `None` until its format-specific
         * implementation is added. Extractors that do populate this field follow
         * the "accepted-changes" convention: inserted text is present in
         * `content`, deleted text is absent — the revision list is the separate
         * audit trail.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
        /**
         * Structured extraction output from LLM-based JSON schema extraction.
         *
         * When `structured_extraction` is configured in `ExtractionConfig`, the
         * extracted document content is sent to a VLM with the provided JSON schema.
         * The response is parsed and stored here as a JSON value matching the schema.
         *
         * @var ?string
         */
        public readonly ?string $structuredOutput = null,
        /**
         * Code intelligence results from tree-sitter analysis.
         *
         * Populated when extracting source code files with the `tree-sitter` feature.
         * Contains metrics, structural analysis, imports/exports, comments,
         * docstrings, symbols, diagnostics, and optionally chunked code segments.
         *
         * Stored as an opaque JSON value so that all language bindings (Go, Java,
         * C#, …) can deserialize it as a raw JSON object rather than a typed struct.
         * The underlying type is `tree_sitter_language_pack::ProcessResult`.
         *
         * @var ?string
         */
        public readonly ?string $codeIntelligence = null,
        /**
         * LLM token usage and cost data for all LLM calls made during this extraction.
         *
         * Contains one entry per LLM call. Multiple entries are produced when
         * VLM OCR, structured extraction, or LLM embeddings run during
         * the same extraction.
         *
         * `None` when no LLM was used.
         *
         * @var ?array<LlmUsage>
         */
        public readonly ?array $llmUsage = null,
        /**
         * Pre-rendered content in the requested output format.
         *
         * Populated during `derive_extraction_result` before tree derivation consumes
         * element data. `apply_output_format` swaps this into `content` at the end
         * of the pipeline, after post-processors have operated on plain text.
         *
         * @var ?string
         */
        public readonly ?string $formattedContent = null,
        /**
         * Structured hOCR document for the OCR+layout pipeline.
         *
         * When tesseract produces hOCR output, the parsed `InternalDocument` carries
         * paragraph structure with bounding boxes and confidence scores. The layout
         * classification step enriches these elements before final rendering.
         *
         * @var ?string
         */
        public readonly ?string $ocrInternalDocument = null
    ) { }
    /**
     * Stub for the native factory that accepts an OCR result and returns an
     * `ExtractionResult`.
     *
     * This declaration exists for static analysis only: the body always throws,
     * because the real implementation is supplied by the compiled extension.
     * NOTE(review): presumably converts `$ocr` into a full extraction result —
     * confirm the field mapping against the native implementation.
     *
     * @throws \RuntimeException Always, when the stub itself is executed.
     */
    public static function fromOcr(OcrExtractionResult $ocr): ExtractionResult
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * One file pulled out of an archive.
 *
 * With recursive extraction enabled, every processable file found inside an
 * archive (ZIP, TAR, 7Z, GZIP) yields its own complete `ExtractionResult`;
 * this class pairs that result with the file's path and MIME type.
 */
final class ArchiveEntry
{
    public function __construct(
        /** @var string File path relative to the archive (e.g. "folder/document.pdf"). */
        public readonly string $path,
        /** @var string MIME type detected for the file. */
        public readonly string $mimeType,
        /** @var ExtractionResult Complete extraction result produced for this file. */
        public readonly ExtractionResult $result,
    ) {}
}

/**
 * Non-fatal warning raised by a stage of the processing pipeline.
 *
 * Records a failure in an optional feature: extraction still completes,
 * but the results may be degraded.
 */
final class ProcessingWarning
{
    public function __construct(
        /**
         * Name of the pipeline stage or feature that emitted the warning,
         * e.g. "embedding", "chunking", "language_detection", "output_format".
         *
         * @var string
         */
        public readonly string $source,
        /** @var string Human-readable explanation of the problem. */
        public readonly string $message,
    ) {}
}

/**
 * Token counts and cost for one LLM call performed during extraction.
 *
 * Entries are produced when VLM OCR, structured extraction, or LLM-based
 * embeddings run. A single extraction can yield several entries when more
 * than one LLM call happens (e.g. VLM OCR followed by structured extraction).
 */
final class LlmUsage
{
    public function __construct(
        /** @var string Identifier of the LLM model (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514"). */
        public readonly string $model,
        /**
         * Pipeline stage that issued the call,
         * e.g. "vlm_ocr", "structured_extraction", "embeddings".
         *
         * @var string
         */
        public readonly string $source,
        /** @var ?int Input (prompt) tokens consumed by the call. */
        public readonly ?int $inputTokens = null,
        /** @var ?int Output (completion) tokens generated by the call. */
        public readonly ?int $outputTokens = null,
        /** @var ?int Sum of input and output tokens. */
        public readonly ?int $totalTokens = null,
        /** @var ?float Estimated cost in USD, derived from the provider's published pricing. */
        public readonly ?float $estimatedCost = null,
        /** @var ?string Reason generation stopped (e.g. "stop", "length", "content_filter"). */
        public readonly ?string $finishReason = null,
    ) {}
}

/**
 * A piece of document text, with metadata and an optional embedding.
 *
 * Chunks exist only when chunking is enabled in `ExtractionConfig`. Each one
 * carries its text, positional metadata, and — when embedding generation is
 * configured — an embedding vector.
 */
final class Chunk
{
    public function __construct(
        /** @var string Text held by this chunk. */
        public readonly string $content,
        /**
         * Semantic/structural category of the chunk.
         *
         * A heuristic classifier assigns it from content patterns and heading
         * context; when no rule matches, the value is `ChunkType::Unknown`.
         *
         * @var ChunkType
         */
        public readonly ChunkType $chunkType,
        /** @var ChunkMetadata Position and property metadata for this chunk. */
        public readonly ChunkMetadata $metadata,
        /**
         * Embedding vector for the chunk, if one was generated.
         *
         * Set only when an `EmbeddingConfig` is supplied in the chunking
         * configuration. Its length is determined by the embedding model in use.
         *
         * @var ?array<float>
         */
        public readonly ?array $embedding = null,
    ) {}
}

/**
 * Heading trail that locates a chunk inside a Markdown document.
 *
 * Holds the chain of headings leading from the document root down to the
 * section the chunk belongs to.
 */
final class HeadingContext
{
    public function __construct(
        /**
         * Headings ordered from the document root to the chunk's own section:
         * index 0 is the outermost heading (h1) and the final element is the
         * most specific one.
         *
         * @var array<HeadingLevel>
         */
        public readonly array $headings,
    ) {}
}

/**
 * One heading within a heading hierarchy.
 */
final class HeadingLevel
{
    public function __construct(
        /** @var int Depth of the heading (1 = h1, 2 = h2, and so on). */
        public readonly int $level,
        /** @var string Text of the heading. */
        public readonly string $text,
    ) {}
}

/**
 * Describes where a chunk sits within the original document.
 */
final class ChunkMetadata
{
    public function __construct(
        /** @var int Starting byte offset of the chunk in the original text (on a valid UTF-8 boundary). */
        public readonly int $byteStart,
        /** @var int Ending byte offset of the chunk in the original text (on a valid UTF-8 boundary). */
        public readonly int $byteEnd,
        /** @var int Position of this chunk in the document, counted from zero. */
        public readonly int $chunkIndex,
        /** @var int Number of chunks the document was split into. */
        public readonly int $totalChunks,
        /**
         * Zero-based indices into `ExtractionResult.images` for the images found
         * on pages this chunk covers.
         *
         * An image is listed when its `page_number` lies within
         * `[first_page, last_page]`. The list is empty when image extraction is
         * disabled or none of the chunk's pages contain images.
         *
         * @var array<int>
         */
        public readonly array $imageIndices,
        /**
         * Token count for the chunk, when available.
         *
         * Computed by the embedding model's tokenizer if embeddings are enabled.
         *
         * @var ?int
         */
        public readonly ?int $tokenCount = null,
        /**
         * First page covered by the chunk (1-indexed).
         *
         * Set only when page tracking is enabled in the extraction configuration.
         *
         * @var ?int
         */
        public readonly ?int $firstPage = null,
        /**
         * Last page covered by the chunk (1-indexed); identical to the first page
         * for a chunk that stays on a single page.
         *
         * Set only when page tracking is enabled in the extraction configuration.
         *
         * @var ?int
         */
        public readonly ?int $lastPage = null,
        /**
         * Heading hierarchy the chunk falls under.
         *
         * Set only when the Markdown chunker (`ChunkerType::Markdown`) is used.
         *
         * @var ?HeadingContext
         */
        public readonly ?HeadingContext $headingContext = null,
    ) {}
}

/**
 * An image pulled out of a document.
 *
 * Carries the raw image bytes, descriptive metadata, and — when the image was
 * OCRed — the nested OCR result. Exposing raw bytes keeps the data usable from
 * any language: callers can hand it to PIL.Image (Python), Sharp (Node.js), or
 * whatever imaging library they prefer.
 */
final class ExtractedImage
{
    public function __construct(
        /** @var string Raw image bytes (PNG, JPEG, WebP, etc.). */
        public readonly string $data,
        /** @var string Name of the image format (e.g., "jpeg", "png", "webp"). */
        public readonly string $format,
        /** @var int Zero-indexed position of the image within the document/page. */
        public readonly int $imageIndex,
        /** @var bool True when this image is a mask image. */
        public readonly bool $isMask,
        /** @var ?int Page or slide number the image was found on (1-indexed). */
        public readonly ?int $pageNumber = null,
        /** @var ?int Width of the image in pixels. */
        public readonly ?int $width = null,
        /** @var ?int Height of the image in pixels. */
        public readonly ?int $height = null,
        /** @var ?string Colorspace of the image (e.g., "RGB", "CMYK", "Gray"). */
        public readonly ?string $colorspace = null,
        /** @var ?int Number of bits per color component (e.g., 8, 16). */
        public readonly ?int $bitsPerComponent = null,
        /** @var ?string Description of the image, if any. */
        public readonly ?string $description = null,
        /**
         * OCR extraction result for this image, present if the image was OCRed.
         *
         * The result is nested here instead of living in a separate collection
         * so that the link between image and OCR output is explicit.
         *
         * @var ?ExtractionResult
         */
        public readonly ?ExtractionResult $ocrResult = null,
        /**
         * Position of the image on the page, in PDF coordinates
         * (x0 = left, y0 = bottom, x1 = right, y1 = top).
         * Set only for images extracted from PDFs, and only when the PDF
         * extractor provides position data.
         *
         * @var ?BoundingBox
         */
        public readonly ?BoundingBox $boundingBox = null,
        /**
         * Path of the image inside the source document archive
         * (e.g., "media/image1.png" in DOCX). Used to render image references
         * when the binary data itself is not extracted.
         *
         * @var ?string
         */
        public readonly ?string $sourcePath = null,
        /**
         * Heuristic guess at what the image depicts.
         * `null` when classification was disabled or inconclusive.
         *
         * @var ?ImageKind
         */
        public readonly ?ImageKind $imageKind = null,
        /** @var ?float Confidence in `image_kind`, between 0.0 and 1.0. */
        public readonly ?float $kindConfidence = null,
        /**
         * Identifier shared by all images that make up one logical figure
         * (e.g. every raster tile of a single technical drawing).
         * `null` for images that stand alone.
         *
         * @var ?int
         */
        public readonly ?int $clusterId = null,
    ) {}
}

/**
 * Rectangle coordinates used to position an element.
 */
final class BoundingBox
{
    public function __construct(
        /** @var float X-coordinate of the left edge. */
        public readonly float $x0,
        /** @var float Y-coordinate of the bottom edge. */
        public readonly float $y0,
        /** @var float X-coordinate of the right edge. */
        public readonly float $x1,
        /** @var float Y-coordinate of the top edge. */
        public readonly float $y1,
    ) {}
}

/**
 * Metadata attached to a semantic element.
 */
final class ElementMetadata
{
    public function __construct(
        /** @var array<string, string> Extra custom metadata entries. */
        public readonly array $additional,
        /** @var ?int Page number of the element (1-indexed). */
        public readonly ?int $pageNumber = null,
        /** @var ?string Name of the source file or document. */
        public readonly ?string $filename = null,
        /** @var ?BoundingBox Bounding-box coordinates, when available. */
        public readonly ?BoundingBox $coordinates = null,
        /** @var ?int Index of the element within the element sequence. */
        public readonly ?int $elementIndex = null,
    ) {}
}

/**
 * A semantic element extracted from a document.
 *
 * Models one logical unit of content: it has a semantic classification,
 * a unique identifier, and metadata that tracks its origin and position.
 */
final class Element
{
    public function __construct(
        /** @var string Identifier that is unique to this element. */
        public readonly string $elementId,
        /** @var ElementType Semantic classification of the element. */
        public readonly ElementType $elementType,
        /** @var string Textual content of the element. */
        public readonly string $text,
        /** @var ElementMetadata Metadata describing the element. */
        public readonly ElementMetadata $metadata,
    ) {}
}

/**
 * An Excel workbook.
 *
 * Bundles every sheet of an Excel file (.xlsx, .xls, etc.) together with the
 * extracted content and metadata.
 */
final class ExcelWorkbook
{
    public function __construct(
        /** @var array<ExcelSheet> Every sheet contained in the workbook. */
        public readonly array $sheets,
        /** @var array<string, string> Metadata at workbook level (author, creation date, etc.). */
        public readonly array $metadata,
        /**
         * Revision headers for collaborative edits, read from
         * `xl/revisions/revisionHeaders.xml`.
         *
         * Filled in for legacy shared-workbook `.xlsx` files that ship an
         * `xl/revisions/` directory. Every `<header>` element becomes one
         * `DocumentRevision { kind: FormatChange }` whose `revision_id` comes
         * from the header's `guid`, `author` from `userName`, and `timestamp`
         * from `dateTime`. In v1 `anchor` and `delta` are left unset/empty
         * (per-cell log parsing is a follow-up). The value is `null` when
         * `xl/revisions/revisionHeaders.xml` does not exist.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
    ) {}
}

/**
 * One worksheet of an Excel workbook.
 *
 * Holds the sheet's content rendered as Markdown plus statistics about its
 * dimensions.
 */
final class ExcelSheet
{
    public function __construct(
        /** @var string Name of the sheet as shown in Excel. */
        public readonly string $name,
        /** @var string Content of the sheet rendered as Markdown tables. */
        public readonly string $markdown,
        /** @var int Row count. */
        public readonly int $rowCount,
        /** @var int Column count. */
        public readonly int $colCount,
        /** @var int Count of all non-empty cells. */
        public readonly int $cellCount,
        /**
         * Table cells extracted ahead of time, as a 2D list of cell values.
         * Filled in while the Markdown is generated so the Markdown never has
         * to be parsed again. `null` for empty sheets.
         *
         * @var ?array<array<string>>
         */
        public readonly ?array $tableCells = null,
    ) {}
}

/**
 * Result of extracting an XML file.
 *
 * Combines the text pulled out of the XML with structural statistics about
 * the document.
 */
final class XmlExtractionResult
{
    public function __construct(
        /** @var string Extracted text, with the XML structure filtered out. */
        public readonly string $content,
        /** @var int Number of XML elements processed in total. */
        public readonly int $elementCount,
        /** @var array<string> Sorted list of the distinct element names encountered. */
        public readonly array $uniqueElements,
    ) {}
}

/**
 * Result of extracting a plain-text or Markdown file.
 *
 * Holds the extracted text and its statistics; for Markdown files it also
 * exposes structural elements such as headers and links.
 */
final class TextExtractionResult
{
    public function __construct(
        /** @var string Text that was extracted. */
        public readonly string $content,
        /** @var int Line count. */
        public readonly int $lineCount,
        /** @var int Word count. */
        public readonly int $wordCount,
        /** @var int Character count. */
        public readonly int $characterCount,
        /** @var ?array<string> Header texts found in the Markdown (Markdown files only). */
        public readonly ?array $headers = null,
        /** @var ?array<array<string>> Links in the Markdown as (text, URL) tuples (Markdown files only). */
        public readonly ?array $links = null,
        /** @var ?array<array<string>> Code blocks as (language, code) tuples (Markdown files only). */
        public readonly ?array $codeBlocks = null,
    ) {}
}

/**
 * Result of extracting a PowerPoint (PPTX) presentation.
 *
 * Holds the extracted slide text, presentation metadata, and the images and
 * tables embedded in the deck.
 */
final class PptxExtractionResult
{
    public function __construct(
        /** @var string Text extracted from every slide. */
        public readonly string $content,
        /** @var PptxMetadata Metadata of the presentation. */
        public readonly PptxMetadata $metadata,
        /** @var int Number of slides in total. */
        public readonly int $slideCount,
        /** @var int Number of embedded images in total. */
        public readonly int $imageCount,
        /** @var int Number of tables in total. */
        public readonly int $tableCount,
        /** @var array<ExtractedImage> Images extracted from the presentation. */
        public readonly array $images,
        /**
         * Hyperlinks found in the slides, described upstream as
         * (url, optional_label) pairs.
         *
         * NOTE(review): the declared element type is `string`, which does not
         * obviously match a pair — confirm the runtime shape of each entry.
         *
         * @var array<string>
         */
        public readonly array $hyperlinks,
        /**
         * Office metadata read from docProps/core.xml and docProps/app.xml.
         *
         * Keys include "title", "author", "created_by", "subject", "keywords",
         * "modified_by", "created_at", "modified_at", and others.
         *
         * @var array<string, string>
         */
        public readonly array $officeMetadata,
        /** @var ?PageStructure Slide structure with boundaries (only when page tracking is enabled). */
        public readonly ?PageStructure $pageStructure = null,
        /** @var ?array<PageContent> Content of each slide (only when page tracking is enabled). */
        public readonly ?array $pageContents = null,
        /** @var ?DocumentStructure Structured representation of the document. */
        public readonly ?DocumentStructure $document = null,
        /**
         * Slide comments, expressed as revisions.
         *
         * Every `<p:cm>` element in `ppt/comments/comment{N}.xml` is turned into
         * a `DocumentRevision { kind: Comment }` carrying the author (looked up
         * in `ppt/commentAuthors.xml`), an ISO-8601 timestamp, and
         * `RevisionAnchor::Slide { index }`. `null` when the presentation has no
         * comment XML parts.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
    ) {}
}

/**
 * Result of extracting an email.
 *
 * Full representation of an extracted email message (.eml or .msg):
 * headers, body content, and attachments.
 */
final class EmailExtractionResult
{
    public function __construct(
        /** @var array<string> Email addresses of the primary recipients. */
        public readonly array $toEmails,
        /** @var array<string> Email addresses of the CC recipients. */
        public readonly array $ccEmails,
        /** @var array<string> Email addresses of the BCC recipients. */
        public readonly array $bccEmails,
        /** @var string Text content after cleaning/processing. Aliased as `cleaned_text` for back-compat. */
        public readonly string $content,
        /** @var array<EmailAttachment> Attachments carried by the email. */
        public readonly array $attachments,
        /** @var array<string, string> Further email headers and metadata. */
        public readonly array $metadata,
        /** @var ?string Subject line of the email. */
        public readonly ?string $subject = null,
        /** @var ?string Email address of the sender. */
        public readonly ?string $fromEmail = null,
        /** @var ?string Date/timestamp of the email. */
        public readonly ?string $date = null,
        /** @var ?string Value of the Message-ID header. */
        public readonly ?string $messageId = null,
        /** @var ?string Email body in plain-text form. */
        public readonly ?string $plainText = null,
        /** @var ?string Email body in HTML form. */
        public readonly ?string $htmlContent = null,
    ) {}
}

/**
 * An attachment of an email.
 *
 * Holds the attachment's metadata and, optionally, its content.
 */
final class EmailAttachment
{
    public function __construct(
        /** @var bool True when the attachment is an image. */
        public readonly bool $isImage,
        /** @var ?string Name of the attachment, taken from the Content-Disposition header. */
        public readonly ?string $name = null,
        /** @var ?string File name of the attachment. */
        public readonly ?string $filename = null,
        /** @var ?string MIME type of the attachment. */
        public readonly ?string $mimeType = null,
        /** @var ?int Size of the attachment in bytes. */
        public readonly ?int $size = null,
        /** @var ?string Data of the attachment, if it was extracted. */
        public readonly ?string $data = null,
    ) {}
}

/**
 * Result of an OCR run.
 *
 * Produced by performing OCR on an image or scanned document; contains the
 * recognized text and any tables that were detected.
 */
final class OcrExtractionResult
{
    public function __construct(
        /** @var string Text recognized by OCR. */
        public readonly string $content,
        /** @var string MIME type the processed image originally had. */
        public readonly string $mimeType,
        /** @var array<string, string> Metadata about the OCR run (confidence scores, language, etc.). */
        public readonly array $metadata,
        /** @var array<OcrTable> Tables that OCR detected and extracted. */
        public readonly array $tables,
        /**
         * Structured OCR elements, each with a bounding box and confidence score.
         * Present when TSV output is requested or table detection is enabled.
         *
         * @var ?array<OcrElement>
         */
        public readonly ?array $ocrElements = null,
        /**
         * Structured document built by parsing hOCR output.
         * Preserves the paragraph structure, bounding boxes, and confidence
         * scores that are lost in the flattened `content` string.
         *
         * NOTE(review): typed as a string here — presumably a serialized form
         * of the document; confirm the encoding.
         *
         * @var ?string
         */
        public readonly ?string $internalDocument = null,
    ) {}
}

/**
 * A table found by OCR.
 *
 * Describes a table structure that was recognized during OCR processing.
 */
final class OcrTable
{
    public function __construct(
        /** @var array<array<string>> Cells of the table as a 2D list (rows × columns). */
        public readonly array $cells,
        /** @var string The table rendered as Markdown. */
        public readonly string $markdown,
        /** @var int Page number the table was found on (1-indexed). */
        public readonly int $pageNumber,
        /** @var ?OcrTableBoundingBox Table bounding box in pixel coordinates, derived from OCR word positions. */
        public readonly ?OcrTableBoundingBox $boundingBox = null,
    ) {}
}

/**
 * Pixel-coordinate bounding box of a table detected by OCR.
 */
final class OcrTableBoundingBox
{
    public function __construct(
        /** @var int X-coordinate of the left edge (pixels). */
        public readonly int $left,
        /** @var int Y-coordinate of the top edge (pixels). */
        public readonly int $top,
        /** @var int X-coordinate of the right edge (pixels). */
        public readonly int $right,
        /** @var int Y-coordinate of the bottom edge (pixels). */
        public readonly int $bottom,
    ) {}
}

/**
 * Configuration of the image preprocessing that runs before OCR.
 *
 * The settings determine how an image is prepared ahead of OCR in order to
 * improve text recognition quality. Which preprocessing strategy works best
 * depends on the type of document.
 */
final class ImagePreprocessingConfig
{
    public function __construct(
        /** @var int DPI to target for the image (300 is standard, 600 for small text). */
        public readonly int $targetDpi,
        /** @var bool Detect and correct image rotation automatically. */
        public readonly bool $autoRotate,
        /** @var bool Straighten skewed (tilted) images. */
        public readonly bool $deskew,
        /** @var bool Strip noise from the image. */
        public readonly bool $denoise,
        /** @var bool Boost contrast so text is easier to see. */
        public readonly bool $contrastEnhance,
        /** @var string Method used for binarization: "otsu", "sauvola", "adaptive". */
        public readonly string $binarizationMethod,
        /** @var bool Invert the colors (white text on black → black on white). */
        public readonly bool $invertColors,
    ) {}

    /**
     * Stub for the native factory that returns a default configuration.
     *
     * Present for static analysis only: the body always throws, because the
     * real implementation is supplied by the compiled extension.
     *
     * @throws \RuntimeException Always, when the stub itself is executed.
     */
    public static function default(): ImagePreprocessingConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Configuration of the Tesseract OCR engine.
 *
 * Exposes fine-grained control over Tesseract's parameters. The defaults suit
 * most users; the individual settings exist so the engine can be tuned for
 * particular document types (invoices, handwriting, etc.).
 */
final class TesseractConfig
{
    public function __construct(
        /** @var string Code of the language to recognize (e.g., "eng", "deu", "fra"). */
        public readonly string $language,
        /**
         * Page Segmentation Mode, in the range 0-13.
         *
         * Frequently used values:
         * - 3: Fully automatic page segmentation (native default)
         * - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
         * - 11: Sparse text with no particular order
         *
         * @var int
         */
        public readonly int $psm,
        /** @var string Format of the output ("text" or "markdown"). */
        public readonly string $outputFormat,
        /**
         * OCR Engine Mode, in the range 0-3.
         *
         * - 0: Legacy engine only
         * - 1: Neural nets (LSTM) only (usually best)
         * - 2: Legacy + LSTM
         * - 3: Default (based on what's available)
         *
         * @var int
         */
        public readonly int $oem,
        /**
         * Lowest acceptable confidence, in the range 0.0-100.0.
         *
         * Words whose confidence falls below this value may be rejected or flagged.
         *
         * @var float
         */
        public readonly float $minConfidence,
        /** @var bool Turn on automatic table detection and reconstruction. */
        public readonly bool $enableTableDetection,
        /** @var float Lowest acceptable confidence for table detection (0.0-1.0). */
        public readonly float $tableMinConfidence,
        /** @var int Column threshold used by table detection (pixels). */
        public readonly int $tableColumnThreshold,
        /** @var float Row threshold ratio used by table detection (0.0-1.0). */
        public readonly float $tableRowThresholdRatio,
        /** @var bool Turn on caching of OCR results. */
        public readonly bool $useCache,
        /** @var bool Classify characters using pre-adapted templates. */
        public readonly bool $classifyUsePreAdaptedTemplates,
        /** @var bool Turn on the N-gram language model. */
        public readonly bool $languageModelNgramOn,
        /** @var bool Keep good words from being rejected during block-level processing. */
        public readonly bool $tesseditDontBlkrejGoodWds,
        /** @var bool Keep good words from being rejected during row-level processing. */
        public readonly bool $tesseditDontRowrejGoodWds,
        /** @var bool Turn on dictionary correction. */
        public readonly bool $tesseditEnableDictCorrection,
        /** @var string Characters that are allowed (empty = all allowed). */
        public readonly string $tesseditCharWhitelist,
        /** @var string Characters that are forbidden (empty = none forbidden). */
        public readonly string $tesseditCharBlacklist,
        /** @var bool Use the params model of the primary language. */
        public readonly bool $tesseditUsePrimaryParamsModel,
        /** @var bool Detect spaces of variable width. */
        public readonly bool $textordSpaceSizeIsVariable,
        /** @var bool Use the adaptive thresholding method. */
        public readonly bool $thresholdingMethod,
        /**
         * Configuration of image preprocessing.
         *
         * Governs how images are prepared before OCR runs. It can noticeably
         * improve quality for scanned documents and low-quality images.
         *
         * @var ?ImagePreprocessingConfig
         */
        public readonly ?ImagePreprocessingConfig $preprocessing = null,
    ) {}

    /**
     * Stub for the native factory that returns a default configuration.
     *
     * Present for static analysis only: the body always throws, because the
     * real implementation is supplied by the compiled extension.
     *
     * @throws \RuntimeException Always, when the stub itself is executed.
     */
    public static function default(): TesseractConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Metadata describing how an image was preprocessed.
 *
 * Records the transformations applied to an image during OCR preprocessing:
 * DPI normalization, resizing, and resampling.
 */
final class ImagePreprocessingMetadata
{
    public function __construct(
        /** @var array<int> Dimensions of the original image as (width, height), in pixels. */
        public readonly array $originalDimensions,
        /** @var array<float> DPI of the original image as (horizontal, vertical). */
        public readonly array $originalDpi,
        /** @var int DPI requested by the configuration. */
        public readonly int $targetDpi,
        /** @var float Factor by which the image was scaled. */
        public readonly float $scaleFactor,
        /** @var bool True when the DPI was adjusted automatically based on content. */
        public readonly bool $autoAdjusted,
        /** @var int DPI the image has after processing. */
        public readonly int $finalDpi,
        /** @var string Resampling algorithm that was applied ("LANCZOS3", "CATMULLROM", etc.). */
        public readonly string $resampleMethod,
        /** @var bool True when the dimensions were clamped to max_image_dimension. */
        public readonly bool $dimensionClamped,
        /** @var bool True when resizing was skipped because the dimensions were already optimal. */
        public readonly bool $skippedResize,
        /** @var ?array<int> Dimensions after resizing (set if the image was resized). */
        public readonly ?array $newDimensions = null,
        /** @var ?int Optimal DPI that was calculated (set if auto_adjust_dpi is enabled). */
        public readonly ?int $calculatedDpi = null,
        /** @var ?string Error message, set if the resize failed. */
        public readonly ?string $resizeError = null,
    ) {}
}

/**
 * Metadata attached to an extraction result.
 *
 * Holds the fields shared by every format, the format-specific metadata
 * (a discriminated union), and any extra custom fields added by postprocessors.
 */
final class Metadata
{
    public function __construct(
        /**
         * True when OCR took part in the extraction.
         *
         * The flag is `true` whenever the pipeline ran an OCR backend
         * (Tesseract, PaddleOCR, VLM, etc.) and used its output as the primary
         * or fallback text; `false` means only native text extraction was used.
         *
         * @var bool
         */
        public readonly bool $ocrUsed,
        /**
         * Extra custom fields contributed by postprocessors.
         *
         * Serialized as a nested `"additional"` object rather than flattened into the root.
         * Native-side note: keys are `Cow<'static, str>` so static string keys avoid allocation.
         *
         * @var array<string, string>
         */
        public readonly array $additional,
        /** @var ?string Title of the document */
        public readonly ?string $title = null,
        /** @var ?string Subject or description of the document */
        public readonly ?string $subject = null,
        /** @var ?array<string> Primary author(s); always a list, for consistency */
        public readonly ?array $authors = null,
        /** @var ?array<string> Keywords/tags; always a list, for consistency */
        public readonly ?array $keywords = null,
        /** @var ?string Primary language as an ISO 639 code */
        public readonly ?string $language = null,
        /** @var ?string Creation timestamp in ISO 8601 format */
        public readonly ?string $createdAt = null,
        /** @var ?string Last-modification timestamp in ISO 8601 format */
        public readonly ?string $modifiedAt = null,
        /** @var ?string User that created the document */
        public readonly ?string $createdBy = null,
        /** @var ?string User that last modified the document */
        public readonly ?string $modifiedBy = null,
        /** @var ?PageStructure Page/slide/sheet structure, including boundaries */
        public readonly ?PageStructure $pages = null,
        /**
         * Metadata specific to the document format (discriminated union).
         *
         * Serialized as a nested `"format"` object carrying a `format_type`
         * discriminator field.
         *
         * @var ?FormatMetadata
         */
        public readonly ?FormatMetadata $format = null,
        /** @var ?ImagePreprocessingMetadata Image preprocessing details (set when OCR preprocessing was applied) */
        public readonly ?ImagePreprocessingMetadata $imagePreprocessing = null,
        /** @var ?string JSON schema, used for structured data extraction */
        public readonly ?string $jsonSchema = null,
        /** @var ?ErrorMetadata Error details, used by batch operations */
        public readonly ?ErrorMetadata $error = null,
        /**
         * How long the extraction took, in milliseconds (for benchmarking).
         *
         * Batch extraction fills this in to give per-file timing. It is `null`
         * for single-file extraction, which relies on external timing.
         *
         * @var ?int
         */
        public readonly ?int $extractionDurationMs = null,
        /** @var ?string Document category, from frontmatter or classification. */
        public readonly ?string $category = null,
        /** @var ?array<string> Document tags, from frontmatter. */
        public readonly ?array $tags = null,
        /** @var ?string Version string of the document, from frontmatter. */
        public readonly ?string $documentVersion = null,
        /** @var ?string Abstract or summary text, from frontmatter. */
        public readonly ?string $abstractText = null,
        /**
         * Identifier of the output format (e.g., "markdown", "html", "text").
         *
         * Written by the output format pipeline stage when a format conversion
         * is applied. Formerly kept in `metadata.additional["output_format"]`.
         *
         * @var ?string
         */
        public readonly ?string $outputFormat = null,
    ) {}

    /**
     * Stub only: this body always throws; the actual implementation is
     * provided by the native extension at runtime.
     *
     * @throws \RuntimeException Always, when the stub itself is executed.
     */
    public function isEmpty(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Format metadata for Excel/spreadsheet documents.
 *
 * Marks the source as a spreadsheet through the `FormatMetadata.Excel`
 * discriminant; the sheet count and the sheet names live on this class.
 */
final class ExcelMetadata
{
    public function __construct(
        /** @var ?int How many sheets the workbook contains. */
        public readonly ?int $sheetCount = null,
        /** @var ?array<string> Name of every sheet in the workbook. */
        public readonly ?array $sheetNames = null,
    ) {}
}

/**
 * Metadata pulled from .eml and .msg email files.
 *
 * Covers sender and recipient details, the message ID, and the attachment list.
 */
final class EmailMetadata
{
    public function __construct(
        /** @var array<string> "To" (primary) recipients */
        public readonly array $toEmails,
        /** @var array<string> Carbon-copy (CC) recipients */
        public readonly array $ccEmails,
        /** @var array<string> Blind-carbon-copy (BCC) recipients */
        public readonly array $bccEmails,
        /** @var array<string> Filenames of the attachments */
        public readonly array $attachments,
        /** @var ?string Email address of the sender */
        public readonly ?string $fromEmail = null,
        /** @var ?string Display name of the sender */
        public readonly ?string $fromName = null,
        /** @var ?string Value of the Message-ID header */
        public readonly ?string $messageId = null,
    ) {}
}

/**
 * Metadata for archives (ZIP/TAR/7Z).
 *
 * Taken from compressed archive files; carries the file list and size information.
 */
final class ArchiveMetadata
{
    public function __construct(
        /** @var string Format of the archive ("ZIP", "TAR", "7Z", etc.) */
        public readonly string $format,
        /** @var int How many files the archive holds in total */
        public readonly int $fileCount,
        /** @var array<string> Paths of the files inside the archive */
        public readonly array $fileList,
        /** @var int Total size in bytes once uncompressed */
        public readonly int $totalSize,
        /** @var ?int Size in bytes while compressed, when available */
        public readonly ?int $compressedSize = null,
    ) {}
}

/**
 * Metadata read from image files.
 *
 * Carries the dimensions, the format, and the EXIF data.
 */
final class ImageMetadata
{
    public function __construct(
        /** @var int Width of the image, in pixels */
        public readonly int $width,
        /** @var int Height of the image, in pixels */
        public readonly int $height,
        /** @var string Format of the image (e.g., "PNG", "JPEG", "TIFF") */
        public readonly string $format,
        /** @var array<string, string> EXIF tags of the image */
        public readonly array $exif,
    ) {}
}

/**
 * Metadata gathered while parsing XML.
 *
 * Gives statistics on the structure of the XML document.
 */
final class XmlMetadata
{
    public function __construct(
        /** @var int How many XML elements were processed in total */
        public readonly int $elementCount,
        /** @var array<string> Unique element tag names, sorted */
        public readonly array $uniqueElements,
    ) {}
}

/**
 * Metadata for plain text and Markdown.
 *
 * Taken from plain text and Markdown files. Holds the counts and, for
 * Markdown, structural elements such as headers and links.
 */
final class TextMetadata
{
    public function __construct(
        /** @var int Line count of the document */
        public readonly int $lineCount,
        /** @var int Word count */
        public readonly int $wordCount,
        /** @var int Character count */
        public readonly int $characterCount,
        /** @var ?array<string> Markdown headers, heading text only (Markdown files) */
        public readonly ?array $headers = null,
        /** @var ?array<array<string>> Markdown links, each a (text, url) tuple (Markdown files) */
        public readonly ?array $links = null,
        /** @var ?array<array<string>> Code blocks, each a (language, code) tuple (Markdown files) */
        public readonly ?array $codeBlocks = null,
    ) {}
}

/**
 * Metadata for one header/heading element.
 */
final class HeaderMetadata
{
    public function __construct(
        /** @var int Heading level, from 1 (h1) to 6 (h6) */
        public readonly int $level,
        /** @var string Text of the header, normalized */
        public readonly string $text,
        /** @var int Depth of the header element in the document tree */
        public readonly int $depth,
        /** @var int Byte offset within the original HTML document */
        public readonly int $htmlOffset,
        /** @var ?string Value of the HTML id attribute, when present */
        public readonly ?string $id = null,
    ) {}
}

/**
 * Metadata for one link element.
 */
final class LinkMetadata
{
    public function __construct(
        /** @var string URL taken from the href attribute */
        public readonly string $href,
        /** @var string Text content of the link, normalized */
        public readonly string $text,
        /** @var LinkType Classification of the link */
        public readonly LinkType $linkType,
        /** @var array<string> Values of the rel attribute */
        public readonly array $rel,
        /** @var array<array<string>> Remaining attributes, as key-value pairs */
        public readonly array $attributes,
        /** @var ?string Title attribute, when set */
        public readonly ?string $title = null,
    ) {}
}

/**
 * Metadata for one image element.
 */
final class ImageMetadataType
{
    public function __construct(
        /** @var string Source of the image (URL, data URI, or SVG content) */
        public readonly string $src,
        /** @var ImageType Classification of the image */
        public readonly ImageType $imageType,
        /** @var array<array<string>> Remaining attributes, as key-value pairs */
        public readonly array $attributes,
        /** @var ?string Alternative text, from the alt attribute */
        public readonly ?string $alt = null,
        /** @var ?string Value of the title attribute */
        public readonly ?string $title = null,
        /** @var ?array<int> Image size as (width, height), when available */
        public readonly ?array $dimensions = null,
    ) {}
}

/**
 * One block of structured data (Schema.org, microdata, RDFa).
 */
final class StructuredData
{
    public function __construct(
        /** @var StructuredDataType Kind of structured data */
        public readonly StructuredDataType $dataType,
        /** @var string The block as a raw JSON string */
        public readonly string $rawJson,
        /** @var ?string Schema type when it can be detected (e.g., "Article", "Event", "Product") */
        public readonly ?string $schemaType = null,
    ) {}
}

/**
 * Metadata pulled from HTML documents.
 *
 * Covers document-level metadata, Open Graph data, Twitter Card metadata,
 * and the extracted structural elements (headers, links, images, structured data).
 */
final class HtmlMetadata
{
    public function __construct(
        /** @var array<string> Keywords from the `<meta name="keywords">` tag, split on commas */
        public readonly array $keywords,
        /**
         * Open Graph metadata (og:* properties), used by social media.
         * Keys such as "title", "description", "image", "url", etc.
         *
         * @var array<string, string>
         */
        public readonly array $openGraph,
        /**
         * Twitter Card metadata (twitter:* properties).
         * Keys such as "card", "site", "creator", "title", "description", "image", etc.
         *
         * @var array<string, string>
         */
        public readonly array $twitterCard,
        /**
         * Meta tags that no dedicated field covers.
         * Keys are the meta name/property attributes; values are the content.
         *
         * @var array<string, string>
         */
        public readonly array $metaTags,
        /** @var array<HeaderMetadata> Header elements that were extracted, with hierarchy */
        public readonly array $headers,
        /** @var array<LinkMetadata> Hyperlinks that were extracted, with type classification */
        public readonly array $links,
        /** @var array<ImageMetadataType> Images that were extracted, with source and dimensions */
        public readonly array $images,
        /** @var array<StructuredData> Structured data blocks that were extracted */
        public readonly array $structuredData,
        /** @var ?string Title taken from the `<title>` tag */
        public readonly ?string $title = null,
        /** @var ?string Description taken from the `<meta name="description">` tag */
        public readonly ?string $description = null,
        /** @var ?string Author taken from the `<meta name="author">` tag */
        public readonly ?string $author = null,
        /** @var ?string Canonical URL taken from the `<link rel="canonical">` tag */
        public readonly ?string $canonicalUrl = null,
        /** @var ?string Base URL from the `<base href="">` tag, for resolving relative URLs */
        public readonly ?string $baseHref = null,
        /** @var ?string Language taken from the `lang` attribute */
        public readonly ?string $language = null,
        /** @var ?TextDirection Text direction taken from the `dir` attribute */
        public readonly ?TextDirection $textDirection = null,
    ) {}
}

/**
 * Metadata about OCR processing.
 *
 * Records the OCR processing configuration and its results.
 */
final class OcrMetadata
{
    public function __construct(
        /** @var string Language code(s) the OCR ran with */
        public readonly string $language,
        /** @var int Page Segmentation Mode (PSM) of Tesseract */
        public readonly int $psm,
        /** @var string Format of the output (e.g., "text", "hocr") */
        public readonly string $outputFormat,
        /** @var int How many tables were detected */
        public readonly int $tableCount,
        /** @var ?int Table row count (no upstream description; confirm exact meaning) */
        public readonly ?int $tableRows = null,
        /** @var ?int Table column count (no upstream description; confirm exact meaning) */
        public readonly ?int $tableCols = null,
    ) {}
}

/**
 * Metadata describing an error (used by batch operations).
 */
final class ErrorMetadata
{
    public function __construct(
        /** @var string Type of the error */
        public readonly string $errorType,
        /** @var string Error message */
        public readonly string $message,
    ) {}
}

/**
 * Metadata for PowerPoint presentations.
 *
 * Taken from PPTX files; carries slide counts and presentation details.
 */
final class PptxMetadata
{
    public function __construct(
        /** @var int How many slides the presentation has in total */
        public readonly int $slideCount,
        /** @var array<string> Slide names, when available */
        public readonly array $slideNames,
        /** @var ?int How many images are embedded */
        public readonly ?int $imageCount = null,
        /** @var ?int How many tables there are */
        public readonly ?int $tableCount = null,
    ) {}
}

/**
 * Metadata for Word documents.
 *
 * Taken from DOCX files through the shared Office Open XML metadata extraction.
 * Works together with the `office_metadata` module for core/app/custom properties.
 */
final class DocxMetadata
{
    public function __construct(
        /**
         * Core properties read from docProps/core.xml (Dublin Core metadata).
         *
         * Holds title, creator, subject, keywords, dates, etc.
         * The format is shared by DOCX/PPTX/XLSX documents.
         *
         * @var ?CoreProperties
         */
        public readonly ?CoreProperties $coreProperties = null,
        /**
         * Application properties read from docProps/app.xml (Word-specific statistics).
         *
         * Holds word count, page count, paragraph count, editing time, etc.
         * This is the DOCX-specific variant of the Office application properties.
         *
         * @var ?DocxAppProperties
         */
        public readonly ?DocxAppProperties $appProperties = null,
        /**
         * Custom properties read from docProps/custom.xml (user-defined properties).
         *
         * Key-value pairs defined by users or applications. Upstream describes the
         * values as strings, numbers, booleans, or dates, while this stub types them
         * as strings. NOTE(review): confirm how non-string values are represented.
         *
         * @var ?array<string, string>
         */
        public readonly ?array $customProperties = null,
    ) {}
}

/**
 * Metadata for CSV/TSV files.
 */
final class CsvMetadata
{
    public function __construct(
        /** @var int Row count */
        public readonly int $rowCount,
        /** @var int Column count */
        public readonly int $columnCount,
        /** @var bool Header flag (presumably: a header row is present; confirm) */
        public readonly bool $hasHeader,
        /** @var ?string Delimiter, when set */
        public readonly ?string $delimiter = null,
        /** @var ?array<string> Column types, when set (value format not shown here) */
        public readonly ?array $columnTypes = null,
    ) {}
}

/**
 * Metadata for a BibTeX bibliography.
 */
final class BibtexMetadata
{
    public function __construct(
        /** @var int How many entries the bibliography has. */
        public readonly int $entryCount,
        /** @var array<string> Citation keys */
        public readonly array $citationKeys,
        /** @var array<string> Authors */
        public readonly array $authors,
        /** @var ?YearRange Year range, when set */
        public readonly ?YearRange $yearRange = null,
        /** @var ?array<string, int> Entry types (presumably type name => count; confirm) */
        public readonly ?array $entryTypes = null,
    ) {}
}

/**
 * Metadata for citation files (RIS, PubMed, EndNote).
 */
final class CitationMetadata
{
    public function __construct(
        /** @var int Citation count */
        public readonly int $citationCount,
        /** @var array<string> Authors */
        public readonly array $authors,
        /** @var array<string> DOIs */
        public readonly array $dois,
        /** @var array<string> Keywords */
        public readonly array $keywords,
        /** @var ?string Format, when set */
        public readonly ?string $format = null,
        /** @var ?YearRange Year range, when set */
        public readonly ?YearRange $yearRange = null,
    ) {}
}

/**
 * Range of years, used by bibliographic metadata.
 */
final class YearRange
{
    public function __construct(
        /** @var array<int> Years */
        public readonly array $years,
        /** @var ?int Minimum, when set */
        public readonly ?int $min = null,
        /** @var ?int Maximum, when set */
        public readonly ?int $max = null,
    ) {}
}

/**
 * Metadata for FictionBook (FB2) documents.
 */
final class FictionBookMetadata
{
    public function __construct(
        /** @var array<string> Genres */
        public readonly array $genres,
        /** @var array<string> Sequences */
        public readonly array $sequences,
        /** @var ?string Annotation, when set */
        public readonly ?string $annotation = null,
    ) {}
}

/**
 * Metadata for dBASE (DBF) files.
 */
final class DbfMetadata
{
    public function __construct(
        /** @var int Record count */
        public readonly int $recordCount,
        /** @var int Field count */
        public readonly int $fieldCount,
        /** @var array<DbfFieldInfo> Field descriptions */
        public readonly array $fields,
    ) {}
}

/**
 * Information about one dBASE field.
 */
final class DbfFieldInfo
{
    public function __construct(
        /** @var string Field name */
        public readonly string $name,
        /** @var string Field type */
        public readonly string $fieldType,
    ) {}
}

/**
 * Metadata for JATS (Journal Article Tag Suite) documents.
 */
final class JatsMetadata
{
    public function __construct(
        /** @var array<string, string> History dates (key/value meaning not shown here; confirm) */
        public readonly array $historyDates,
        /** @var array<ContributorRole> Contributors with their roles */
        public readonly array $contributorRoles,
        /** @var ?string Copyright, when set */
        public readonly ?string $copyright = null,
        /** @var ?string License, when set */
        public readonly ?string $license = null,
    ) {}
}

/**
 * A JATS contributor together with a role.
 */
final class ContributorRole
{
    public function __construct(
        /** @var string Contributor name */
        public readonly string $name,
        /** @var ?string Role, when set */
        public readonly ?string $role = null,
    ) {}
}

/**
 * Metadata for EPUB documents (Dublin Core extensions).
 *
 * NOTE(review): the fields presumably mirror the Dublin Core terms of the
 * same name (`dcFormat`/`dcType` for format/type); confirm against the extractor.
 */
final class EpubMetadata
{
    public function __construct(
        /** @var ?string Coverage, when set */
        public readonly ?string $coverage = null,
        /** @var ?string DC format, when set */
        public readonly ?string $dcFormat = null,
        /** @var ?string Relation, when set */
        public readonly ?string $relation = null,
        /** @var ?string Source, when set */
        public readonly ?string $source = null,
        /** @var ?string DC type, when set */
        public readonly ?string $dcType = null,
        /** @var ?string Cover image, when set (value format not shown here) */
        public readonly ?string $coverImage = null,
    ) {}
}

/**
 * Metadata for Outlook PST archives.
 */
final class PstMetadata
{
    public function __construct(
        /** @var int Message count */
        public readonly int $messageCount,
    ) {}
}

/**
 * Confidence scores attached to an OCR element.
 *
 * Keeps detection confidence (is there text at this location?) apart from
 * recognition confidence (is the recognized text content right?).
 */
final class OcrConfidence
{
    public function __construct(
        /**
         * Recognition confidence, i.e. confidence in the text content.
         *
         * Range: 0.0 to 1.0.
         *
         * @var float
         */
        public readonly float $recognition,
        /**
         * Detection confidence, i.e. the OCR engine's confidence that text exists here.
         *
         * PaddleOCR reports it as `box_score`; Tesseract has no direct equivalent.
         * Range: 0.0 to 1.0, or `null` when not available.
         *
         * @var ?float
         */
        public readonly ?float $detection = null,
    ) {}
}

/**
 * Rotation details of an OCR element.
 */
final class OcrRotation
{
    public function __construct(
        /** @var float Angle of rotation, in degrees (0, 90, 180, 270 for PaddleOCR). */
        public readonly float $angleDegrees,
        /** @var ?float Confidence score of the rotation detection. */
        public readonly ?float $confidence = null,
    ) {}
}

/**
 * Unified OCR element: a piece of detected text with its full metadata.
 *
 * The primary type for structured OCR output; it keeps all the information
 * coming from both the Tesseract and the PaddleOCR backends.
 */
final class OcrElement
{
    public function __construct(
        /** @var string Text content that was recognized. */
        public readonly string $text,
        /** @var OcrBoundingGeometry Bounding geometry, either a rectangle or a quadrilateral. */
        public readonly OcrBoundingGeometry $geometry,
        /** @var OcrConfidence Detection and recognition confidence scores. */
        public readonly OcrConfidence $confidence,
        /** @var OcrElementLevel Level in the hierarchy (word, line, block, page). */
        public readonly OcrElementLevel $level,
        /** @var int Number of the page (1-indexed). */
        public readonly int $pageNumber,
        /** @var array<string, string> Backend-specific metadata that has no place in the unified schema. */
        public readonly array $backendMetadata,
        /** @var ?OcrRotation Rotation details, when detected. */
        public readonly ?OcrRotation $rotation = null,
        /**
         * ID of the parent element, for hierarchical relationships.
         *
         * Used only for Tesseract output, which has a word -> line -> block hierarchy.
         *
         * @var ?string
         */
        public readonly ?string $parentId = null,
    ) {}
}

/**
 * Configuration of OCR element extraction.
 *
 * Governs how OCR elements are extracted and filtered.
 */
final class OcrElementConfig
{
    public function __construct(
        /**
         * Whether the extraction result should carry OCR elements.
         *
         * When true, the `ocr_elements` field of `ExtractionResult` is populated.
         *
         * @var bool
         */
        public readonly bool $includeElements,
        /**
         * Lowest hierarchical level to include.
         *
         * Elements below it (e.g., words when min_level is Line) are excluded.
         *
         * @var OcrElementLevel
         */
        public readonly OcrElementLevel $minLevel,
        /**
         * Threshold for the minimum recognition confidence (0.0-1.0).
         *
         * Elements whose confidence falls below it are filtered out.
         *
         * @var float
         */
        public readonly float $minConfidence,
        /**
         * Whether hierarchical relationships between elements should be built.
         *
         * When true, the `parent_id` fields are populated from spatial containment.
         * Meaningful for Tesseract output only.
         *
         * @var bool
         */
        public readonly bool $buildHierarchy,
    ) {}
}

/**
 * Page structure shared by all document kinds.
 *
 * Handles the different page types (PDF pages, PPTX slides, Excel sheets)
 * and carries offset boundaries used for chunk-to-page mapping.
 */
final class PageStructure
{
    public function __construct(
        /** @var int How many pages/slides/sheets there are in total */
        public readonly int $totalCount,
        /** @var PageUnitType Kind of paginated unit */
        public readonly PageUnitType $unitType,
        /**
         * Offset boundaries of each page.
         *
         * Maps ranges of the extracted content to page numbers; used to compute
         * the page range of a chunk. Upstream calls these "character offset"
         * boundaries, while `PageBoundary` exposes `byteStart`/`byteEnd`.
         * NOTE(review): confirm which unit applies.
         *
         * @var ?array<PageBoundary>
         */
        public readonly ?array $boundaries = null,
        /** @var ?array<PageInfo> Per-page metadata in detail (optional, only when needed) */
        public readonly ?array $pages = null,
    ) {}
}

/**
 * Byte-offset boundary of one page.
 *
 * Records where the content of a given page begins and ends inside the main
 * content string, so byte positions can be mapped to page numbers. Native-side
 * note: offsets are guaranteed to fall on valid UTF-8 character boundaries when
 * standard String methods (push_str, push, etc.) are used.
 */
final class PageBoundary
{
    public function __construct(
        /** @var int Byte offset at which the page begins in the content string (UTF-8 valid boundary, inclusive) */
        public readonly int $byteStart,
        /** @var int Byte offset at which the page ends in the content string (UTF-8 valid boundary, exclusive) */
        public readonly int $byteEnd,
        /** @var int Number of the page (1-indexed) */
        public readonly int $pageNumber,
    ) {}
}

/**
 * Metadata of a single page/slide/sheet.
 *
 * Records per-page information: dimensions, content counts, and the
 * visibility state (for presentations).
 */
final class PageInfo
{
    public function __construct(
        /** @var int Number of the page (1-indexed) */
        public readonly int $number,
        /**
         * Whether the page holds non-trivial vector graphics (paths, shapes, curves).
         *
         * Signals vector-drawn content such as charts, diagrams, or geometric
         * shapes (e.g., from Adobe InDesign, LaTeX TikZ). Such content does not
         * show up in `ExtractionResult.images` because it is not embedded as raster
         * XObjects. The flag is `true` once the path count exceeds a heuristic
         * threshold, hinting that downstream consumers may want to rasterize the
         * page to capture it.
         *
         * Upstream states the value is only populated for PDFs and unset for other
         * document types, yet this stub declares a non-nullable `bool`.
         * NOTE(review): confirm what non-PDF documents report here.
         *
         * @var bool
         */
        public readonly bool $hasVectorGraphics,
        /** @var ?string Title of the page (usually for presentations) */
        public readonly ?string $title = null,
        /** @var ?array<float> Size as (width, height), in points (PDF) or pixels (images) */
        public readonly ?array $dimensions = null,
        /** @var ?int How many images the page has */
        public readonly ?int $imageCount = null,
        /** @var ?int How many tables the page has */
        public readonly ?int $tableCount = null,
        /** @var ?bool Whether the page is hidden (e.g., in presentations) */
        public readonly ?bool $hidden = null,
        /**
         * Whether the page is blank (no meaningful text, no images, no tables).
         *
         * A page counts as blank when it has fewer than 3 non-whitespace characters
         * and holds no tables or images. Handy for filtering out empty pages in
         * scanned documents or PDFs with blank separator pages.
         *
         * @var ?bool
         */
        public readonly ?bool $isBlank = null,
    ) {}
}

/**
 * Content of a single page/slide.
 *
 * With page extraction enabled, a document is split into per-page content,
 * and the related tables and images are mapped onto each page.
 *
 * # Performance
 *
 * Native-side note: tables and images are Arc-wrapped for memory efficiency:
 * - `Vec<Arc<Table>>` allows zero-copy sharing of table data
 * - `Vec<Arc<ExtractedImage>>` allows zero-copy sharing of image data
 * - Exact JSON compatibility is kept through custom Serialize/Deserialize
 *
 * Documents with shared tables/images thus use less memory, since redundant
 * copies during serialization are avoided.
 */
final class PageContent
{
    public function __construct(
        /** @var int Number of the page (1-indexed) */
        public readonly int $pageNumber,
        /** @var string Text content of the page */
        public readonly string $content,
        /**
         * Tables located on the page (Arc-backed on the native side for memory efficiency).
         *
         * Serialized as Vec<Table> for JSON compatibility, while Arc semantics
         * are kept in-memory for zero-copy sharing.
         *
         * @var array<Table>
         */
        public readonly array $tables,
        /**
         * Indices into `ExtractionResult.images` for the images located on the page.
         *
         * Every value is a zero-based index into the top-level `images` collection.
         * Populated only when `extract_images = true` in the extraction config.
         *
         * @var array<int>
         */
        public readonly array $imageIndices,
        /**
         * Hierarchy information of the page (when hierarchy extraction is enabled).
         *
         * Holds the text hierarchy levels (H1-H6) extracted from the page content.
         *
         * @var ?PageHierarchy
         */
        public readonly ?PageHierarchy $hierarchy = null,
        /**
         * Whether the page is blank (no meaningful text content).
         *
         * Decided during extraction by analysing the text content.
         * A page is blank when it has fewer than 3 non-whitespace characters
         * and holds no tables or images.
         *
         * @var ?bool
         */
        public readonly ?bool $isBlank = null,
        /**
         * Layout detection regions of the page (when layout detection is enabled).
         *
         * Holds the detected layout regions with class, confidence, bounding box,
         * and area fraction. Populated only when layout detection is configured.
         *
         * @var ?array<LayoutRegion>
         */
        public readonly ?array $layoutRegions = null,
        /**
         * Speaker notes of the slide (PPTX only).
         *
         * Holds the text of the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
         * Populated only when the source is a PPTX file and notes are present.
         *
         * @var ?string
         */
        public readonly ?string $speakerNotes = null,
        /**
         * Name of the section the slide belongs to (PPTX only).
         *
         * PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
         * `ppt/presentation.xml`). Populated only when the source is a PPTX file and
         * the slide belongs to a named section.
         *
         * @var ?string
         */
        public readonly ?string $sectionName = null,
        /**
         * Name of the sheet behind this page (XLSX/ODS only).
         *
         * Each spreadsheet sheet maps to one `PageContent` entry. The field holds the
         * sheet's display name as shown in the workbook. It is `null` for every
         * non-spreadsheet format and for sheets with an empty name.
         *
         * @var ?string
         */
        public readonly ?string $sheetName = null,
    ) {}
}

/**
 * One layout region detected on a page.
 *
 * With layout detection enabled, a page may carry layout regions that
 * identify the different content types (text, pictures, tables, etc.),
 * each with a confidence score and a spatial position.
 */
final class LayoutRegion
{
    public function __construct(
        /** @var string Name of the layout class (e.g. "picture", "table", "text", "section_header"). */
        public readonly string $className,
        /** @var float Confidence score given by the layout detection model (0.0 to 1.0). */
        public readonly float $confidence,
        /** @var BoundingBox Bounding box, in document coordinate space. */
        public readonly BoundingBox $boundingBox,
        /** @var float Share of the page area the region covers (0.0 to 1.0). */
        public readonly float $areaFraction,
    ) {}
}

/**
 * Heading/block hierarchy of a single page.
 *
 * Produced when PDF text hierarchy extraction is enabled. Holds the page's
 * hierarchical blocks, each assigned a heading level (H1-H6), so the semantic
 * structure of the document can be reconstructed.
 */
final class PageHierarchy
{
    public function __construct(
        /** @var int How many hierarchy blocks this page has. */
        public readonly int $blockCount,
        /** @var array<HierarchicalBlock> The blocks themselves, each with its heading level. */
        public readonly array $blocks,
    ) {}
}

/**
 * A block of text together with its assigned hierarchy level.
 *
 * The level is semantic heading information derived from font size
 * clustering and hierarchical analysis.
 */
final class HierarchicalBlock
{
    public function __construct(
        /** @var string Text contained in the block. */
        public readonly string $text,
        /** @var float Font size of the block's text. */
        public readonly float $fontSize,
        /**
         * Hierarchy level of the block, mirroring HTML heading tags.
         *
         * Possible values:
         * - "h1": top-level heading
         * - "h2": secondary heading
         * - "h3": tertiary heading
         * - "h4": quaternary heading
         * - "h5": quinary heading
         * - "h6": senary heading
         * - "body": body text (no heading level)
         *
         * @var string
         */
        public readonly string $level,
        /**
         * Bounding box of the block, when available.
         *
         * Coordinates are given as (left, top, right, bottom) in PDF units.
         *
         * @var ?array<float>
         */
        public readonly ?array $bbox = null,
    ) {}
}

/**
 * One changed cell inside a table.
 *
 * On the native (Rust) side this type is defined outside the `diff` module so
 * that `RevisionDelta` can use it even when the `diff` Cargo feature is
 * disabled; the `diff` module re-exports it unchanged.
 */
final class CellChange
{
    public function __construct(
        /** @var int Row index of the cell (zero-based). */
        public readonly int $row,
        /** @var int Column index of the cell (zero-based). */
        public readonly int $col,
        /** @var string Cell value before the change. */
        public readonly string $from,
        /** @var string Cell value after the change. */
        public readonly string $to,
    ) {}
}

/**
 * One tracked change embedded in a document.
 *
 * Filled in by per-format extractors that understand change-tracking
 * metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …).
 * Until a format-specific implementation exists, an extractor reports no
 * revisions on its `ExtractionResult`.
 */
final class DocumentRevision
{
    public function __construct(
        /**
         * Identifier of the revision; its form depends on the format.
         *
         * For DOCX it is the value of the `w:id` attribute on the change
         * element (e.g. `"42"`). If that attribute is missing, a synthetic
         * fallback such as `"docx-ins-0"` or `"docx-del-3"` is generated.
         *
         * @var string
         */
        public readonly string $revisionId,
        /** @var RevisionKind Semantic kind of the revision. */
        public readonly RevisionKind $kind,
        /** @var RevisionDelta Content changes that constitute the revision. */
        public readonly RevisionDelta $delta,
        /** @var ?string Display name of the change's author, if known. */
        public readonly ?string $author = null,
        /**
         * ISO-8601 timestamp of the change, if known.
         *
         * Kept as a plain string so the type stays FFI-friendly and does not
         * depend on the optional `chrono` crate. DOCX fills it from the
         * `w:date` attribute (e.g. `"2024-03-15T10:30:00Z"`).
         *
         * @var ?string
         */
        public readonly ?string $timestamp = null,
        /**
         * Best-effort location of the revision within the document.
         *
         * How it is resolved depends on the format. It is null when the
         * location cannot be determined (e.g. changes inside table cells
         * before table-cell anchor support is added).
         *
         * @var ?RevisionAnchor
         */
        public readonly ?RevisionAnchor $anchor = null,
    ) {}
}

/**
 * Content changes belonging to one revision.
 *
 * For insertions and deletions, `$content` holds the added/removed lines as
 * `DiffLine` entries (added / removed variants). For format changes
 * `$content` is empty; the property-level diff is left as a TODO for a later
 * enrichment pass.
 */
final class RevisionDelta
{
    public function __construct(
        /** @var array<DiffLine> Line-level content changes of the revision. */
        public readonly array $content,
        /** @var array<CellChange> Cell-level table changes of the revision. */
        public readonly array $tableChanges,
    ) {}
}

/**
 * Structure of an extracted table.
 *
 * Describes a table detected in and extracted from a document (PDF, image,
 * etc.). The table is exposed both as structured cell data and as Markdown.
 */
final class Table
{
    public function __construct(
        /** @var array<array<string>> Cells as a two-dimensional array (rows × columns). */
        public readonly array $cells,
        /** @var string The table rendered as Markdown. */
        public readonly string $markdown,
        /** @var int 1-indexed number of the page the table was found on. */
        public readonly int $pageNumber,
        /**
         * Bounding box of the table on its page, in PDF coordinates
         * (x0=left, y0=bottom, x1=right, y1=top).
         *
         * Set only for tables extracted from PDFs when position data is available.
         *
         * @var ?BoundingBox
         */
        public readonly ?BoundingBox $boundingBox = null,
    ) {}
}

/**
 * A single table cell: its content plus optional styling information.
 *
 * Serves as a future extension point for rich tables with cell-level metadata.
 */
final class TableCell
{
    public function __construct(
        /** @var string Textual content of the cell. */
        public readonly string $content,
        /** @var int Number of rows the cell spans. */
        public readonly int $rowSpan,
        /** @var int Number of columns the cell spans. */
        public readonly int $colSpan,
        /** @var bool True when the cell is a header cell. */
        public readonly bool $isHeader,
    ) {}
}

/**
 * A URI found in a document during extraction.
 *
 * Covers any link, reference, or resource pointer. `$kind` gives the
 * semantic classification of the URI; `$label` holds optional
 * human-readable display text.
 */
final class ExtractedUri
{
    public function __construct(
        /** @var string URL or path string. */
        public readonly string $url,
        /** @var UriKind Semantic classification of this URI. */
        public readonly UriKind $kind,
        /** @var ?string Display text / label of the link, if any. */
        public readonly ?string $label = null,
        /** @var ?int 1-indexed page number where the URI was found, if known. */
        public readonly ?int $page = null,
    ) {}
}

/**
 * Result of a MIME type detection.
 */
final class DetectResponse
{
    public function __construct(
        /** @var string The MIME type that was detected. */
        public readonly string $mimeType,
        /** @var ?string The original filename, when one was provided. */
        public readonly ?string $filename = null,
    ) {}
}

/**
 * Settings that control the comparison of two `ExtractionResult` values.
 */
final class DiffOptions
{
    public function __construct(
        /** @var bool Whether metadata changes are part of the diff. Default: `true`. */
        public readonly bool $includeMetadata,
        /** @var bool Whether embedded-children changes are part of the diff. Default: `true`. */
        public readonly bool $includeEmbedded,
        /**
         * Character limit applied to the content before diffing.
         *
         * Handy for very large documents where only the first N characters
         * matter. Null disables truncation.
         *
         * @var ?int
         */
        public readonly ?int $maxContentChars = null,
    ) {}

    /**
     * Default options. Implemented by the native extension; this stub always throws.
     */
    public static function default(): DiffOptions
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Full diff between two `ExtractionResult` values (`a` and `b`).
 */
final class ExtractionDiff
{
    public function __construct(
        /**
         * Unified-diff hunks of the `content` field.
         *
         * The array is empty if both contents are identical.
         *
         * @var array<DiffHunk>
         */
        public readonly array $contentDiff,
        /** @var array<Table> Tables only in `b`: excess right-side tables, compared by index position. */
        public readonly array $tablesAdded,
        /** @var array<Table> Tables only in `a`: excess left-side tables, compared by index position. */
        public readonly array $tablesRemoved,
        /** @var array<TableDiff> Cell-level changes for table pairs sharing the same index and dimensions. */
        public readonly array $tablesChanged,
        /**
         * Metadata difference as a JSON object with three top-level keys:
         * `added` (keys in `b` but not `a`), `removed` (keys in `a` but not
         * `b`) and `changed` (keys whose values differ; every entry has the
         * shape `{ "from": <value-in-a>, "to": <value-in-b> }`).
         *
         * This is deliberately NOT RFC 6902 JSON Patch: a flatter shape was
         * chosen to avoid depending on a json-patch crate. For RFC 6902
         * semantics (with JSON Pointer paths), pass `a.metadata` and
         * `b.metadata` to a json-patch implementation of your choice.
         *
         * @var string
         */
        public readonly string $metadataChanged,
        /** @var EmbeddedChanges Changes affecting embedded archive children. */
        public readonly EmbeddedChanges $embeddedChanges,
    ) {}
}

/**
 * One contiguous hunk of a unified diff.
 */
final class DiffHunk
{
    public function __construct(
        /** @var int First line of the hunk in the old content (0-indexed). */
        public readonly int $fromLine,
        /** @var int How many old-content lines the hunk covers. */
        public readonly int $fromCount,
        /** @var int First line of the hunk in the new content (0-indexed). */
        public readonly int $toLine,
        /** @var int How many new-content lines the hunk covers. */
        public readonly int $toCount,
        /** @var array<DiffLine> The lines forming the hunk. */
        public readonly array $lines,
    ) {}
}

/**
 * Cell-level changes between two tables that share the same index.
 */
final class TableDiff
{
    public function __construct(
        /** @var int Index of the table (zero-based) in both `a.tables` and `b.tables`. */
        public readonly int $fromIndex,
        /** @var int Index in `b.tables` (zero-based); equals `$fromIndex` for same-dimension tables. */
        public readonly int $toIndex,
        /** @var array<CellChange> The changed cells of the table. */
        public readonly array $cellChanges,
    ) {}
}

/**
 * Differences in embedded archive children between two results (`a` and `b`).
 */
final class EmbeddedChanges
{
    public function __construct(
        /** @var array<ArchiveEntry> Children found in `b` only (entries are matched by `path`). */
        public readonly array $added,
        /** @var array<ArchiveEntry> Children found in `a` only (entries are matched by `path`). */
        public readonly array $removed,
        /**
         * Children found in both results (matched by `path`) whose content differs.
         *
         * Every entry carries the diff of the nested `ExtractionResult`.
         *
         * @var array<EmbeddedDiff>
         */
        public readonly array $changed,
    ) {}
}

/**
 * Diff of one embedded archive entry that is present in both results.
 */
final class EmbeddedDiff
{
    public function __construct(
        /** @var string Path of the entry, relative to the archive; identifies the entry. */
        public readonly string $path,
        /** @var ExtractionDiff Recursive diff of the entry's extraction result. */
        public readonly ExtractionDiff $diff,
    ) {}
}

/**
 * A ready-made configuration for a common RAG use case.
 *
 * A preset bundles chunk size, overlap, and embedding model into a
 * configuration tuned for a particular scenario.
 *
 * On the native side all string fields are owned strings for FFI
 * compatibility, so instances can be cloned and passed across language
 * boundaries safely.
 */
final class EmbeddingPreset
{
    public function __construct(
        /** @var string Name of the preset. */
        public readonly string $name,
        /** @var int Chunk size of the preset. NOTE(review): unit (characters vs. tokens) is not stated here — confirm. */
        public readonly int $chunkSize,
        /** @var int Chunk overlap of the preset (presumably the same unit as `$chunkSize` — confirm). */
        public readonly int $overlap,
        /** @var string Name of the model's HuggingFace repository. */
        public readonly string $modelRepo,
        /** @var string Pooling strategy, either "cls" or "mean". */
        public readonly string $pooling,
        /** @var string Location of the ONNX model file inside the repository. */
        public readonly string $modelFile,
        /** @var int Presumably the embedding vector dimensionality — confirm against the native definition. */
        public readonly int $dimensions,
        /** @var string Description of the preset. */
        public readonly string $description,
    ) {}
}

/**
 * Tuning parameters specific to YAKE.
 */
final class YakeParams
{
    public function __construct(
        /**
         * Size of the co-occurrence window (default: 2).
         *
         * Determines the context window used when computing co-occurrence statistics.
         *
         * @var int
         */
        public readonly int $windowSize,
    ) {}

    /**
     * Default parameters. Implemented by the native extension; this stub always throws.
     */
    public static function default(): YakeParams
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Tuning parameters specific to RAKE.
 */
final class RakeParams
{
    public function __construct(
        /** @var int Shortest word length that is considered (default: 1). */
        public readonly int $minWordLength,
        /** @var int Upper bound on the number of words in a keyword phrase (default: 3). */
        public readonly int $maxWordsPerPhrase,
    ) {}

    /**
     * Default parameters. Implemented by the native extension; this stub always throws.
     */
    public static function default(): RakeParams
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Configuration of keyword extraction.
 */
final class KeywordConfig
{
    public function __construct(
        /** @var KeywordAlgorithm Extraction algorithm to use. */
        public readonly KeywordAlgorithm $algorithm,
        /** @var int Upper limit on the number of extracted keywords (default: 10). */
        public readonly int $maxKeywords,
        /**
         * Minimum score a keyword must reach (0.0-1.0, default: 0.0).
         *
         * Keywords scoring below the threshold are dropped. Be aware that
         * score ranges vary from algorithm to algorithm.
         *
         * @var float
         */
        public readonly float $minScore,
        /**
         * N-gram range (min, max) used for extraction.
         *
         * (1, 1) = unigrams only
         * (1, 2) = unigrams and bigrams
         * (1, 3) = unigrams, bigrams, and trigrams (default)
         *
         * @var array<int>
         */
        public readonly array $ngramRange,
        /**
         * Language code used for stopword filtering (e.g., "en", "de", "fr").
         *
         * With null, stopword filtering is skipped.
         *
         * @var ?string
         */
        public readonly ?string $language = null,
        /** @var ?YakeParams Tuning parameters specific to YAKE. */
        public readonly ?YakeParams $yakeParams = null,
        /** @var ?RakeParams Tuning parameters specific to RAKE. */
        public readonly ?RakeParams $rakeParams = null,
    ) {}

    /**
     * Default configuration. Implemented by the native extension; this stub always throws.
     */
    public static function default(): KeywordConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * A keyword produced by extraction, with accompanying metadata.
 */
final class Keyword
{
    public function __construct(
        /** @var string Text of the keyword. */
        public readonly string $text,
        /** @var float Relevance score; higher is better, and the range depends on the algorithm. */
        public readonly float $score,
        /** @var KeywordAlgorithm The algorithm that produced this keyword. */
        public readonly KeywordAlgorithm $algorithm,
        /** @var ?array<int> Character offsets at which the keyword occurs in the text, if available. */
        public readonly ?array $positions = null,
    ) {}
}

/**
 * Settings for the PaddleOCR backend.
 *
 * Covers PaddleOCR text detection and recognition with multi-language
 * support. The `with*()` methods offer builder-style configuration; they and
 * `default()` are implemented by the native extension, so the bodies in this
 * stub only throw.
 */
final class PaddleOcrConfig
{
    public function __construct(
        /** @var string Language code, e.g. "en", "ch", "jpn", "kor", "deu", "fra". */
        public readonly string $language,
        /**
         * Turn on angle classification for rotated text (default: false).
         * It can misfire on short text regions and rotate crops incorrectly before recognition.
         *
         * @var bool
         */
        public readonly bool $useAngleCls,
        /** @var bool Turn on table structure detection (default: false). */
        public readonly bool $enableTableDetection,
        /**
         * Database threshold used for text detection (default: 0.3).
         * Range 0.0-1.0; higher values demand more confident detections.
         *
         * @var float
         */
        public readonly float $detDbThresh,
        /**
         * Box threshold used when refining text bounding boxes (default: 0.5).
         * Range 0.0-1.0.
         *
         * @var float
         */
        public readonly float $detDbBoxThresh,
        /**
         * Unclip ratio used to expand text bounding boxes (default: 1.6).
         * Governs how far detected text regions are expanded.
         *
         * @var float
         */
        public readonly float $detDbUnclipRatio,
        /**
         * Maximum side length of the detection image (default: 960).
         * Larger images may be resized down to this limit for faster inference.
         *
         * @var int
         */
        public readonly int $detLimitSideLen,
        /**
         * Batch size of recognition inference (default: 6).
         * The number of text regions processed simultaneously.
         *
         * @var int
         */
        public readonly int $recBatchNum,
        /**
         * Pixels of padding added around the image before detection (default: 10).
         * Large values can pull in surrounding content such as table gridlines.
         *
         * @var int
         */
        public readonly int $padding,
        /**
         * Minimum recognition confidence for text lines (default: 0.5).
         * Text regions recognized with lower confidence are discarded.
         * Corresponds to the `drop_score` parameter of PaddleOCR Python.
         * Range 0.0-1.0.
         *
         * @var float
         */
        public readonly float $dropScore,
        /**
         * Model tier: trades detection/recognition model size against accuracy.
         * - `"mobile"` (default): lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
         * - `"server"`: large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
         *
         * @var string
         */
        public readonly string $modelTier,
        /** @var ?string Custom cache directory for model files, if any. */
        public readonly ?string $cacheDir = null,
    ) {}

    /** Builder-style setter for the model cache directory (native; stub throws). */
    public function withCacheDir(string $path): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style setter for table structure detection (native; stub throws). */
    public function withTableDetection(bool $enable): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style setter for angle classification (native; stub throws). */
    public function withAngleCls(bool $enable): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style setter for the detection database threshold (native; stub throws). */
    public function withDetDbThresh(float $threshold): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style setter for the bounding-box threshold (native; stub throws). */
    public function withDetDbBoxThresh(float $threshold): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style setter for the unclip ratio (native; stub throws). */
    public function withDetDbUnclipRatio(float $ratio): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style setter for the detection image side-length limit (native; stub throws). */
    public function withDetLimitSideLen(int $length): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style setter for the recognition batch size (native; stub throws). */
    public function withRecBatchNum(int $batch_size): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style setter for the minimum recognition score (native; stub throws). */
    public function withDropScore(float $score): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style setter for the detection padding (native; stub throws). */
    public function withPadding(int $padding): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style setter for the model tier (native; stub throws). */
    public function withModelTier(string $tier): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Default configuration (native; stub throws). */
    public static function default(): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}

/**
 * Paths of every model required for OCR, bundled together (kept for backward compatibility).
 */
final class ModelPaths
{
    public function __construct(
        /** @var string Directory of the detection model. */
        public readonly string $detModel,
        /** @var string Directory of the classification model. */
        public readonly string $clsModel,
        /** @var string Directory of the recognition model. */
        public readonly string $recModel,
        /** @var string File path of the character dictionary. */
        public readonly string $dictFile,
    ) {}
}

/**
 * Outcome of document orientation detection.
 */
final class OrientationResult
{
    public function __construct(
        /** @var int Orientation that was detected, in degrees: 0, 90, 180, or 270. */
        public readonly int $degrees,
        /** @var float Confidence of the detection (0.0-1.0). */
        public readonly float $confidence,
    ) {}
}

/**
 * Bounding box in original image coordinates: (x1, y1) is the top-left
 * corner and (x2, y2) the bottom-right corner.
 */
final class BBox
{
    public function __construct(
        /** @var float X coordinate of the top-left corner. */
        public readonly float $x1,
        /** @var float Y coordinate of the top-left corner. */
        public readonly float $y1,
        /** @var float X coordinate of the bottom-right corner. */
        public readonly float $x2,
        /** @var float Y coordinate of the bottom-right corner. */
        public readonly float $y2,
    ) {}
}

/**
 * One result produced by layout detection.
 */
final class LayoutDetection
{
    public function __construct(
        /** @var LayoutClass Layout class assigned to this detection. */
        public readonly LayoutClass $className,
        /** @var float Confidence of this detection (range not stated in this stub). */
        public readonly float $confidence,
        /** @var BBox Bounding box of this detection. */
        public readonly BBox $bbox,
    ) {}
}

/**
 * Table markdown pre-computed for one table detection region.
 *
 * Created by the TATR-based table structure recognizer and exposed through
 * layout-aware OCR results. On the native side the type lives under the
 * pure-Rust `layout-types` feature, so consumers that do not enable
 * `layout-detection` (ORT) can still refer to it in their own code.
 */
final class RecognizedTable
{
    public function __construct(
        /** @var BBox Bounding box of the detection this table belongs to; used for matching. */
        public readonly BBox $detectionBbox,
        /** @var array<array<string>> Cells as a two-dimensional array (rows × columns). */
        public readonly array $cells,
        /** @var string The table rendered as markdown. */
        public readonly string $markdown,
    ) {}
}

/**
 * Detection result for one page: every detection plus the page metadata.
 */
final class DetectionResult
{
    public function __construct(
        /** @var int Width of the page (unit not stated in this stub). */
        public readonly int $pageWidth,
        /** @var int Height of the page (unit not stated in this stub). */
        public readonly int $pageHeight,
        /** @var array<LayoutDetection> All detections of the page. */
        public readonly array $detections,
    ) {}
}

/**
 * Descriptor of an embedded file taken from the PDF name tree.
 */
final class EmbeddedFile
{
    public function __construct(
        /** @var string Filename exactly as the PDF name tree stores it. */
        public readonly string $name,
        /** @var string Raw bytes of the file from the embedded stream (already decompressed by lopdf). */
        public readonly string $data,
        /**
         * Size in bytes of the original stream while still compressed.
         *
         * Lets callers compute the decompression ratio and spot
         * zip-bomb-style attacks, where a tiny compressed stream expands to
         * gigabytes of data.
         *
         * @var int
         */
        public readonly int $compressedSize,
        /** @var ?string MIME type from the filespec; null when the filespec does not specify one. */
        public readonly ?string $mimeType = null,
    ) {}
}

/**
 * Metadata that only PDF documents have.
 *
 * Holds the PDF-specific fields that the common `Metadata` structure lacks.
 * Shared fields such as title, authors, keywords, and dates live on
 * `Metadata` itself.
 */
final class PdfMetadata
{
    public function __construct(
        /** @var ?string Version of the PDF format, e.g. "1.7" or "2.0". */
        public readonly ?string $pdfVersion = null,
        /** @var ?string Producer of the PDF, i.e. the application that created it. */
        public readonly ?string $producer = null,
        /** @var ?bool True when the PDF is encrypted/password-protected. */
        public readonly ?bool $isEncrypted = null,
        /** @var ?int Width of the first page in points (1/72 inch). */
        public readonly ?int $width = null,
        /** @var ?int Height of the first page in points (1/72 inch). */
        public readonly ?int $height = null,
        /** @var ?int Number of pages in the whole PDF document. */
        public readonly ?int $pageCount = null,
    ) {}
}

/**
 * Execution provider choices for ONNX inference.
 *
 * String-backed enum; every case is backed by a string equal to its case name.
 */
enum ExecutionProviderType: string
{
    case Auto = 'Auto';
    case Cpu = 'Cpu';
    case CoreMl = 'CoreMl';
    case Cuda = 'Cuda';
    case TensorRt = 'TensorRt';
}

/**
 * Available output formats.
 *
 * String-backed enum; every case is backed by a string equal to its case name.
 */
enum OutputFormat: string
{
    case Plain = 'Plain';
    case Markdown = 'Markdown';
    case Djot = 'Djot';
    case Html = 'Html';
    case Json = 'Json';
    case Structured = 'Structured';
    case Custom = 'Custom';
}

/**
 * Available HTML themes.
 *
 * String-backed enum; every case is backed by a string equal to its case name.
 */
enum HtmlTheme: string
{
    case Default = 'Default';
    case GitHub = 'GitHub';
    case Dark = 'Dark';
    case Light = 'Light';
    case Unstyled = 'Unstyled';
}

/**
 * Table model identifiers, including a `Disabled` case.
 *
 * String-backed enum; backing values are the snake_case forms of the case names.
 * NOTE(review): presumably selects the table structure recognition model — confirm against the native definition.
 */
enum TableModel: string
{
    case Tatr = 'tatr';
    case SlanetWired = 'slanet_wired';
    case SlanetWireless = 'slanet_wireless';
    case SlanetPlus = 'slanet_plus';
    case SlanetAuto = 'slanet_auto';
    case Disabled = 'disabled';
}

/**
 * Available chunker types.
 *
 * String-backed enum; every case is backed by a string equal to its case name.
 */
enum ChunkerType: string
{
    case Text = 'Text';
    case Markdown = 'Markdown';
    case Yaml = 'Yaml';
    case Semantic = 'Semantic';
}

/**
 * How chunk size is measured.
 *
 * Defaults to `Characters` (Unicode character count). When using token-based sizing,
 * chunks are sized by token count according to the specified tokenizer.
 *
 * Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
 * available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
 * (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 *
 * This stub declares no members; the type is only nameable from PHP here.
 */
final class ChunkSizing
{
}

/**
 * Embedding model types supported by Kreuzberg.
 *
 * This stub declares no members; the type is only nameable from PHP here.
 */
final class EmbeddingModelType
{
}

/**
 * Available code content modes.
 *
 * String-backed enum; backing values are the lowercase case names.
 */
enum CodeContentMode: string
{
    case Chunks = 'chunks';
    case Raw = 'raw';
    case Structure = 'structure';
}

/**
 * Available list types.
 *
 * String-backed enum; every case is backed by a string equal to its case name.
 */
enum ListType: string
{
    case Bullet = 'Bullet';
    case Numbered = 'Numbered';
    case Lettered = 'Lettered';
    case Indented = 'Indented';
}

/**
 * Available OCR backend types.
 *
 * String-backed enum; every case is backed by a string equal to its case name.
 */
enum OcrBackendType: string
{
    case Tesseract = 'Tesseract';
    case EasyOCR = 'EasyOCR';
    case PaddleOCR = 'PaddleOCR';
    case Custom = 'Custom';
}

/**
 * Available processing stages.
 *
 * String-backed enum; every case is backed by a string equal to its case name.
 */
enum ProcessingStage: string
{
    case Early = 'Early';
    case Middle = 'Middle';
    case Late = 'Late';
}

/**
 * Available reduction levels.
 *
 * String-backed enum; every case is backed by a string equal to its case name.
 */
enum ReductionLevel: string
{
    case Off = 'Off';
    case Light = 'Light';
    case Moderate = 'Moderate';
    case Aggressive = 'Aggressive';
    case Maximum = 'Maximum';
}

/**
 * Available PDF annotation types.
 *
 * String-backed enum; backing values are the snake_case forms of the case names.
 */
enum PdfAnnotationType: string
{
    case Text = 'text';
    case Highlight = 'highlight';
    case Link = 'link';
    case Stamp = 'stamp';
    case Underline = 'underline';
    case StrikeOut = 'strike_out';
    case Other = 'other';
}

/**
 * Available block types.
 *
 * String-backed enum; backing values are the snake_case forms of the case names.
 */
enum BlockType: string
{
    case Paragraph = 'paragraph';
    case Heading = 'heading';
    case Blockquote = 'blockquote';
    case CodeBlock = 'code_block';
    case ListItem = 'list_item';
    case OrderedList = 'ordered_list';
    case BulletList = 'bullet_list';
    case TaskList = 'task_list';
    case DefinitionList = 'definition_list';
    case DefinitionTerm = 'definition_term';
    case DefinitionDescription = 'definition_description';
    case Div = 'div';
    case Section = 'section';
    case ThematicBreak = 'thematic_break';
    case RawBlock = 'raw_block';
    case MathDisplay = 'math_display';
}

/**
 * Available inline types.
 *
 * String-backed enum; backing values are the snake_case forms of the case names.
 */
enum InlineType: string
{
    case Text = 'text';
    case Strong = 'strong';
    case Emphasis = 'emphasis';
    case Highlight = 'highlight';
    case Subscript = 'subscript';
    case Superscript = 'superscript';
    case Insert = 'insert';
    case Delete = 'delete';
    case Code = 'code';
    case Link = 'link';
    case Image = 'image';
    case Span = 'span';
    case Math = 'math';
    case RawInline = 'raw_inline';
    case FootnoteRef = 'footnote_ref';
    case Symbol = 'symbol';
}

/**
 * Available relationship kinds.
 *
 * String-backed enum; backing values are the snake_case forms of the case names.
 */
enum RelationshipKind: string
{
    case FootnoteReference = 'footnote_reference';
    case CitationReference = 'citation_reference';
    case InternalLink = 'internal_link';
    case Caption = 'caption';
    case Label = 'label';
    case TocEntry = 'toc_entry';
    case CrossReference = 'cross_reference';
}

/**
 * Available content layers.
 *
 * String-backed enum; backing values are the lowercase case names.
 */
enum ContentLayer: string
{
    case Body = 'body';
    case Header = 'header';
    case Footer = 'footer';
    case Footnote = 'footnote';
}

/**
 * Tagged enum for node content. Each variant carries only type-specific data.
 *
 * On the native side it uses `#[serde(tag = "node_type")]` to avoid a "type"
 * keyword collision in Go/Java/TypeScript bindings.
 *
 * This stub declares no members; the type is only nameable from PHP here.
 */
final class NodeContent
{
}

/**
 * Types of inline text annotations.
 *
 * This stub declares no members; the type is only nameable from PHP here.
 */
final class AnnotationKind
{
}

/**
 * Available extraction methods.
 *
 * String-backed enum; backing values are the lowercase case names.
 */
enum ExtractionMethod: string
{
    case Native = 'native';
    case Ocr = 'ocr';
    case Mixed = 'mixed';
}

/**
 * Available chunk types, with `Unknown` as one of the cases.
 *
 * String-backed enum; backing values are the snake_case forms of the case names.
 */
enum ChunkType: string
{
    case Heading = 'heading';
    case PartyList = 'party_list';
    case Definitions = 'definitions';
    case OperativeClause = 'operative_clause';
    case SignatureBlock = 'signature_block';
    case Schedule = 'schedule';
    case TableLike = 'table_like';
    case Formula = 'formula';
    case CodeBlock = 'code_block';
    case Image = 'image';
    case OrgChart = 'org_chart';
    case Diagram = 'diagram';
    case Unknown = 'unknown';
}

/**
 * String-backed enum stub for the `ImageKind` provided by the native extension.
 *
 * Backing values are the snake_case form of the case names
 * (for example `TileFragment` is backed by 'tile_fragment').
 * NOTE(review): `Unknown` is presumably the fallback classification — confirm
 * against the extension before relying on it.
 */
enum ImageKind: string
{
    case Photograph = 'photograph';
    case Diagram = 'diagram';
    case Chart = 'chart';
    case Drawing = 'drawing';
    case TextBlock = 'text_block';
    case Decoration = 'decoration';
    case Logo = 'logo';
    case Icon = 'icon';
    case TileFragment = 'tile_fragment';
    case Mask = 'mask';
    case PageRaster = 'page_raster';
    case Unknown = 'unknown';
}

/**
 * String-backed enum stub for the `ResultFormat` provided by the native extension.
 *
 * Backing values are the snake_case form of the case names
 * (`ElementBased` is backed by 'element_based').
 */
enum ResultFormat: string
{
    case Unified = 'unified';
    case ElementBased = 'element_based';
}

/**
 * String-backed enum stub for the `ElementType` provided by the native extension.
 *
 * Backing values are the snake_case form of the case names
 * (for example `NarrativeText` is backed by 'narrative_text').
 */
enum ElementType: string
{
    case Title = 'title';
    case NarrativeText = 'narrative_text';
    case Heading = 'heading';
    case ListItem = 'list_item';
    case Table = 'table';
    case Image = 'image';
    case PageBreak = 'page_break';
    case CodeBlock = 'code_block';
    case BlockQuote = 'block_quote';
    case Footer = 'footer';
    case Header = 'header';
}

/**
 * Format-specific metadata (discriminated union).
 *
 * Only one format type can exist per extraction result. This provides
 * type-safe, clean metadata without nested optionals.
 *
 * Stub note: this declaration has no properties, methods, or constants, so
 * the per-format variants are not described by this file.
 */
final class FormatMetadata
{
}

/**
 * String-backed enum stub for the `TextDirection` provided by the native extension.
 *
 * The backing values are abbreviations rather than the case names:
 * `LeftToRight` is 'ltr' and `RightToLeft` is 'rtl'; `Auto` is 'auto'.
 */
enum TextDirection: string
{
    case LeftToRight = 'ltr';
    case RightToLeft = 'rtl';
    case Auto = 'auto';
}

/**
 * String-backed enum stub for the `LinkType` provided by the native extension.
 *
 * Each backing value is spelled exactly like its case name (PascalCase),
 * so string comparisons against these values are case-sensitive matches on
 * e.g. 'Anchor', not 'anchor'.
 * NOTE(review): most other enums in this file use snake_case backing values;
 * confirm that the PascalCase strings here match what the extension emits.
 */
enum LinkType: string
{
    case Anchor = 'Anchor';
    case Internal = 'Internal';
    case External = 'External';
    case Email = 'Email';
    case Phone = 'Phone';
    case Other = 'Other';
}

/**
 * String-backed enum stub for the `ImageType` provided by the native extension.
 *
 * The backing values mix two spellings: `DataUri` and `InlineSvg` are
 * kebab-case ('data-uri', 'inline-svg'), while `External` and `Relative`
 * are PascalCase and identical to their case names.
 * NOTE(review): confirm that this mixed casing matches what the extension emits.
 */
enum ImageType: string
{
    case DataUri = 'data-uri';
    case InlineSvg = 'inline-svg';
    case External = 'External';
    case Relative = 'Relative';
}

/**
 * String-backed enum stub for the `StructuredDataType` provided by the native extension.
 *
 * The three backing values each use a different spelling: 'json-ld'
 * (kebab-case), 'Microdata' (PascalCase, same as the case name) and 'rdfa'
 * (all lowercase).
 * NOTE(review): confirm that this mixed casing matches what the extension emits.
 */
enum StructuredDataType: string
{
    case JsonLd = 'json-ld';
    case Microdata = 'Microdata';
    case RDFa = 'rdfa';
}

/**
 * Bounding geometry for an OCR element.
 *
 * Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
 * (from PaddleOCR and rotated text detection).
 *
 * Stub note: this declaration has no properties, methods, or constants, so
 * neither the rectangle nor the quadrilateral shape is described by this file.
 */
final class OcrBoundingGeometry
{
}

/**
 * String-backed enum stub for the `OcrElementLevel` provided by the native extension.
 *
 * Each backing value is the lowercase form of its case name.
 */
enum OcrElementLevel: string
{
    case Word = 'word';
    case Line = 'line';
    case Block = 'block';
    case Page = 'page';
}

/**
 * String-backed enum stub for the `PageUnitType` provided by the native extension.
 *
 * Each backing value is the lowercase form of its case name.
 */
enum PageUnitType: string
{
    case Page = 'page';
    case Slide = 'slide';
    case Sheet = 'sheet';
}

/**
 * A single line in a unified-diff hunk.
 *
 * Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate::diff` re-exports this type verbatim.
 *
 * Stub note: this declaration has no properties, methods, or constants, so
 * the contents of a diff line are not described by this file.
 */
final class DiffLine
{
}

/**
 * String-backed enum stub for the `RevisionKind` provided by the native extension.
 *
 * Backing values are the snake_case form of the case names
 * (`FormatChange` is backed by 'format_change').
 */
enum RevisionKind: string
{
    case Insertion = 'insertion';
    case Deletion = 'deletion';
    case FormatChange = 'format_change';
    case Comment = 'comment';
}

/**
 * Best-effort document location for a revision.
 *
 * Stub note: this declaration has no properties, methods, or constants, so
 * how a location is represented is not described by this file.
 */
final class RevisionAnchor
{
}

/**
 * String-backed enum stub for the `UriKind` provided by the native extension.
 *
 * Each backing value is the lowercase form of its case name.
 */
enum UriKind: string
{
    case Hyperlink = 'hyperlink';
    case Image = 'image';
    case Anchor = 'anchor';
    case Citation = 'citation';
    case Reference = 'reference';
    case Email = 'email';
}

/**
 * String-backed enum stub for the `KeywordAlgorithm` provided by the native extension.
 *
 * Each backing value is spelled exactly like its case name (PascalCase):
 * 'Yake' and 'Rake'.
 */
enum KeywordAlgorithm: string
{
    case Yake = 'Yake';
    case Rake = 'Rake';
}

/**
 * String-backed enum stub for the `PSMMode` provided by the native extension.
 *
 * Each backing value is spelled exactly like its case name (PascalCase).
 * NOTE(review): presumably these are OCR page segmentation modes — confirm
 * the meaning of each case against the extension's documentation.
 */
enum PSMMode: string
{
    case OsdOnly = 'OsdOnly';
    case AutoOsd = 'AutoOsd';
    case AutoOnly = 'AutoOnly';
    case Auto = 'Auto';
    case SingleColumn = 'SingleColumn';
    case SingleBlockVertical = 'SingleBlockVertical';
    case SingleBlock = 'SingleBlock';
    case SingleLine = 'SingleLine';
    case SingleWord = 'SingleWord';
    case CircleWord = 'CircleWord';
    case SingleChar = 'SingleChar';
}

/**
 * String-backed enum stub for the `PaddleLanguage` provided by the native extension.
 *
 * Each backing value is spelled exactly like its case name (PascalCase),
 * e.g. `TraditionalChinese` is backed by 'TraditionalChinese'.
 */
enum PaddleLanguage: string
{
    case English = 'English';
    case Chinese = 'Chinese';
    case Japanese = 'Japanese';
    case Korean = 'Korean';
    case German = 'German';
    case French = 'French';
    case Latin = 'Latin';
    case Cyrillic = 'Cyrillic';
    case TraditionalChinese = 'TraditionalChinese';
    case Thai = 'Thai';
    case Greek = 'Greek';
    case EastSlavic = 'EastSlavic';
    case Arabic = 'Arabic';
    case Devanagari = 'Devanagari';
    case Tamil = 'Tamil';
    case Telugu = 'Telugu';
}

/**
 * String-backed enum stub for the `LayoutClass` provided by the native extension.
 *
 * Backing values are the snake_case form of the case names
 * (for example `KeyValueRegion` is backed by 'key_value_region').
 */
enum LayoutClass: string
{
    case Caption = 'caption';
    case Footnote = 'footnote';
    case Formula = 'formula';
    case ListItem = 'list_item';
    case PageFooter = 'page_footer';
    case PageHeader = 'page_header';
    case Picture = 'picture';
    case SectionHeader = 'section_header';
    case Table = 'table';
    case Text = 'text';
    case Title = 'title';
    case DocumentIndex = 'document_index';
    case Code = 'code';
    case CheckboxSelected = 'checkbox_selected';
    case CheckboxUnselected = 'checkbox_unselected';
    case Form = 'form';
    case KeyValueRegion = 'key_value_region';
}

/**
 * Static entry points exposed by the native Kreuzberg extension.
 *
 * This is a type stub: the real implementations are provided at runtime by
 * the compiled extension. In this file every method with a non-void return
 * type throws \RuntimeException('Not implemented.') and every void method
 * has an empty body.
 *
 * Each method is declared exactly once. PHP rejects a class that declares
 * the same method name twice ("Cannot redeclare ..."), so the six clearX()
 * methods appear only next to their listX() counterparts and are not
 * repeated after the registerX() / unregisterX() pairs.
 *
 * Parameter names (including snake_case ones such as $mime_type) are part of
 * the callable interface through named arguments and are kept as generated.
 */
class KreuzbergApi
{
    // --- Single-document extraction -------------------------------------
    public static function extractBytes(string $content, string $mime_type, \Kreuzberg\ExtractionConfig $config): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }
    public static function extractFile(string $path, ?string $mime_type = null, ?\Kreuzberg\ExtractionConfig $config = null): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }
    public static function extractFileSync(string $path, ?string $mime_type = null, ?\Kreuzberg\ExtractionConfig $config = null): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }
    public static function extractBytesSync(string $content, string $mime_type, \Kreuzberg\ExtractionConfig $config): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }

    // --- Batch extraction -----------------------------------------------
    /**
     * @param array<\Kreuzberg\BatchFileItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractFilesSync(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }
    /**
     * @param array<\Kreuzberg\BatchBytesItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractBytesSync(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }
    /**
     * @param array<\Kreuzberg\BatchFileItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractFiles(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }
    /**
     * @param array<\Kreuzberg\BatchBytesItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractBytes(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    // --- MIME helpers ---------------------------------------------------
    public static function detectMimeTypeFromBytes(string $content): string { throw new \RuntimeException('Not implemented.'); }
    /**
     * @param string $mime_type
     * @return array<string>
     */
    public static function getExtensionsForMime(string $mime_type): array { throw new \RuntimeException('Not implemented.'); }

    // --- Registry listing / clearing ------------------------------------
    // These are the only declarations of the clearX() methods in this class.
    public static function clearEmbeddingBackends(): void { }
    /**
     * @return array<string>
     */
    public static function listEmbeddingBackends(): array { throw new \RuntimeException('Not implemented.'); }
    /**
     * @return array<string>
     */
    public static function listDocumentExtractors(): array { throw new \RuntimeException('Not implemented.'); }
    public static function clearDocumentExtractors(): void { }
    /**
     * @return array<string>
     */
    public static function listOcrBackends(): array { throw new \RuntimeException('Not implemented.'); }
    public static function clearOcrBackends(): void { }
    /**
     * @return array<string>
     */
    public static function listPostProcessors(): array { throw new \RuntimeException('Not implemented.'); }
    public static function clearPostProcessors(): void { }
    /**
     * @return array<string>
     */
    public static function listRenderers(): array { throw new \RuntimeException('Not implemented.'); }
    public static function clearRenderers(): void { }
    /**
     * @return array<string>
     */
    public static function listValidators(): array { throw new \RuntimeException('Not implemented.'); }
    public static function clearValidators(): void { }

    // --- Comparison, embeddings, rendering ------------------------------
    public static function compare(\Kreuzberg\ExtractionResult $a, \Kreuzberg\ExtractionResult $b, \Kreuzberg\DiffOptions $opts): \Kreuzberg\ExtractionDiff { throw new \RuntimeException('Not implemented.'); }
    /**
     * @param array<string> $texts
     * @param \Kreuzberg\EmbeddingConfig $config
     * @return array<array<float>>
     */
    public static function embedTextsAsync(array $texts, \Kreuzberg\EmbeddingConfig $config): array { throw new \RuntimeException('Not implemented.'); }
    public static function renderPdfPageToPng(string $pdf_bytes, int $page_index, ?int $dpi = null, ?string $password = null): string { throw new \RuntimeException('Not implemented.'); }
    public static function detectMimeType(string $path, bool $check_exists): string { throw new \RuntimeException('Not implemented.'); }
    /**
     * @param array<string> $texts
     * @param \Kreuzberg\EmbeddingConfig $config
     * @return array<array<float>>
     */
    public static function embedTexts(array $texts, \Kreuzberg\EmbeddingConfig $config): array { throw new \RuntimeException('Not implemented.'); }
    public static function getEmbeddingPreset(string $name): ?\Kreuzberg\EmbeddingPreset { throw new \RuntimeException('Not implemented.'); }
    /**
     * @return array<string>
     */
    public static function listEmbeddingPresets(): array { throw new \RuntimeException('Not implemented.'); }

    // --- Plugin registration --------------------------------------------
    // The matching clearX() methods are declared once, in the registry
    // section above; redeclaring them here would be a fatal error.
    public static function registerOcrBackend(\Kreuzberg\OcrBackend $backend): void { }
    public static function unregisterOcrBackend(string $name): void { }
    public static function registerPostProcessor(\Kreuzberg\PostProcessor $backend): void { }
    public static function unregisterPostProcessor(string $name): void { }
    public static function registerValidator(\Kreuzberg\Validator $backend): void { }
    public static function unregisterValidator(string $name): void { }
    public static function registerEmbeddingBackend(\Kreuzberg\EmbeddingBackend $backend): void { }
    public static function unregisterEmbeddingBackend(string $name): void { }
    public static function registerDocumentExtractor(\Kreuzberg\DocumentExtractor $backend): void { }
    public static function unregisterDocumentExtractor(string $name): void { }
    public static function registerRenderer(\Kreuzberg\Renderer $backend): void { }
    public static function unregisterRenderer(string $name): void { }
}

} // end namespace