Files
fil/packages/php/stubs/kreuzberg_extension.php

5397 lines
199 KiB
PHP
Raw Normal View History

2026-06-01 23:40:55 +02:00
<?php
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
// Type stubs for the native PHP extension — declares classes
// provided at runtime by the compiled Rust extension (.so/.dll).
// Include this in phpstan.neon scanFiles for static analysis.
declare(strict_types=1);
namespace Kreuzberg {
/**
 * Stub declaration of the exception class provided by the native extension.
 *
 * Extends `\RuntimeException`; the single method below exists so static
 * analysers know its signature.
 */
class KreuzbergException extends \RuntimeException
{
    /**
     * Stub accessor declared to return an integer error code.
     *
     * @return int
     *
     * @throws \RuntimeException Always, in this stub file.
     */
    public function getErrorCode(): int
    {
        throw new \RuntimeException('Not implemented.');
    }
}
/**
 * Read-only value object carrying the five cache statistics fields
 * declared in its constructor.
 */
final class CacheStats
{
    public function __construct(
        /** @var int */
        public readonly int $totalFiles,
        /** @var float */
        public readonly float $totalSizeMb,
        /** @var float */
        public readonly float $availableSpaceMb,
        /** @var float */
        public readonly float $oldestFileAgeDays,
        /** @var float */
        public readonly float $newestFileAgeDays,
    ) {
    }
}
/**
 * Hardware acceleration settings for ONNX Runtime models.
 *
 * Selects the execution provider (CPU, CoreML, CUDA, TensorRT) used for
 * inference in layout detection and embedding generation.
 */
final class AccelerationConfig
{
    public function __construct(
        /** @var ExecutionProviderType Execution provider to use for ONNX inference. */
        public readonly ExecutionProviderType $provider,
        /** @var int GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto. */
        public readonly int $deviceId,
    ) {
    }
}
/**
 * Content filtering settings shared by all extractors.
 *
 * Decides whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is kept in or stripped from extraction
 * results. It applies across all extractors (PDF, DOCX, RTF, ODT, HTML,
 * etc.), each with a format-specific implementation.
 *
 * When `ExtractionConfig::$contentFilter` is `null`, each extractor keeps
 * its current default behavior unchanged.
 */
final class ContentFilterConfig
{
    public function __construct(
        /**
         * Keep running headers in the extraction output.
         *
         * - PDF: Disables top-margin furniture stripping and prevents the layout
         *   model from treating `PageHeader`-classified regions as furniture.
         * - DOCX: Includes document headers in text output.
         * - RTF/ODT: Headers are already included; this is a no-op when true.
         * - HTML/EPUB: Keeps `<header>` element content.
         *
         * Default: `false` (headers are stripped or excluded).
         *
         * @var bool
         */
        public readonly bool $includeHeaders,
        /**
         * Keep running footers in the extraction output.
         *
         * - PDF: Disables bottom-margin furniture stripping and prevents the layout
         *   model from treating `PageFooter`-classified regions as furniture.
         * - DOCX: Includes document footers in text output.
         * - RTF/ODT: Footers are already included; this is a no-op when true.
         * - HTML/EPUB: Keeps `<footer>` element content.
         *
         * Default: `false` (footers are stripped or excluded).
         *
         * @var bool
         */
        public readonly bool $includeFooters,
        /**
         * Turn on the heuristic cross-page repeating text detector.
         *
         * When `true` (default), text that repeats verbatim across a supermajority
         * of pages is classified as furniture and stripped. Disable this if brand
         * names or repeated headings are being incorrectly removed by the heuristic.
         *
         * Note: when a layout-detection model is active, the model may independently
         * classify page-header / page-footer regions as furniture on a per-page basis.
         * To preserve those regions, also set `includeHeaders`, `includeFooters`,
         * or both, to `true` in addition to disabling this flag.
         *
         * Primarily affects PDF extraction.
         *
         * Default: `true`.
         *
         * @var bool
         */
        public readonly bool $stripRepeatingText,
        /**
         * Keep watermark text in the extraction output.
         *
         * - PDF: Keeps watermark artifacts and arXiv identifiers.
         * - Other formats: No effect currently.
         *
         * Default: `false` (watermarks are stripped).
         *
         * @var bool
         */
        public readonly bool $includeWatermarks,
    ) {
    }

    /**
     * Stub of the native static factory.
     *
     * @return ContentFilterConfig
     *
     * @throws \RuntimeException Always, in this stub file; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): ContentFilterConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Settings for email extraction.
 */
final class EmailConfig
{
    public function __construct(
        /**
         * Windows codepage number used when an MSG file carries no codepage property.
         * Defaults to `null`, which falls back to windows-1252.
         *
         * If an unrecognized or invalid codepage number is supplied (including 0),
         * the behavior silently falls back to windows-1252, the same as when the
         * MSG file itself contains an unrecognized codepage. No error or warning is
         * emitted, so callers should verify output when supplying unusual values.
         *
         * Common values:
         * - 1250: Central European (Polish, Czech, Hungarian, etc.)
         * - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
         * - 1252: Western European (default)
         * - 1253: Greek
         * - 1254: Turkish
         * - 1255: Hebrew
         * - 1256: Arabic
         * - 932: Japanese (Shift-JIS)
         * - 936: Simplified Chinese (GBK)
         *
         * @var ?int
         */
        public readonly ?int $msgFallbackCodepage = null,
    ) {
    }
}
/**
 * Main extraction configuration.
 *
 * Holds every configuration option for the extraction process. According to
 * the upstream documentation it can be loaded from TOML, YAML, or JSON files,
 * or created programmatically.
 */
final class ExtractionConfig
{
    public function __construct(
        /** @var bool Enable caching of extraction results */
        public readonly bool $useCache,
        /** @var bool Enable quality post-processing */
        public readonly bool $enableQualityProcessing,
        /** @var bool Force OCR even for searchable PDFs */
        public readonly bool $forceOcr,
        /**
         * Disable OCR entirely, even for images.
         *
         * When `true`, OCR is skipped for all document types. Images return metadata
         * only (dimensions, format, EXIF) without text extraction. PDFs use only
         * native text extraction without OCR fallback.
         *
         * Cannot be `true` simultaneously with `forceOcr`.
         *
         * *Added in v4.7.0.*
         *
         * @var bool
         */
        public readonly bool $disableOcr,
        /**
         * Result structure format.
         *
         * Controls whether results are returned in unified format (default) with all
         * content in the `content` field, or element-based format with semantic
         * elements (for Unstructured-compatible output).
         *
         * @var ResultFormat
         */
        public readonly ResultFormat $resultFormat,
        /**
         * Content text format (default: Plain).
         *
         * Controls the format of the extracted content:
         * - `Plain`: Raw extracted text (default)
         * - `Markdown`: Markdown formatted output
         * - `Djot`: Djot markup format (requires djot feature)
         * - `Html`: HTML formatted output
         *
         * When set to a structured format, extraction results will include
         * formatted output. The `formatted_content` field may be populated
         * when format conversion is applied.
         *
         * @var OutputFormat
         */
        public readonly OutputFormat $outputFormat,
        /**
         * Run layout detection on the non-OCR PDF markdown path.
         *
         * When `true` and `layout` is set (non-null), layout regions inform heading,
         * table, list, and figure detection in the structure pipeline that would
         * otherwise rely on font-clustering heuristics alone. Significantly
         * improves SF1 (structural F1) at the cost of inference latency
         * (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
         * Requires the `layout-detection` feature.
         *
         * @var bool
         */
        public readonly bool $useLayoutForMarkdown,
        /**
         * Enable structured document tree output.
         *
         * When true, populates the `document` field on `ExtractionResult` with a
         * hierarchical `DocumentStructure` containing heading-driven section nesting,
         * table grids, content layer classification, and inline annotations.
         *
         * Independent of `resultFormat` — can be combined with Unified or ElementBased.
         *
         * @var bool
         */
        public readonly bool $includeDocumentStructure,
        /**
         * Maximum recursion depth for archive extraction (default: 3).
         * Set to 0 to disable recursive extraction (legacy behavior).
         *
         * @var int
         */
        public readonly int $maxArchiveDepth,
        /** @var ?OcrConfig OCR configuration (`null` = OCR disabled) */
        public readonly ?OcrConfig $ocr = null,
        /**
         * Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
         *
         * When set, only the listed pages are OCR'd regardless of text layer quality.
         * Unlisted pages use native text extraction. Ignored when `forceOcr` is `true`.
         * Only applies to PDF documents. Duplicates are automatically deduplicated.
         * An `ocr` config is recommended for backend/language selection; defaults are
         * used if absent.
         *
         * @var ?array<int>
         */
        public readonly ?array $forceOcrPages = null,
        /** @var ?ChunkingConfig Text chunking configuration (`null` = chunking disabled) */
        public readonly ?ChunkingConfig $chunking = null,
        /**
         * Content filtering configuration (`null` = use extractor defaults).
         *
         * Controls whether document "furniture" (headers, footers, watermarks,
         * repeating text) is included in or stripped from extraction results.
         * See `ContentFilterConfig` for per-field documentation.
         *
         * @var ?ContentFilterConfig
         */
        public readonly ?ContentFilterConfig $contentFilter = null,
        /** @var ?ImageExtractionConfig Image extraction configuration (`null` = no image extraction) */
        public readonly ?ImageExtractionConfig $images = null,
        /** @var ?PdfConfig PDF-specific options (`null` = use defaults) */
        public readonly ?PdfConfig $pdfOptions = null,
        /** @var ?TokenReductionOptions Token reduction configuration (`null` = no token reduction) */
        public readonly ?TokenReductionOptions $tokenReduction = null,
        /** @var ?LanguageDetectionConfig Language detection configuration (`null` = no language detection) */
        public readonly ?LanguageDetectionConfig $languageDetection = null,
        /** @var ?PageConfig Page extraction configuration (`null` = no page tracking) */
        public readonly ?PageConfig $pages = null,
        /** @var ?KeywordConfig Keyword extraction configuration (`null` = no keyword extraction) */
        public readonly ?KeywordConfig $keywords = null,
        /** @var ?PostProcessorConfig Post-processor configuration (`null` = use defaults) */
        public readonly ?PostProcessorConfig $postprocessor = null,
        /**
         * HTML to Markdown conversion options (`null` = use defaults).
         *
         * Configure how HTML documents are converted to Markdown, including heading
         * styles, list formatting, code block styles, and preprocessing options.
         *
         * NOTE(review): this stub types the options as a plain string; the expected
         * encoding of that string is not visible here — confirm against the extension.
         *
         * @var ?string
         */
        public readonly ?string $htmlOptions = null,
        /**
         * Styled HTML output configuration.
         *
         * When set alongside `outputFormat = OutputFormat::Html`, the extraction
         * pipeline uses `StyledHtmlRenderer` (Rust: `crate::rendering::StyledHtmlRenderer`),
         * which emits stable `kb-*` CSS class hooks on every structural element
         * and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
         *
         * When `null`, the existing plain comrak-based HTML renderer is used.
         *
         * @var ?HtmlOutputConfig
         */
        public readonly ?HtmlOutputConfig $htmlOutput = null,
        /**
         * Default per-file timeout in seconds for batch extraction.
         *
         * When set, each file in a batch will be canceled after this duration
         * unless overridden by `FileExtractionConfig::$timeoutSecs`.
         *
         * The core library documents a default of 60 seconds (`Some(60)`) to prevent
         * pathological files (e.g. deeply nested archives, documents with millions
         * of cells) from running indefinitely and exhausting caller resources, and
         * says to set the value to `None` (`null`) to disable the timeout for
         * trusted input or long-running workloads.
         *
         * NOTE(review): the constructor default in this stub is `null`, not 60 —
         * confirm which value applies when the argument is omitted.
         *
         * @var ?int
         */
        public readonly ?int $extractionTimeoutSecs = null,
        /**
         * Maximum concurrent extractions in batch operations
         * (`null` = ceil(num_cpus × 1.5)).
         *
         * Limits parallelism to prevent resource exhaustion when processing
         * large batches. Defaults to ceil(num_cpus × 1.5) when not set.
         *
         * @var ?int
         */
        public readonly ?int $maxConcurrentExtractions = null,
        /**
         * Security limits for archive extraction.
         *
         * Controls maximum archive size, compression ratio, file count, and other
         * security thresholds to prevent decompression bomb attacks. Also caps
         * nesting depth, iteration count, entity / token length, total
         * content size, and table cell count for every extraction path that
         * ingests user-controlled bytes.
         * When `null`, default limits are used.
         *
         * @var ?SecurityLimits
         */
        public readonly ?SecurityLimits $securityLimits = null,
        /**
         * Maximum uncompressed size in bytes for a single embedded file before
         * recursive extraction is attempted (documented default: 50 MiB).
         *
         * Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
         * to email attachments processed via recursive extraction. Files that
         * exceed this limit are skipped with a `ProcessingWarning` rather than
         * passed to the extraction pipeline, preventing a single oversized
         * embedded object from consuming unbounded memory or time.
         *
         * The core library documents `None` (`null`) as disabling the
         * per-embedded-file cap, leaving `security_limits.max_archive_size` as the
         * only guard.
         *
         * NOTE(review): the constructor default in this stub is `null`, not
         * 50 MiB — confirm which value applies when the argument is omitted.
         *
         * @var ?int
         */
        public readonly ?int $maxEmbeddedFileBytes = null,
        /**
         * Layout detection configuration (`null` = layout detection disabled).
         *
         * When set, PDF pages and images are analyzed for document structure
         * (headings, code, formulas, tables, figures, etc.) using RT-DETR models
         * via ONNX Runtime. For PDFs, layout hints override paragraph classification
         * in the markdown pipeline. For images, per-region OCR is performed with
         * markdown formatting based on detected layout classes.
         * Requires the `layout-detection` feature to run inference; the field is
         * present whenever the `layout-types` feature is active (which includes
         * `layout-detection` as well as the no-ORT target groups).
         *
         * @var ?LayoutDetectionConfig
         */
        public readonly ?LayoutDetectionConfig $layout = null,
        /**
         * Hardware acceleration configuration for ONNX Runtime models.
         *
         * Controls execution provider selection for layout detection and embedding
         * models. When `null`, uses platform defaults (CoreML on macOS, CUDA on
         * Linux, CPU on Windows).
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Cache namespace for tenant isolation.
         *
         * When set, cache entries are stored under `{cache_dir}/{namespace}/`.
         * Must be alphanumeric, hyphens, or underscores only (max 64 chars).
         * Different namespaces have isolated cache spaces on the same filesystem.
         *
         * @var ?string
         */
        public readonly ?string $cacheNamespace = null,
        /**
         * Per-request cache TTL in seconds.
         *
         * Overrides the global `max_age_days` for this specific extraction.
         * When `0`, caching is completely skipped (no read or write).
         * When `null`, the global TTL applies.
         *
         * @var ?int
         */
        public readonly ?int $cacheTtlSecs = null,
        /**
         * Email extraction configuration (`null` = use defaults).
         *
         * Currently supports configuring the fallback codepage for MSG files
         * that do not specify one. See `EmailConfig` for details.
         *
         * @var ?EmailConfig
         */
        public readonly ?EmailConfig $email = null,
        /**
         * Concurrency limits for constrained environments (`null` = use defaults).
         *
         * Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
         * (when `maxConcurrentExtractions` is unset) the batch concurrency
         * semaphore. See `ConcurrencyConfig` for details.
         *
         * NOTE(review): this stub types the value as a plain string rather than a
         * `ConcurrencyConfig` object; the expected encoding is not visible here —
         * confirm against the extension.
         *
         * @var ?string
         */
        public readonly ?string $concurrency = null,
        /**
         * Tree-sitter language pack configuration (`null` = tree-sitter disabled).
         *
         * When set, enables code file extraction using tree-sitter parsers.
         * Controls grammar download behavior and code analysis options.
         *
         * @var ?TreeSitterConfig
         */
        public readonly ?TreeSitterConfig $treeSitter = null,
        /**
         * Structured extraction via LLM (`null` = disabled).
         *
         * When set, the extracted document content is sent to an LLM with the
         * provided JSON schema. The structured response is stored in
         * `ExtractionResult::structured_output`.
         *
         * @var ?StructuredExtractionConfig
         */
        public readonly ?StructuredExtractionConfig $structuredExtraction = null,
        /**
         * Cancellation token for this extraction (`null` = no external cancellation).
         *
         * In the core library a `CancellationToken` clone is passed here and
         * `CancellationToken::cancel` is called from another thread / task to abort
         * the extraction in progress. The extractor checks the token at safe
         * checkpoints (before lock acquisition, between pages, between batch items)
         * and returns `KreuzbergError::Cancelled` when set.
         *
         * The field is excluded from serialization because `CancellationToken` is a
         * runtime handle, not a configuration value.
         *
         * NOTE(review): this stub types the token as a plain string; how a token is
         * represented on the PHP side is not visible here — confirm against the
         * extension.
         *
         * @var ?string
         */
        public readonly ?string $cancelToken = null,
    ) {
    }

    /**
     * Stub of the native static factory.
     *
     * @return ExtractionConfig
     *
     * @throws \RuntimeException Always, in this stub file; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): ExtractionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub of the native `needsImageProcessing` query.
     *
     * @return bool
     *
     * @throws \RuntimeException Always, in this stub file; the real implementation
     *                           is provided by the native extension.
     */
    public function needsImageProcessing(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Per-file extraction configuration overrides for batch processing.
 *
 * Every field is nullable — `null` means "use the batch-level default."
 * This type is used with `batch_extract_files` and `batch_extract_bytes`
 * to allow heterogeneous extraction settings within a single batch.
 *
 * # Excluded Fields
 *
 * The following `ExtractionConfig` fields are batch-level only and
 * cannot be overridden per file:
 * - `maxConcurrentExtractions` — controls batch parallelism
 * - `useCache` — global caching policy
 * - `acceleration` — shared ONNX execution provider
 * - `securityLimits` — global archive security policy
 */
final class FileExtractionConfig
{
    public function __construct(
        /** @var ?bool Override quality post-processing for this file. */
        public readonly ?bool $enableQualityProcessing = null,
        /** @var ?OcrConfig Override OCR configuration for this file (`null` = use batch default). */
        public readonly ?OcrConfig $ocr = null,
        /** @var ?bool Override force OCR for this file. */
        public readonly ?bool $forceOcr = null,
        /** @var ?array<int> Override force OCR pages for this file (1-indexed page numbers). */
        public readonly ?array $forceOcrPages = null,
        /** @var ?bool Override disable OCR for this file. */
        public readonly ?bool $disableOcr = null,
        /** @var ?ChunkingConfig Override chunking configuration for this file. */
        public readonly ?ChunkingConfig $chunking = null,
        /** @var ?ContentFilterConfig Override content filtering configuration for this file. */
        public readonly ?ContentFilterConfig $contentFilter = null,
        /** @var ?ImageExtractionConfig Override image extraction configuration for this file. */
        public readonly ?ImageExtractionConfig $images = null,
        /** @var ?PdfConfig Override PDF options for this file. */
        public readonly ?PdfConfig $pdfOptions = null,
        /** @var ?TokenReductionOptions Override token reduction for this file. */
        public readonly ?TokenReductionOptions $tokenReduction = null,
        /** @var ?LanguageDetectionConfig Override language detection for this file. */
        public readonly ?LanguageDetectionConfig $languageDetection = null,
        /** @var ?PageConfig Override page extraction for this file. */
        public readonly ?PageConfig $pages = null,
        /** @var ?KeywordConfig Override keyword extraction for this file. */
        public readonly ?KeywordConfig $keywords = null,
        /** @var ?PostProcessorConfig Override post-processor for this file. */
        public readonly ?PostProcessorConfig $postprocessor = null,
        /** @var ?string Override HTML conversion options for this file. */
        public readonly ?string $htmlOptions = null,
        /** @var ?ResultFormat Override result format for this file. */
        public readonly ?ResultFormat $resultFormat = null,
        /** @var ?OutputFormat Override output content format for this file. */
        public readonly ?OutputFormat $outputFormat = null,
        /** @var ?bool Override document structure output for this file. */
        public readonly ?bool $includeDocumentStructure = null,
        /** @var ?LayoutDetectionConfig Override layout detection for this file. */
        public readonly ?LayoutDetectionConfig $layout = null,
        /**
         * Override the per-file extraction timeout, in seconds.
         *
         * When set, the extraction for this file will be canceled after the
         * specified duration. A timed-out file produces an error result without
         * affecting other files in the batch.
         *
         * @var ?int
         */
        public readonly ?int $timeoutSecs = null,
        /** @var ?TreeSitterConfig Override tree-sitter configuration for this file. */
        public readonly ?TreeSitterConfig $treeSitter = null,
        /**
         * Override structured extraction configuration for this file.
         *
         * When set, enables LLM-based structured extraction with a JSON schema
         * for this specific file. The extracted content is sent to a VLM/LLM
         * and the response is parsed according to the provided schema.
         *
         * @var ?StructuredExtractionConfig
         */
        public readonly ?StructuredExtractionConfig $structuredExtraction = null,
    ) {
    }
}
/**
 * One item of a byte-array batch extraction job.
 *
 * Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
 * to represent a single entry in the batch.
 */
final class BatchBytesItem
{
    public function __construct(
        /** @var string The content bytes to extract from */
        public readonly string $content,
        /** @var string MIME type of the content (e.g., "application/pdf", "text/html") */
        public readonly string $mimeType,
        /** @var ?FileExtractionConfig Per-item configuration overrides (`null` uses batch-level defaults) */
        public readonly ?FileExtractionConfig $config = null,
    ) {
    }
}
/**
 * One item of a file batch extraction job.
 *
 * Used with `batch_extract_files` and `batch_extract_files_sync`
 * to represent a single file in the batch.
 */
final class BatchFileItem
{
    public function __construct(
        /** @var string Path to the file to extract from */
        public readonly string $path,
        /** @var ?FileExtractionConfig Per-file configuration overrides (`null` uses batch-level defaults) */
        public readonly ?FileExtractionConfig $config = null,
    ) {
    }
}
/**
 * Settings for image extraction.
 */
final class ImageExtractionConfig
{
    public function __construct(
        /** @var bool Extract images from documents */
        public readonly bool $extractImages,
        /** @var int Target DPI for image normalization */
        public readonly int $targetDpi,
        /** @var int Maximum dimension for images (width or height) */
        public readonly int $maxImageDimension,
        /**
         * Whether to inject image reference placeholders into markdown output.
         * When `true` (default), image references like `![Image 1](embedded:p1_i0)`
         * are appended to the markdown. Set to `false` to extract images as data
         * without polluting the markdown output.
         *
         * @var bool
         */
        public readonly bool $injectPlaceholders,
        /** @var bool Automatically adjust DPI based on image content */
        public readonly bool $autoAdjustDpi,
        /** @var int Minimum DPI threshold */
        public readonly int $minDpi,
        /** @var int Maximum DPI threshold */
        public readonly int $maxDpi,
        /**
         * When `true` (default), extracted images are classified by kind and grouped
         * into clusters where they appear to belong to one figure.
         *
         * @var bool
         */
        public readonly bool $classify,
        /**
         * When `true`, full-page renders produced during OCR preprocessing are captured
         * and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`.
         *
         * **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
         * document-level OCR bypass is active (whole-document backend). When OCR is
         * enabled and this flag is set but the active backend skips per-page rendering,
         * a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
         *
         * Defaults to `false`. Enable when downstream consumers need page thumbnails
         * (e.g. citation previews, visual grounding).
         *
         * @var bool
         */
        public readonly bool $includePageRasters,
        /**
         * Run OCR on extracted images and include the recognized text in the
         * document content.
         *
         * When `true` (default) and `ExtractionConfig::$ocr` is configured, extracted
         * images are processed with the configured OCR backend. Set to `false` to
         * extract images without OCR processing, even when OCR is enabled.
         *
         * @var bool
         */
        public readonly bool $runOcrOnImages,
        /**
         * When `true`, image OCR results are rendered as plain text without the
         * `![...](...)` markdown placeholder. Only takes effect when `runOcrOnImages`
         * is also `true`.
         *
         * @var bool
         */
        public readonly bool $ocrTextOnly,
        /**
         * When `true` and `ocrTextOnly` is `false`, append the OCR text after
         * the image placeholder in the rendered output.
         *
         * @var bool
         */
        public readonly bool $appendOcrText,
        /**
         * Maximum number of image objects to extract per PDF page.
         *
         * Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
         * can trigger extremely long or indefinite extraction times when every image
         * object on a dense page is decoded individually via the PDF extractor.
         * Setting this limit causes kreuzberg to stop collecting individual images
         * once the count per page reaches the cap and emit a warning instead.
         *
         * `null` (default) means no limit — all images are extracted.
         *
         * @var ?int
         */
        public readonly ?int $maxImagesPerPage = null,
    ) {
    }

    /**
     * Stub of the native static factory.
     *
     * @return ImageExtractionConfig
     *
     * @throws \RuntimeException Always, in this stub file; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): ImageExtractionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Settings for token reduction.
 */
final class TokenReductionOptions
{
    public function __construct(
        /** @var string Reduction mode: "off", "light", "moderate", "aggressive", "maximum" */
        public readonly string $mode,
        /** @var bool Preserve important words (capitalized, technical terms) */
        public readonly bool $preserveImportantWords,
    ) {
    }

    /**
     * Stub of the native static factory.
     *
     * @return TokenReductionOptions
     *
     * @throws \RuntimeException Always, in this stub file; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): TokenReductionOptions
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Settings for language detection.
 */
final class LanguageDetectionConfig
{
    public function __construct(
        /** @var bool Enable language detection */
        public readonly bool $enabled,
        /** @var float Minimum confidence threshold (0.0-1.0) */
        public readonly float $minConfidence,
        /** @var bool Detect multiple languages in the document */
        public readonly bool $detectMultiple,
    ) {
    }

    /**
     * Stub of the native static factory.
     *
     * @return LanguageDetectionConfig
     *
     * @throws \RuntimeException Always, in this stub file; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): LanguageDetectionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Settings for styled HTML output.
 *
 * When set on `ExtractionConfig::$htmlOutput` alongside
 * `outputFormat = OutputFormat::Html`, the pipeline builds a
 * `StyledHtmlRenderer` (Rust: `crate::rendering::StyledHtmlRenderer`)
 * instead of the plain comrak-based renderer.
 */
final class HtmlOutputConfig
{
    public function __construct(
        /** @var HtmlTheme Built-in colour/typography theme. Default: `HtmlTheme::Unstyled`. */
        public readonly HtmlTheme $theme,
        /**
         * CSS class prefix applied to every emitted class name.
         *
         * Default: `"kb-"`. Change this if your host application already uses
         * classes that start with `kb-`.
         *
         * @var string
         */
        public readonly string $classPrefix,
        /**
         * When `true` (default), write the resolved CSS into a `<style>` block
         * immediately after the opening `<div class="{prefix}doc">`.
         *
         * Set to `false` to emit only the structural markup and wire up your
         * own stylesheet targeting the `kb-*` class names.
         *
         * @var bool
         */
        public readonly bool $embedCss,
        /**
         * Inline CSS string injected into the output after the theme stylesheet.
         * Concatenated after the `cssFile` content when both are set.
         *
         * @var ?string
         */
        public readonly ?string $css = null,
        /**
         * Path to a CSS file loaded once at renderer construction time.
         * Concatenated before `css` when both are set.
         *
         * @var ?string
         */
        public readonly ?string $cssFile = null,
    ) {
    }

    /**
     * Stub of the native static factory.
     *
     * @return HtmlOutputConfig
     *
     * @throws \RuntimeException Always, in this stub file; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): HtmlOutputConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Settings for layout detection.
 *
 * Controls layout detection behavior in the extraction pipeline.
 * When set on `ExtractionConfig` (its `$layout` property), layout detection
 * is enabled for PDF extraction.
 */
final class LayoutDetectionConfig
{
    public function __construct(
        /** @var bool Whether to apply postprocessing heuristics (default: true). */
        public readonly bool $applyHeuristics,
        /**
         * Table structure recognition model.
         *
         * Controls which model is used for table cell detection within
         * layout-detected table regions. Defaults to `TableModel::Tatr`.
         *
         * @var TableModel
         */
        public readonly TableModel $tableModel,
        /** @var ?float Confidence threshold override (`null` = use model default). */
        public readonly ?float $confidenceThreshold = null,
        /**
         * Hardware acceleration for ONNX models (layout detection + table structure).
         *
         * When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
         * is used for inference. Defaults to `null` (auto-select per platform).
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
    ) {
    }

    /**
     * Stub of the native static factory.
     *
     * @return LayoutDetectionConfig
     *
     * @throws \RuntimeException Always, in this stub file; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): LayoutDetectionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Settings for an LLM provider/model routed through liter-llm.
 *
 * Each feature (VLM OCR, VLM embeddings, structured extraction) carries
 * its own `LlmConfig`, allowing different providers per feature.
 *
 * # Example
 *
 * ```toml
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * api_key = "sk-..." # or use KREUZBERG_LLM_API_KEY env var
 * ```
 */
final class LlmConfig
{
    public function __construct(
        /**
         * Provider/model string in liter-llm routing format.
         *
         * Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
         * `"groq/llama-3.1-70b-versatile"`.
         *
         * @var string
         */
        public readonly string $model,
        /**
         * API key for the provider. When `null`, liter-llm falls back to
         * the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
         *
         * @var ?string
         */
        public readonly ?string $apiKey = null,
        /** @var ?string Custom base URL override for the provider endpoint. */
        public readonly ?string $baseUrl = null,
        /** @var ?int Request timeout in seconds (default: 60). */
        public readonly ?int $timeoutSecs = null,
        /** @var ?int Maximum retry attempts (default: 3). */
        public readonly ?int $maxRetries = null,
        /** @var ?float Sampling temperature for generation tasks. */
        public readonly ?float $temperature = null,
        /** @var ?int Maximum tokens to generate. */
        public readonly ?int $maxTokens = null,
    ) {
    }
}
/**
 * Settings for LLM-based structured data extraction.
 *
 * Sends extracted document content to a VLM together with a JSON schema
 * and returns structured data that conforms to the schema.
 *
 * # Example
 *
 * ```toml
 * [structured_extraction]
 * schema_name = "invoice_data"
 * strict = true
 *
 * [structured_extraction.schema]
 * type = "object"
 * properties.vendor = { type = "string" }
 * properties.total = { type = "number" }
 * required = ["vendor", "total"]
 *
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * ```
 */
final class StructuredExtractionConfig
{
    public function __construct(
        /** @var string JSON Schema defining the desired output structure. */
        public readonly string $schema,
        /** @var string Schema name passed to the LLM's structured output mode. */
        public readonly string $schemaName,
        /** @var bool Enable strict mode — output must exactly match the schema. */
        public readonly bool $strict,
        /** @var LlmConfig LLM configuration for the extraction. */
        public readonly LlmConfig $llm,
        /** @var ?string Optional schema description for the LLM. */
        public readonly ?string $schemaDescription = null,
        /**
         * Custom Jinja2 extraction prompt template. When `null`, a default
         * template is used.
         *
         * Available template variables:
         * - `{{ content }}` — The extracted document text.
         * - `{{ schema }}` — The JSON schema as a formatted string.
         * - `{{ schema_name }}` — The schema name.
         * - `{{ schema_description }}` — The schema description (may be empty).
         *
         * @var ?string
         */
        public readonly ?string $prompt = null,
    ) {
    }
}
/**
 * Thresholds that drive OCR fallback decisions and pipeline quality gating.
 *
 * Every field defaults to the value that matches the previous hardcoded behavior,
 * so `OcrQualityThresholds::default()` preserves existing semantics exactly.
 */
final class OcrQualityThresholds
{
    public function __construct(
        /** @var int Minimum total number of non-whitespace characters for text to count as substantive. */
        public readonly int $minTotalNonWhitespace,
        /** @var float Minimum average number of non-whitespace characters per page. */
        public readonly float $minNonWhitespacePerPage,
        /** @var int Minimum number of characters a word needs to be "meaningful". */
        public readonly int $minMeaningfulWordLen,
        /** @var int Minimum number of meaningful words required before text is accepted. */
        public readonly int $minMeaningfulWords,
        /** @var float Minimum alphanumeric ratio (share of non-whitespace chars that are alphanumeric). */
        public readonly float $minAlnumRatio,
        /** @var int Minimum number of Unicode replacement characters (U+FFFD) that triggers OCR fallback. */
        public readonly int $minGarbageChars,
        /** @var float Maximum fraction of short (1-2 char) words before text is treated as fragmented. */
        public readonly float $maxFragmentedWordRatio,
        /**
         * Critical fragmentation threshold — triggers OCR regardless of meaningful words.
         * Normal English text has ~20-30% short words; 80%+ is definitive garbage.
         *
         * @var float
         */
        public readonly float $criticalFragmentedWordRatio,
        /** @var float Minimum average word length; a lower value with enough words indicates garbled extraction. */
        public readonly float $minAvgWordLength,
        /** @var int Minimum word count before the average word length check applies. */
        public readonly int $minWordsForAvgLengthCheck,
        /** @var float Minimum ratio of consecutive repeated words used to detect column scrambling. */
        public readonly float $minConsecutiveRepeatRatio,
        /** @var int Minimum word count before the consecutive repetition check applies. */
        public readonly int $minWordsForRepeatCheck,
        /** @var int Minimum character count for the "substantive markdown" OCR skip gate. */
        public readonly int $substantiveMinChars,
        /** @var int Minimum character count for the "non-text content" OCR skip gate. */
        public readonly int $nonTextMinChars,
        /** @var float Alphanumeric+whitespace ratio threshold used for skip decisions. */
        public readonly float $alnumWsRatioThreshold,
        /**
         * Minimum quality score (0.0-1.0) a pipeline stage result needs in order to be accepted.
         * When a backend's result scores below this, the next backend is tried.
         *
         * @var float
         */
        public readonly float $pipelineMinQuality,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): OcrQualityThresholds
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * One backend stage of the OCR pipeline.
 */
final class OcrPipelineStage
{
    public function __construct(
        /** @var string Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name. */
        public readonly string $backend,
        /** @var int Priority weight (higher = tried first); stages are sorted by priority, descending. */
        public readonly int $priority,
        /** @var ?string Language override for this stage (`null` = use the parent OcrConfig language). */
        public readonly ?string $language = null,
        /** @var ?TesseractConfig Tesseract-specific config override for this stage. */
        public readonly ?TesseractConfig $tesseractConfig = null,
        /** @var ?string PaddleOCR-specific config for this stage. */
        public readonly ?string $paddleOcrConfig = null,
        /** @var ?LlmConfig VLM config override for this pipeline stage. */
        public readonly ?LlmConfig $vlmConfig = null,
        /**
         * Arbitrary per-call options handed to the backend unchanged.
         *
         * Backends that support runtime tuning (mode switching, preprocessing
         * flags, inference parameters, etc.) read this value and deserialize
         * the keys they care about. Keys a backend does not know are silently
         * ignored, so options for different backends can coexist in the same
         * config without conflict.
         *
         * Example (custom backend):
         * ```json
         * { "mode": "fast", "enable_layout": true }
         * ```
         *
         * @var ?string
         */
        public readonly ?string $backendOptions = null,
    ) {}
}
/**
 * Multi-backend OCR pipeline with quality-based fallback.
 *
 * Backends are tried in priority order, highest first. Once a backend has
 * produced output, its quality is evaluated: if it meets
 * `quality_thresholds.pipeline_min_quality` the result is accepted,
 * otherwise the next backend is tried.
 */
final class OcrPipelineConfig
{
    public function __construct(
        /** @var array<OcrPipelineStage> Backends to try; sorted by priority (descending) at runtime. */
        public readonly array $stages,
        /** @var OcrQualityThresholds Thresholds used to decide between accepting a result and trying the next backend. */
        public readonly OcrQualityThresholds $qualityThresholds,
    ) {}
}
/**
 * OCR configuration.
 */
final class OcrConfig
{
    public function __construct(
        /**
         * Whether OCR is enabled.
         *
         * `enabled: false` is shorthand for `disable_ocr: true` on the parent
         * `ExtractionConfig`. Images return metadata only; PDFs use native
         * text extraction without OCR fallback.
         *
         * Defaults to `true`. When `false`, all other OCR settings are ignored.
         *
         * @var bool
         */
        public readonly bool $enabled,
        /** @var string OCR backend: tesseract, easyocr, paddleocr */
        public readonly string $backend,
        /** @var string Language code (e.g., "eng", "deu") */
        public readonly string $language,
        /**
         * Enables automatic page rotation based on orientation detection.
         *
         * When enabled, Tesseract's `DetectOrientationScript()` is used to detect
         * the page orientation (0/90/180/270 degrees) before OCR. If the page is
         * rotated with high confidence, the image is corrected before recognition.
         * This is critical for handling rotated scanned documents.
         *
         * @var bool
         */
        public readonly bool $autoRotate,
        /** @var ?TesseractConfig Tesseract-specific configuration (optional) */
        public readonly ?TesseractConfig $tesseractConfig = null,
        /** @var ?OutputFormat Output format for OCR results (optional, for format conversion) */
        public readonly ?OutputFormat $outputFormat = null,
        /** @var ?string PaddleOCR-specific configuration (optional, JSON passthrough) */
        public readonly ?string $paddleOcrConfig = null,
        /**
         * Arbitrary per-call options handed to the backend unchanged.
         *
         * Custom OCR backends, and built-in backends that support runtime tuning,
         * can read this value and deserialize the keys they care about. Keys a
         * backend does not know are silently ignored.
         *
         * This is the recommended extension point for per-call parameters that
         * the typed fields above do not cover (e.g. mode switching,
         * preprocessing flags, inference batch size).
         *
         * **Scope:** when `pipeline` is `null`, this value is propagated to the
         * primary stage of the auto-constructed pipeline. When `pipeline` is
         * explicitly set, this field has **no effect** — the caller must set
         * `OcrPipelineStage.backend_options` directly on the relevant stage(s)
         * instead.
         *
         * Example:
         * ```json
         * { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
         * ```
         *
         * @var ?string
         */
        public readonly ?string $backendOptions = null,
        /** @var ?OcrElementConfig OCR element extraction configuration */
        public readonly ?OcrElementConfig $elementConfig = null,
        /**
         * Quality thresholds for the native-text-to-OCR fallback decision.
         * When `null`, compiled defaults are used (matching previous hardcoded behavior).
         *
         * @var ?OcrQualityThresholds
         */
        public readonly ?OcrQualityThresholds $qualityThresholds = null,
        /**
         * Multi-backend OCR pipeline configuration. When set, enables weighted
         * fallback across multiple OCR backends based on output quality.
         * When `null`, the single `backend` field is used (same as today).
         *
         * @var ?OcrPipelineConfig
         */
        public readonly ?OcrPipelineConfig $pipeline = null,
        /**
         * VLM (Vision Language Model) OCR configuration.
         *
         * Required when `backend` is `"vlm"`. Uses liter-llm to send page
         * images to a vision model for text extraction.
         *
         * @var ?LlmConfig
         */
        public readonly ?LlmConfig $vlmConfig = null,
        /**
         * Custom Jinja2 prompt template for VLM OCR.
         *
         * When `null`, the default template is used. Available variables:
         * - `{{ language }}` — the document language code (e.g., "eng", "deu").
         *
         * @var ?string
         */
        public readonly ?string $vlmPrompt = null,
        /**
         * Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
         *
         * Not user-configurable via config files — injected at runtime from
         * `ExtractionConfig::acceleration` before each `process_image` call.
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Caller-supplied Tesseract `traineddata` bytes, keyed by language code.
         *
         * The primary use case is the WASM build, which has no filesystem and cannot
         * download tessdata at runtime. Native builds typically rely on
         * `TessdataManager` and ignore this field. When present, the WASM
         * Tesseract backend prefers these bytes over its compile-time-bundled
         * English data.
         *
         * Skipped by serde to keep config files small — supply via the typed API
         * at runtime.
         *
         * @var ?array<string, string>
         */
        public readonly ?array $tessdataBytes = null,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): OcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Page extraction and tracking configuration.
 *
 * Governs how pages are extracted, tracked, and represented in the extraction results.
 * When `null`, page tracking is disabled.
 *
 * Page range tracking in chunk metadata (first_page/last_page) is enabled automatically
 * when page boundaries are available and chunking is configured.
 */
final class PageConfig
{
    public function __construct(
        /** @var bool Extract pages as a separate array (ExtractionResult.pages) */
        public readonly bool $extractPages,
        /** @var bool Insert page markers into the main content string */
        public readonly bool $insertPageMarkers,
        /**
         * Page marker format (use the {page_num} placeholder)
         * Default: "\n\n<!-- PAGE {page_num} -->\n\n"
         *
         * @var string
         */
        public readonly string $markerFormat,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): PageConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * PDF-specific configuration.
 */
final class PdfConfig
{
    public function __construct(
        /** @var bool Extract images from PDF */
        public readonly bool $extractImages,
        /**
         * Extract tables from PDF.
         *
         * When `true` (default), pdf_oxide's native grid detector runs first and,
         * if it finds nothing, extraction falls back to the heuristic text-layer
         * reconstruction in `pdf::oxide::table::extract_tables_heuristic`.
         * Set to `false` to skip both passes — `tables` will then be empty in the result.
         *
         * @var bool
         */
        public readonly bool $extractTables,
        /** @var bool Extract PDF metadata */
        public readonly bool $extractMetadata,
        /**
         * Extract PDF annotations (text notes, highlights, links, stamps).
         * Default: false
         *
         * @var bool
         */
        public readonly bool $extractAnnotations,
        /**
         * Allow single-column pseudo tables in extraction results.
         *
         * By default, tables with fewer than 2 columns (layout-guided) or 3 columns
         * (heuristic) are rejected. When `true`, the minimum column count is relaxed
         * to 1, so single-column structured data (glossaries, itemized lists) can be
         * emitted as tables. Other quality filters (density, sparsity, prose
         * detection) still apply.
         *
         * @var bool
         */
        public readonly bool $allowSingleColumnTables,
        /**
         * Run OCR on inline images extracted from PDF pages and attach the
         * recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
         * to be available; if `ExtractionConfig.ocr` is `None`, the extractor
         * falls back to `TesseractConfig::default()`. Per-image failures degrade
         * gracefully (the image is returned without OCR text instead of failing
         * the whole extraction). Default: `false`.
         *
         * @var bool
         */
        public readonly bool $ocrInlineImages,
        /** @var ?array<string> Passwords to try when opening encrypted PDFs */
        public readonly ?array $passwords = null,
        /** @var ?HierarchyConfig Hierarchy extraction configuration (`null` = hierarchy extraction disabled) */
        public readonly ?HierarchyConfig $hierarchy = null,
        /**
         * Top margin fraction (0.0–1.0) of page height to exclude headers/running heads.
         * Default: 0.06 (6%)
         *
         * @var ?float
         */
        public readonly ?float $topMarginFraction = null,
        /**
         * Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
         * Default: 0.05 (5%)
         *
         * @var ?float
         */
        public readonly ?float $bottomMarginFraction = null,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): PdfConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Hierarchy extraction configuration for PDF text structure analysis.
 *
 * Enables extraction of document hierarchy levels (H1-H6), based on font size
 * clustering and semantic analysis. When enabled, hierarchical blocks are
 * included in page content.
 */
final class HierarchyConfig
{
    public function __construct(
        /** @var bool Enable hierarchy extraction */
        public readonly bool $enabled,
        /**
         * Number of font size clusters to use for hierarchy levels (1-7)
         *
         * Default: 6, which provides H1-H6 heading levels with body text.
         * Larger values produce more fine-grained hierarchy levels.
         *
         * @var int
         */
        public readonly int $kClusters,
        /** @var bool Include bounding box information in hierarchy blocks */
        public readonly bool $includeBbox,
        /**
         * OCR coverage threshold for smart OCR triggering (0.0-1.0)
         *
         * Decides when OCR is triggered, based on text block coverage: OCR runs
         * when text blocks cover less than this fraction of the page.
         * Default: 0.5 (trigger OCR if less than 50% of the page has text)
         *
         * @var ?float
         */
        public readonly ?float $ocrCoverageThreshold = null,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): HierarchyConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Post-processor configuration.
 */
final class PostProcessorConfig
{
    public function __construct(
        /** @var bool Enable post-processors */
        public readonly bool $enabled,
        /** @var ?array<string> Whitelist of processor names to run (`null` = all enabled) */
        public readonly ?array $enabledProcessors = null,
        /** @var ?array<string> Blacklist of processor names to skip (`null` = none disabled) */
        public readonly ?array $disabledProcessors = null,
        /** @var ?array<string> Pre-computed AHashSet for O(1) lookup of enabled processors */
        public readonly ?array $enabledSet = null,
        /** @var ?array<string> Pre-computed AHashSet for O(1) lookup of disabled processors */
        public readonly ?array $disabledSet = null,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): PostProcessorConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Chunking configuration.
 *
 * Configures text chunking for document content: chunk size, overlap,
 * trimming behavior, and optional embeddings.
 *
 * The upstream documentation recommends constructing with `..Default::default()`
 * (Rust struct-update syntax) to allow for future field additions.
 */
final class ChunkingConfig
{
    public function __construct(
        /**
         * Maximum size per chunk (in units determined by `sizing`).
         *
         * With `sizing` set to `Characters` (default), this is the maximum character count.
         * With token-based sizing, this is the maximum token count.
         *
         * Default: 1000
         *
         * @var int
         */
        public readonly int $maxCharacters,
        /**
         * Overlap between chunks (in units determined by `sizing`).
         *
         * Default: 200
         *
         * @var int
         */
        public readonly int $overlap,
        /**
         * Whether whitespace is trimmed from chunk boundaries.
         *
         * Default: true
         *
         * @var bool
         */
        public readonly bool $trim,
        /**
         * Type of chunker to use (Text or Markdown).
         *
         * Default: Text
         *
         * @var ChunkerType
         */
        public readonly ChunkerType $chunkerType,
        /**
         * How chunk size is measured.
         *
         * Default: `Characters` (Unicode character count).
         * Enable the `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
         *
         * @var ChunkSizing
         */
        public readonly ChunkSizing $sizing,
        /**
         * When `true` and `chunker_type` is `Markdown`, the heading hierarchy
         * path (e.g. `"# Title > ## Section\n\n"`) is prepended to each chunk's content string.
         *
         * Useful for RAG pipelines where each chunk needs self-contained
         * context about its position in the document structure.
         *
         * Default: `false`
         *
         * @var bool
         */
        public readonly bool $prependHeadingContext,
        /** @var ?EmbeddingConfig Optional embedding configuration for chunk embeddings. */
        public readonly ?EmbeddingConfig $embedding = null,
        /** @var ?string Preset configuration to use (overrides individual settings if provided). */
        public readonly ?string $preset = null,
        /**
         * Optional cosine similarity threshold for semantic topic boundary detection.
         *
         * Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
         * provided. You almost never need to set this. When omitted, it defaults to
         * `0.75`, which works well for most documents. Lower values detect more
         * topic boundaries (more, smaller chunks); higher values detect fewer.
         * Range: `0.0..=1.0`.
         *
         * @var ?float
         */
        public readonly ?float $topicThreshold = null,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): ChunkingConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Embedding configuration for text chunks.
 *
 * Configures embedding generation with ONNX models via the vendored embedding engine.
 * Requires the `embeddings` feature to be enabled.
 */
final class EmbeddingConfig
{
    public function __construct(
        /** @var EmbeddingModelType Embedding model to use (defaults to the "balanced" preset if not specified) */
        public readonly EmbeddingModelType $model,
        /** @var bool Whether embedding vectors are normalized (recommended for cosine similarity) */
        public readonly bool $normalize,
        /** @var int Batch size for embedding generation */
        public readonly int $batchSize,
        /** @var bool Show model download progress */
        public readonly bool $showDownloadProgress,
        /**
         * Custom cache directory for model files
         *
         * Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
         * Allows full customization of the model download location.
         *
         * @var ?string
         */
        public readonly ?string $cacheDir = null,
        /**
         * Hardware acceleration for the embedding ONNX model.
         *
         * When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
         * is used for inference. Defaults to `null` (auto-select per platform).
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Maximum wall-clock duration (in seconds) for a single `embed()` call when
         * using `EmbeddingModelType::Plugin`.
         *
         * Applies only to the in-process plugin path — protects against hung
         * host-language backends (e.g. a Python callback deadlocked on the GIL,
         * a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
         * returns `Plugin` instead of blocking forever.
         *
         * `null` disables the timeout. The default (60 seconds) is conservative
         * for common in-process inference; increase it for large batches on slow
         * hardware.
         *
         * @var ?int
         */
        public readonly ?int $maxEmbedDurationSecs = null,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): EmbeddingConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration for the tree-sitter language pack integration.
 *
 * Controls grammar download behavior and code analysis options.
 *
 * # Example (TOML)
 *
 * ```toml
 * [tree_sitter]
 * languages = ["python", "rust"]
 * groups = ["web"]
 *
 * [tree_sitter.process]
 * structure = true
 * comments = true
 * docstrings = true
 * ```
 */
final class TreeSitterConfig
{
    public function __construct(
        /**
         * Enable code intelligence processing (default: true).
         *
         * When `false`, tree-sitter analysis is skipped completely, even if
         * the config section is present.
         *
         * @var bool
         */
        public readonly bool $enabled,
        /** @var TreeSitterProcessConfig Processing options for code analysis. */
        public readonly TreeSitterProcessConfig $process,
        /**
         * Custom cache directory for downloaded grammars.
         *
         * When `null`, the default is used: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
         *
         * @var ?string
         */
        public readonly ?string $cacheDir = null,
        /** @var ?array<string> Languages to pre-download on init (e.g., `["python", "rust"]`). */
        public readonly ?array $languages = null,
        /** @var ?array<string> Language groups to pre-download (e.g., `["web", "systems", "scripting"]`). */
        public readonly ?array $groups = null,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): TreeSitterConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Processing options for tree-sitter code analysis.
 *
 * Selects which analysis features are enabled when extracting code files.
 */
final class TreeSitterProcessConfig
{
    public function __construct(
        /** @var bool Extract structural items (functions, classes, structs, etc.). Default: true. */
        public readonly bool $structure,
        /** @var bool Extract import statements. Default: true. */
        public readonly bool $imports,
        /** @var bool Extract export statements. Default: true. */
        public readonly bool $exports,
        /** @var bool Extract comments. Default: false. */
        public readonly bool $comments,
        /** @var bool Extract docstrings. Default: false. */
        public readonly bool $docstrings,
        /** @var bool Extract symbol definitions. Default: false. */
        public readonly bool $symbols,
        /** @var bool Include parse diagnostics. Default: false. */
        public readonly bool $diagnostics,
        /** @var CodeContentMode Content rendering mode for code extraction. */
        public readonly CodeContentMode $contentMode,
        /** @var ?int Maximum chunk size in bytes. `null` disables chunking. */
        public readonly ?int $chunkMaxSize = null,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): TreeSitterProcessConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * One supported document format entry.
 *
 * Pairs a file extension with the corresponding MIME type that Kreuzberg can process.
 */
final class SupportedFormat
{
    public function __construct(
        /** @var string File extension without the leading dot, e.g., "pdf", "docx" */
        public readonly string $extension,
        /** @var string MIME type string, e.g., "application/pdf" */
        public readonly string $mimeType,
    ) {}
}
/**
 * API server configuration.
 *
 * Holds all configuration options for the Kreuzberg API server:
 * host/port settings, CORS configuration, and upload limits.
 *
 * # Defaults
 *
 * - `host`: "127.0.0.1" (localhost only)
 * - `port`: 8000
 * - `cors_origins`: empty vector (allows all origins)
 * - `max_request_body_bytes`: 104_857_600 (100 MB)
 * - `max_multipart_field_bytes`: 104_857_600 (100 MB)
 */
final class ServerConfig
{
    public function __construct(
        /** @var string Server host address (e.g., "127.0.0.1", "0.0.0.0") */
        public readonly string $host,
        /** @var int Server port number */
        public readonly int $port,
        /**
         * CORS allowed origins. An empty vector means all origins are allowed.
         *
         * With an empty vector the server accepts requests from any origin.
         * When populated with specific origins (e.g., `"https://example.com"`), only
         * those origins are allowed.
         *
         * @var array<string>
         */
        public readonly array $corsOrigins,
        /** @var int Maximum size of the request body in bytes (default: 100 MB) */
        public readonly int $maxRequestBodyBytes,
        /** @var int Maximum size of multipart fields in bytes (default: 100 MB) */
        public readonly int $maxMultipartFieldBytes,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): ServerConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub; implemented by the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public function listenAddr(): string
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub; implemented by the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public function corsAllowsAll(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub; implemented by the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public function isOriginAllowed(string $origin): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub; implemented by the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public function maxRequestBodyMb(): int
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub; implemented by the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public function maxMultipartFieldMb(): int
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Immutable value object; all properties are set through the constructor.
 */
final class StructuredDataResult
{
    public function __construct(
        /** @var string */
        public readonly string $content,
        /** @var string */
        public readonly string $format,
        /** @var array<string, string> */
        public readonly array $metadata,
        /** @var array<string> */
        public readonly array $textFields,
    ) {}
}
/**
 * Application properties read from docProps/app.xml for DOCX.
 *
 * Holds Word-specific document statistics and metadata.
 */
final class DocxAppProperties
{
    public function __construct(
        /** @var ?string Application name (e.g., "Microsoft Office Word") */
        public readonly ?string $application = null,
        /** @var ?string Application version */
        public readonly ?string $appVersion = null,
        /** @var ?string Template filename */
        public readonly ?string $template = null,
        /** @var ?int Total editing time in minutes */
        public readonly ?int $totalTime = null,
        /** @var ?int Number of pages */
        public readonly ?int $pages = null,
        /** @var ?int Number of words */
        public readonly ?int $words = null,
        /** @var ?int Number of characters (excluding spaces) */
        public readonly ?int $characters = null,
        /** @var ?int Number of characters (including spaces) */
        public readonly ?int $charactersWithSpaces = null,
        /** @var ?int Number of lines */
        public readonly ?int $lines = null,
        /** @var ?int Number of paragraphs */
        public readonly ?int $paragraphs = null,
        /** @var ?string Company name */
        public readonly ?string $company = null,
        /** @var ?int Document security level */
        public readonly ?int $docSecurity = null,
        /** @var ?bool Scale crop flag */
        public readonly ?bool $scaleCrop = null,
        /** @var ?bool Links up to date flag */
        public readonly ?bool $linksUpToDate = null,
        /** @var ?bool Shared document flag */
        public readonly ?bool $sharedDoc = null,
        /** @var ?bool Hyperlinks changed flag */
        public readonly ?bool $hyperlinksChanged = null,
    ) {}
}
/**
 * Application properties read from docProps/app.xml for XLSX.
 *
 * Holds Excel-specific document metadata.
 */
final class XlsxAppProperties
{
    public function __construct(
        /** @var array<string> Worksheet names */
        public readonly array $worksheetNames,
        /** @var ?string Application name (e.g., "Microsoft Excel") */
        public readonly ?string $application = null,
        /** @var ?string Application version */
        public readonly ?string $appVersion = null,
        /** @var ?int Document security level */
        public readonly ?int $docSecurity = null,
        /** @var ?bool Scale crop flag */
        public readonly ?bool $scaleCrop = null,
        /** @var ?bool Links up to date flag */
        public readonly ?bool $linksUpToDate = null,
        /** @var ?bool Shared document flag */
        public readonly ?bool $sharedDoc = null,
        /** @var ?bool Hyperlinks changed flag */
        public readonly ?bool $hyperlinksChanged = null,
        /** @var ?string Company name */
        public readonly ?string $company = null,
    ) {}
}
/**
 * Application properties read from docProps/app.xml for PPTX.
 *
 * Holds PowerPoint-specific document metadata.
 */
final class PptxAppProperties
{
    public function __construct(
        /** @var array<string> Slide titles */
        public readonly array $slideTitles,
        /** @var ?string Application name (e.g., "Microsoft Office PowerPoint") */
        public readonly ?string $application = null,
        /** @var ?string Application version */
        public readonly ?string $appVersion = null,
        /** @var ?int Total editing time in minutes */
        public readonly ?int $totalTime = null,
        /** @var ?string Company name */
        public readonly ?string $company = null,
        /** @var ?int Document security level */
        public readonly ?int $docSecurity = null,
        /** @var ?bool Scale crop flag */
        public readonly ?bool $scaleCrop = null,
        /** @var ?bool Links up to date flag */
        public readonly ?bool $linksUpToDate = null,
        /** @var ?bool Shared document flag */
        public readonly ?bool $sharedDoc = null,
        /** @var ?bool Hyperlinks changed flag */
        public readonly ?bool $hyperlinksChanged = null,
        /** @var ?int Number of slides */
        public readonly ?int $slides = null,
        /** @var ?int Number of notes */
        public readonly ?int $notes = null,
        /** @var ?int Number of hidden slides */
        public readonly ?int $hiddenSlides = null,
        /** @var ?int Number of multimedia clips */
        public readonly ?int $multimediaClips = null,
        /** @var ?string Presentation format (e.g., "Widescreen", "Standard") */
        public readonly ?string $presentationFormat = null,
    ) {}
}
/**
 * Dublin Core metadata read from docProps/core.xml.
 *
 * Holds the standard metadata fields defined by the Dublin Core standard,
 * plus Office-specific extensions.
 */
final class CoreProperties
{
    public function __construct(
        /** @var ?string Document title */
        public readonly ?string $title = null,
        /** @var ?string Document subject/topic */
        public readonly ?string $subject = null,
        /** @var ?string Document creator/author */
        public readonly ?string $creator = null,
        /** @var ?string Keywords or tags */
        public readonly ?string $keywords = null,
        /** @var ?string Document description/abstract */
        public readonly ?string $description = null,
        /** @var ?string User who last modified the document */
        public readonly ?string $lastModifiedBy = null,
        /** @var ?string Revision number */
        public readonly ?string $revision = null,
        /** @var ?string Creation timestamp (ISO 8601) */
        public readonly ?string $created = null,
        /** @var ?string Last modification timestamp (ISO 8601) */
        public readonly ?string $modified = null,
        /** @var ?string Document category */
        public readonly ?string $category = null,
        /** @var ?string Content status (Draft, Final, etc.) */
        public readonly ?string $contentStatus = null,
        /** @var ?string Document language */
        public readonly ?string $language = null,
        /** @var ?string Unique identifier */
        public readonly ?string $identifier = null,
        /** @var ?string Document version */
        public readonly ?string $version = null,
        /** @var ?string Last print timestamp (ISO 8601) */
        public readonly ?string $lastPrinted = null,
    ) {}
}
/**
 * Security limits applied across extractors.
 *
 * All limits are intentionally conservative, to prevent DoS attacks
 * while still supporting legitimate documents.
 */
final class SecurityLimits
{
    public function __construct(
        /** @var int Maximum uncompressed size for archives (500 MB) */
        public readonly int $maxArchiveSize,
        /** @var int Maximum compression ratio before an archive is flagged as a potential bomb (100:1) */
        public readonly int $maxCompressionRatio,
        /** @var int Maximum number of files in an archive (10,000) */
        public readonly int $maxFilesInArchive,
        /** @var int Maximum nesting depth for structures (100) */
        public readonly int $maxNestingDepth,
        /**
         * Maximum length of any single XML entity / attribute / token (1 MiB).
         * This is a per-token cap, NOT a total cap — billion-laughs-class
         * attacks, where a single entity expands to hundreds of MB, are caught
         * here, while normal long text content (a paragraph, a CDATA block) is
         * caught by `max_content_size` instead.
         *
         * @var int
         */
        public readonly int $maxEntityLength,
        /** @var int Maximum string growth per document (100 MB) */
        public readonly int $maxContentSize,
        /** @var int Maximum iterations per operation */
        public readonly int $maxIterations,
        /** @var int Maximum XML depth (100 levels) */
        public readonly int $maxXmlDepth,
        /** @var int Maximum cells per table (100,000) */
        public readonly int $maxTableCells,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): SecurityLimits
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Immutable configuration object; all properties are set through the constructor.
 */
final class TokenReductionConfig
{
    public function __construct(
        /** @var ReductionLevel */
        public readonly ReductionLevel $level,
        /** @var bool */
        public readonly bool $preserveMarkdown,
        /** @var bool */
        public readonly bool $preserveCode,
        /** @var float */
        public readonly float $semanticThreshold,
        /** @var bool */
        public readonly bool $enableParallel,
        /** @var bool */
        public readonly bool $useSimd,
        /** @var array<string> */
        public readonly array $preservePatterns,
        /** @var bool */
        public readonly bool $enableSemanticClustering,
        /** @var ?string */
        public readonly ?string $languageHint = null,
        /** @var ?array<string, array<string>> */
        public readonly ?array $customStopwords = null,
        /** @var ?float */
        public readonly ?float $targetReduction = null,
    ) {}

    /**
     * Stub for the default-valued instance; the real implementation lives in the native extension.
     *
     * @throws \RuntimeException Always, when this PHP stub body is executed.
     */
    public static function default(): TokenReductionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * One PDF annotation extracted from a document page.
 */
final class PdfAnnotation
{
    public function __construct(
        /** @var PdfAnnotationType The type of annotation. */
        public readonly PdfAnnotationType $annotationType,
        /** @var int Number of the page the annotation appears on (1-indexed). */
        public readonly int $pageNumber,
        /** @var ?string Text content of the annotation (e.g., comment text, link URL). */
        public readonly ?string $content = null,
        /** @var ?BoundingBox Bounding box of the annotation on the page. */
        public readonly ?BoundingBox $boundingBox = null,
    ) {}
}
/**
 * Full Djot document structure, preserving semantics.
 *
 * Captures the richness of Djot markup:
 * - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - Attributes (classes, IDs, key-value pairs)
 * - Links, images, footnotes
 * - Math expressions (inline and display)
 * - Tables with full structure
 *
 * Only available when the `djot` feature is enabled.
 */
final class DjotContent
{
    public function __construct(
        /** @var string Plain-text rendering, kept for backwards compatibility. */
        public readonly string $plainText,
        /** @var array<FormattedBlock> Structured block-level content. */
        public readonly array $blocks,
        /** @var Metadata Metadata read from YAML frontmatter. */
        public readonly Metadata $metadata,
        /** @var array<Table> Tables extracted as structured data. */
        public readonly array $tables,
        /** @var array<DjotImage> Images extracted together with their metadata. */
        public readonly array $images,
        /** @var array<DjotLink> Links extracted together with their URLs. */
        public readonly array $links,
        /** @var array<Footnote> Footnote definitions. */
        public readonly array $footnotes,
        /**
         * Attributes mapped by element identifier (if present).
         *
         * NOTE(review): the description suggests a keyed map, while the declared
         * element type is a plain `array<string>` — confirm against the generator.
         *
         * @var array<string>
         */
        public readonly array $attributes,
    ) {
    }
}
/**
 * A block-level element of a Djot document.
 *
 * Covers structural elements such as headings, paragraphs, lists and code blocks.
 */
final class FormattedBlock
{
    public function __construct(
        /** @var BlockType Kind of block element. */
        public readonly BlockType $blockType,
        /** @var array<InlineElement> Inline content held by the block. */
        public readonly array $inlineContent,
        /** @var array<FormattedBlock> Nested blocks for containers (blockquotes, list items, divs). */
        public readonly array $children,
        /** @var ?int Heading level (1-6) for headings, or nesting level for lists. */
        public readonly ?int $level = null,
        /** @var ?string Element attributes (classes, IDs, key-value pairs). */
        public readonly ?string $attributes = null,
        /** @var ?string Language identifier, for code blocks. */
        public readonly ?string $language = null,
        /** @var ?string Raw code content, for code blocks. */
        public readonly ?string $code = null,
    ) {
    }
}
/**
 * An inline element inside a block.
 *
 * Covers formatted text, links, images, etc.
 */
final class InlineElement
{
    public function __construct(
        /** @var InlineType Kind of inline element. */
        public readonly InlineType $elementType,
        /** @var string Text content. */
        public readonly string $content,
        /** @var ?string Element attributes. */
        public readonly ?string $attributes = null,
        /** @var ?array<string, string> Extra metadata (e.g., href for links, src/alt for images). */
        public readonly ?array $metadata = null,
    ) {
    }
}
/**
 * An image element in a Djot document.
 */
final class DjotImage
{
    public function __construct(
        /** @var string Source URL or path of the image. */
        public readonly string $src,
        /** @var string Alternative text. */
        public readonly string $alt,
        /** @var ?string Title, when one is given. */
        public readonly ?string $title = null,
        /** @var ?string Element attributes. */
        public readonly ?string $attributes = null,
    ) {
    }
}
/**
 * A link element in a Djot document.
 */
final class DjotLink
{
    public function __construct(
        /** @var string Target URL of the link. */
        public readonly string $url,
        /** @var string Text content of the link. */
        public readonly string $text,
        /** @var ?string Title, when one is given. */
        public readonly ?string $title = null,
        /** @var ?string Element attributes. */
        public readonly ?string $attributes = null,
    ) {
    }
}
/**
 * A footnote in a Djot document.
 */
final class Footnote
{
    public function __construct(
        /** @var string Label of the footnote. */
        public readonly string $label,
        /** @var array<FormattedBlock> Blocks making up the footnote content. */
        public readonly array $content,
    ) {
    }
}
/**
 * Top-level structured document representation.
 *
 * A flat array of nodes with index-based parent/child references forming a tree.
 * Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
 * to iterate over top-level content by layer.
 *
 * # Validation
 *
 * Call `validate()` after construction to verify all node indices are in bounds
 * and parent-child relationships are bidirectionally consistent.
 */
final class DocumentStructure
{
    public function __construct(
        /** @var array<DocumentNode> All nodes in document/reading order. */
        public readonly array $nodes,
        /**
         * Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
         *
         * Populated during derivation from the internal document representation.
         * Empty when no relationships are detected.
         *
         * @var array<DocumentRelationship>
         */
        public readonly array $relationships,
        /**
         * Sorted, deduplicated list of node type names present in this document.
         *
         * Each value is the snake_case `node_type` tag of the corresponding
         * [`NodeContent`] variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
         *
         * Computed from [`nodes`] via [`DocumentStructure::finalize_node_types`].
         * Empty until that method is called (internal construction paths call it
         * at the end of derivation).
         *
         * @var array<string>
         */
        public readonly array $nodeTypes,
        /**
         * Origin format identifier (e.g. "docx", "pptx", "html", "pdf").
         *
         * Allows renderers to apply format-aware heuristics when converting
         * the document tree to output formats.
         *
         * @var ?string
         */
        public readonly ?string $sourceFormat = null,
    ) {
    }

    /**
     * Stub for the native method that computes `$nodeTypes` from `$nodes`.
     *
     * Throws like every other stub method instead of silently doing nothing,
     * so an accidental call against the stub is not mistaken for success.
     *
     * @throws \RuntimeException Always in this stub; the real implementation is provided by the native extension.
     */
    public function finalizeNodeTypes(): void
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub for the native emptiness check.
     *
     * @throws \RuntimeException Always in this stub; the real implementation is provided by the native extension.
     */
    public function isEmpty(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub for the native default-instance factory.
     *
     * @throws \RuntimeException Always in this stub; the real implementation is provided by the native extension.
     */
    public static function default(): DocumentStructure
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * A resolved relationship linking two nodes of the document tree.
 */
final class DocumentRelationship
{
    public function __construct(
        /** @var int Index of the source node (the referencing node). */
        public readonly int $source,
        /** @var int Index of the target node (the referenced node). */
        public readonly int $target,
        /** @var RelationshipKind Semantic kind of the relationship. */
        public readonly RelationshipKind $kind,
    ) {
    }
}
/**
 * One node of the document tree.
 *
 * A node carries a deterministic `id`, typed `content`, optional `parent`/`children`
 * links for the tree structure, and metadata such as page number, bounding box and
 * content layer.
 */
final class DocumentNode
{
    public function __construct(
        /** @var string Deterministic identifier (hash of content + position). */
        public readonly string $id,
        /** @var NodeContent Node content — tagged enum, type-specific data only. */
        public readonly NodeContent $content,
        /** @var array<int> Indices of the child nodes, in reading order. */
        public readonly array $children,
        /** @var ContentLayer Content layer classification. */
        public readonly ContentLayer $contentLayer,
        /**
         * Inline annotations (formatting, links) on this node's text content.
         *
         * Only meaningful for text-carrying nodes; empty for containers.
         *
         * @var array<TextAnnotation>
         */
        public readonly array $annotations,
        /** @var ?int Index of the parent node (`None` = root-level node). */
        public readonly ?int $parent = null,
        /** @var ?int 1-indexed page number where this node starts. */
        public readonly ?int $page = null,
        /** @var ?int Page number where this node ends (for multi-page tables/sections). */
        public readonly ?int $pageEnd = null,
        /** @var ?BoundingBox Bounding box in document coordinates. */
        public readonly ?BoundingBox $bbox = null,
        /**
         * Format-specific key-value attributes.
         *
         * Extensible bag for miscellaneous data without a dedicated typed field: CSS classes,
         * LaTeX environment names, Excel cell formulas, slide layout names, etc.
         *
         * @var ?array<string, string>
         */
        public readonly ?array $attributes = null,
    ) {
    }
}
/**
 * Structured table grid carrying cell-level metadata.
 *
 * Holds the row/column dimensions plus a flat list of cells with position info.
 */
final class TableGrid
{
    public function __construct(
        /** @var int Row count of the table. */
        public readonly int $rows,
        /** @var int Column count of the table. */
        public readonly int $cols,
        /** @var array<GridCell> Every cell, in row-major order. */
        public readonly array $cells,
    ) {
    }
}
/**
 * A single grid cell with its position and span metadata.
 */
final class GridCell
{
    public function __construct(
        /** @var string Text content of the cell. */
        public readonly string $content,
        /** @var int Zero-indexed row position. */
        public readonly int $row,
        /** @var int Zero-indexed column position. */
        public readonly int $col,
        /** @var int How many rows this cell spans. */
        public readonly int $rowSpan,
        /** @var int How many columns this cell spans. */
        public readonly int $colSpan,
        /** @var bool True when this is a header cell. */
        public readonly bool $isHeader,
        /** @var ?BoundingBox Bounding box of the cell (if available). */
        public readonly ?BoundingBox $bbox = null,
    ) {
    }
}
/**
 * Inline text annotation — byte-range based formatting and links.
 *
 * An annotation refers to byte offsets within the node's text content,
 * which allows formatted regions to be identified precisely.
 */
final class TextAnnotation
{
    public function __construct(
        /** @var int Start byte offset in the node's text content (inclusive). */
        public readonly int $start,
        /** @var int End byte offset in the node's text content (exclusive). */
        public readonly int $end,
        /** @var AnnotationKind Annotation type. */
        public readonly AnnotationKind $kind,
    ) {
    }
}
/**
 * General extraction result used by the core extraction API.
 *
 * Every extraction function returns this as its main result type.
 */
final class ExtractionResult
{
    public function __construct(
        /** @var string */
        public readonly string $content,
        /** @var string */
        public readonly string $mimeType,
        /** @var Metadata */
        public readonly Metadata $metadata,
        /** @var array<Table> */
        public readonly array $tables,
        /**
         * Non-fatal warnings collected during processing pipeline stages.
         *
         * Captures errors from optional pipeline features (embedding, chunking,
         * language detection, output formatting) that don't prevent extraction
         * but may indicate degraded results.
         * Previously stored as individual keys in `metadata.additional`.
         *
         * @var array<ProcessingWarning>
         */
        public readonly array $processingWarnings,
        /**
         * Extraction strategy used to produce the returned text.
         *
         * Populated when the extractor can reliably distinguish native text extraction,
         * OCR-only extraction, or mixed native/OCR output.
         *
         * @var ?ExtractionMethod
         */
        public readonly ?ExtractionMethod $extractionMethod = null,
        /** @var ?array<string> */
        public readonly ?array $detectedLanguages = null,
        /**
         * Text chunks when chunking is enabled.
         *
         * When chunking configuration is provided, the content is split into
         * overlapping chunks for efficient processing. Each chunk contains the text,
         * optional embeddings (if enabled), and metadata about its position.
         *
         * @var ?array<Chunk>
         */
        public readonly ?array $chunks = null,
        /**
         * Extracted images from the document.
         *
         * When image extraction is enabled via `ImageExtractionConfig`, this field
         * contains all images found in the document with their raw data and metadata.
         * Each image may optionally contain a nested `ocr_result` if OCR was performed.
         *
         * @var ?array<ExtractedImage>
         */
        public readonly ?array $images = null,
        /**
         * Per-page content when page extraction is enabled.
         *
         * When page extraction is configured, the document is split into per-page content
         * with tables and images mapped to their respective pages.
         *
         * @var ?array<PageContent>
         */
        public readonly ?array $pages = null,
        /**
         * Semantic elements when element-based result format is enabled.
         *
         * When result_format is set to ElementBased, this field contains semantic
         * elements with type classification, unique identifiers, and metadata for
         * Unstructured-compatible element-based processing.
         *
         * @var ?array<Element>
         */
        public readonly ?array $elements = null,
        /**
         * Rich Djot content structure (when extracting Djot documents).
         *
         * When extracting Djot documents with structured extraction enabled,
         * this field contains the full semantic structure including:
         * - Block-level elements with nesting
         * - Inline formatting with attributes
         * - Links, images, footnotes
         * - Math expressions
         * - Complete attribute information
         *
         * The `content` field still contains plain text for backward compatibility.
         *
         * Always `None` for non-Djot documents.
         *
         * @var ?DjotContent
         */
        public readonly ?DjotContent $djotContent = null,
        /**
         * OCR elements with full spatial and confidence metadata.
         *
         * When OCR is performed with element extraction enabled, this field contains
         * the structured representation of detected text including:
         * - Bounding geometry (rectangles or quadrilaterals)
         * - Confidence scores (detection and recognition)
         * - Rotation information
         * - Hierarchical relationships (Tesseract only)
         *
         * This field preserves all metadata that would otherwise be lost when
         * converting to plain text or markdown output formats.
         *
         * Only populated when `OcrElementConfig.include_elements` is true.
         *
         * @var ?array<OcrElement>
         */
        public readonly ?array $ocrElements = null,
        /**
         * Structured document tree (when document structure extraction is enabled).
         *
         * When `include_document_structure` is true in `ExtractionConfig`, this field
         * contains the full hierarchical representation of the document including:
         * - Heading-driven section nesting
         * - Table grids with cell-level metadata
         * - Content layer classification (body, header, footer, footnote)
         * - Inline text annotations (formatting, links)
         * - Bounding boxes and page numbers
         *
         * Independent of `result_format` — can be combined with Unified or ElementBased.
         *
         * @var ?DocumentStructure
         */
        public readonly ?DocumentStructure $document = null,
        /**
         * Extracted keywords when keyword extraction is enabled.
         *
         * When keyword extraction (RAKE or YAKE) is configured, this field contains
         * the extracted keywords with scores, algorithm info, and position data.
         * Previously stored in `metadata.additional["keywords"]`.
         *
         * @var ?array<Keyword>
         */
        public readonly ?array $extractedKeywords = null,
        /**
         * Document quality score from quality analysis.
         *
         * A value between 0.0 and 1.0 indicating the overall text quality.
         * Previously stored in `metadata.additional["quality_score"]`.
         *
         * @var ?float
         */
        public readonly ?float $qualityScore = null,
        /**
         * PDF annotations extracted from the document.
         *
         * When annotation extraction is enabled via `PdfConfig::extract_annotations`,
         * this field contains text notes, highlights, links, stamps, and other
         * annotations found in PDF documents.
         *
         * @var ?array<PdfAnnotation>
         */
        public readonly ?array $annotations = null,
        /**
         * Nested extraction results from archive contents.
         *
         * When extracting archives, each processable file inside produces its own
         * full extraction result. Set to `None` for non-archive formats.
         * Use `max_archive_depth` in config to control recursion depth.
         *
         * @var ?array<ArchiveEntry>
         */
        public readonly ?array $children = null,
        /**
         * URIs/links discovered during document extraction.
         *
         * Contains hyperlinks, image references, citations, email addresses, and
         * other URI-like references found in the document. Always extracted when
         * present in the source document.
         *
         * @var ?array<ExtractedUri>
         */
        public readonly ?array $uris = null,
        /**
         * Tracked changes embedded in the source document.
         *
         * Populated by per-format extractors that understand change-tracking
         * metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
         * …). Every extractor defaults to `None` until its format-specific
         * implementation is added. Extractors that do populate this field follow
         * the "accepted-changes" convention: inserted text is present in
         * `content`, deleted text is absent — the revision list is the separate
         * audit trail.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
        /**
         * Structured extraction output from LLM-based JSON schema extraction.
         *
         * When `structured_extraction` is configured in `ExtractionConfig`, the
         * extracted document content is sent to a VLM with the provided JSON schema.
         * The response is parsed and stored here as a JSON value matching the schema.
         *
         * @var ?string
         */
        public readonly ?string $structuredOutput = null,
        /**
         * Code intelligence results from tree-sitter analysis.
         *
         * Populated when extracting source code files with the `tree-sitter` feature.
         * Contains metrics, structural analysis, imports/exports, comments,
         * docstrings, symbols, diagnostics, and optionally chunked code segments.
         *
         * Stored as an opaque JSON value so that all language bindings (Go, Java,
         * C#, …) can deserialize it as a raw JSON object rather than a typed struct.
         * The underlying type is `tree_sitter_language_pack::ProcessResult`.
         *
         * @var ?string
         */
        public readonly ?string $codeIntelligence = null,
        /**
         * LLM token usage and cost data for all LLM calls made during this extraction.
         *
         * Contains one entry per LLM call. Multiple entries are produced when
         * VLM OCR, structured extraction, or LLM embeddings run during
         * the same extraction.
         *
         * `None` when no LLM was used.
         *
         * @var ?array<LlmUsage>
         */
        public readonly ?array $llmUsage = null,
        /**
         * Pre-rendered content in the requested output format.
         *
         * Populated during `derive_extraction_result` before tree derivation consumes
         * element data. `apply_output_format` swaps this into `content` at the end
         * of the pipeline, after post-processors have operated on plain text.
         *
         * @var ?string
         */
        public readonly ?string $formattedContent = null,
        /**
         * Structured hOCR document for the OCR+layout pipeline.
         *
         * When tesseract produces hOCR output, the parsed `InternalDocument` carries
         * paragraph structure with bounding boxes and confidence scores. The layout
         * classification step enriches these elements before final rendering.
         *
         * @var ?string
         */
        public readonly ?string $ocrInternalDocument = null,
    ) {
    }

    /**
     * Stub for the native factory taking an `OcrExtractionResult`.
     *
     * @throws \RuntimeException Always in this stub; the real implementation is provided by the native extension.
     */
    public static function fromOcr(OcrExtractionResult $ocr): ExtractionResult
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * One file extracted from an archive.
 *
 * With recursive extraction enabled, every processable file inside an archive
 * (ZIP, TAR, 7Z, GZIP) yields its own full `ExtractionResult`.
 */
final class ArchiveEntry
{
    public function __construct(
        /** @var string File path relative to the archive (e.g. "folder/document.pdf"). */
        public readonly string $path,
        /** @var string MIME type detected for the file. */
        public readonly string $mimeType,
        /** @var ExtractionResult Full extraction result for this file. */
        public readonly ExtractionResult $result,
    ) {
    }
}
/**
 * Non-fatal warning raised by a processing pipeline stage.
 *
 * Records errors from optional features that do not stop extraction
 * but may point to degraded results.
 */
final class ProcessingWarning
{
    public function __construct(
        /**
         * Pipeline stage or feature that emitted the warning
         * (e.g., "embedding", "chunking", "language_detection", "output_format").
         *
         * @var string
         */
        public readonly string $source,
        /** @var string Human-readable description of what went wrong. */
        public readonly string $message,
    ) {
    }
}
/**
 * Token usage and cost data for one LLM call made during extraction.
 *
 * Populated when VLM OCR, structured extraction, or LLM-based embeddings
 * are used. Several entries may exist when one extraction makes several
 * LLM calls (e.g. VLM OCR + structured extraction).
 */
final class LlmUsage
{
    public function __construct(
        /** @var string The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514"). */
        public readonly string $model,
        /**
         * Pipeline stage that triggered the LLM call
         * (e.g. "vlm_ocr", "structured_extraction", "embeddings").
         *
         * @var string
         */
        public readonly string $source,
        /** @var ?int Count of input/prompt tokens consumed. */
        public readonly ?int $inputTokens = null,
        /** @var ?int Count of output/completion tokens generated. */
        public readonly ?int $outputTokens = null,
        /** @var ?int Total tokens (input + output). */
        public readonly ?int $totalTokens = null,
        /** @var ?float Estimated cost in USD based on the provider's published pricing. */
        public readonly ?float $estimatedCost = null,
        /** @var ?string Why the model stopped generating (e.g. "stop", "length", "content_filter"). */
        public readonly ?string $finishReason = null,
    ) {
    }
}
/**
 * Text chunk with optional embedding and metadata.
 *
 * Chunks exist when chunking is enabled in `ExtractionConfig`. Each one holds
 * the text content, an optional embedding vector (if embedding generation is
 * configured), and metadata about where it sits in the document.
 */
final class Chunk
{
    public function __construct(
        /** @var string Text content of the chunk. */
        public readonly string $content,
        /**
         * Semantic structural classification of this chunk.
         *
         * Assigned by the heuristic classifier based on content patterns and
         * heading context. Defaults to `ChunkType::Unknown` when no rule matches.
         *
         * @var ChunkType
         */
        public readonly ChunkType $chunkType,
        /** @var ChunkMetadata Metadata about this chunk's position and properties. */
        public readonly ChunkMetadata $metadata,
        /**
         * Optional embedding vector for this chunk.
         *
         * Only populated when `EmbeddingConfig` is provided in chunking configuration.
         * The dimensionality depends on the chosen embedding model.
         *
         * @var ?array<float>
         */
        public readonly ?array $embedding = null,
    ) {
    }
}
/**
 * Heading context of a chunk inside a Markdown document.
 *
 * Holds the heading hierarchy leading from the document root to the chunk's section.
 */
final class HeadingContext
{
    public function __construct(
        /**
         * Heading hierarchy from the document root down to this chunk's section.
         * Index 0 is the outermost (h1), last element is the most specific.
         *
         * @var array<HeadingLevel>
         */
        public readonly array $headings,
    ) {
    }
}
/**
 * One heading within the hierarchy.
 */
final class HeadingLevel
{
    public function __construct(
        /** @var int Heading depth (1 = h1, 2 = h2, etc.). */
        public readonly int $level,
        /** @var string Text content of the heading. */
        public readonly string $text,
    ) {
    }
}
/**
 * Metadata describing where a chunk sits in the original document.
 */
final class ChunkMetadata
{
    public function __construct(
        /** @var int Byte offset where this chunk starts in the original text (UTF-8 valid boundary). */
        public readonly int $byteStart,
        /** @var int Byte offset where this chunk ends in the original text (UTF-8 valid boundary). */
        public readonly int $byteEnd,
        /** @var int Zero-based index of this chunk in the document. */
        public readonly int $chunkIndex,
        /** @var int Total number of chunks in the document. */
        public readonly int $totalChunks,
        /**
         * Indices into `ExtractionResult.images` for images on pages covered by this chunk.
         *
         * Contains zero-based indices into the top-level `images` collection for every
         * image whose `page_number` falls within `[first_page, last_page]`.
         * Empty when image extraction is disabled or the chunk spans no pages with images.
         *
         * @var array<int>
         */
        public readonly array $imageIndices,
        /**
         * Number of tokens in this chunk (if available).
         *
         * This is calculated by the embedding model's tokenizer if embeddings are enabled.
         *
         * @var ?int
         */
        public readonly ?int $tokenCount = null,
        /**
         * First page number this chunk spans (1-indexed).
         *
         * Only populated when page tracking is enabled in extraction configuration.
         *
         * @var ?int
         */
        public readonly ?int $firstPage = null,
        /**
         * Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
         *
         * Only populated when page tracking is enabled in extraction configuration.
         *
         * @var ?int
         */
        public readonly ?int $lastPage = null,
        /**
         * Heading context when using Markdown chunker.
         *
         * Contains the heading hierarchy this chunk falls under.
         * Only populated when `ChunkerType::Markdown` is used.
         *
         * @var ?HeadingContext
         */
        public readonly ?HeadingContext $headingContext = null,
    ) {
    }
}
/**
 * Image extracted from a document.
 *
 * Holds the raw image data, metadata, and optional nested OCR results.
 * Raw bytes allow cross-language compatibility - users can convert to
 * PIL.Image (Python), Sharp (Node.js), or other formats as needed.
 */
final class ExtractedImage
{
    public function __construct(
        /**
         * Raw image data (PNG, JPEG, WebP, etc. bytes).
         * Uses `bytes::Bytes` for cheap cloning of large buffers.
         *
         * @var string
         */
        public readonly string $data,
        /**
         * Image format (e.g., "jpeg", "png", "webp")
         * Uses Cow<'static, str> to avoid allocation for static literals.
         *
         * @var string
         */
        public readonly string $format,
        /** @var int Zero-indexed position of this image in the document/page. */
        public readonly int $imageIndex,
        /** @var bool True when this image is a mask image. */
        public readonly bool $isMask,
        /** @var ?int 1-indexed page/slide number where the image was found. */
        public readonly ?int $pageNumber = null,
        /** @var ?int Width of the image in pixels. */
        public readonly ?int $width = null,
        /** @var ?int Height of the image in pixels. */
        public readonly ?int $height = null,
        /** @var ?string Colorspace information (e.g., "RGB", "CMYK", "Gray"). */
        public readonly ?string $colorspace = null,
        /** @var ?int Bits per color component (e.g., 8, 16). */
        public readonly ?int $bitsPerComponent = null,
        /** @var ?string Optional description of the image. */
        public readonly ?string $description = null,
        /**
         * Nested OCR extraction result (if image was OCRed)
         *
         * When OCR is performed on this image, the result is embedded here
         * rather than in a separate collection, making the relationship explicit.
         *
         * @var ?ExtractionResult
         */
        public readonly ?ExtractionResult $ocrResult = null,
        /**
         * Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
         * Only populated for PDF-extracted images when position data is available from the PDF extractor.
         *
         * @var ?BoundingBox
         */
        public readonly ?BoundingBox $boundingBox = null,
        /**
         * Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
         * Used for rendering image references when the binary data is not extracted.
         *
         * @var ?string
         */
        public readonly ?string $sourcePath = null,
        /**
         * Heuristic classification of what this image likely depicts.
         * `None` if classification was disabled or inconclusive.
         *
         * @var ?ImageKind
         */
        public readonly ?ImageKind $imageKind = null,
        /** @var ?float Confidence score for `image_kind`, in the range 0.0 to 1.0. */
        public readonly ?float $kindConfidence = null,
        /**
         * Identifier shared across images that form a single logical figure
         * (e.g. all raster tiles of one technical drawing). `None` for singletons.
         *
         * @var ?int
         */
        public readonly ?int $clusterId = null,
    ) {
    }
}
/**
 * Bounding-box coordinates used to position an element.
 */
final class BoundingBox
{
    public function __construct(
        /** @var float Left x-coordinate. */
        public readonly float $x0,
        /** @var float Bottom y-coordinate. */
        public readonly float $y0,
        /** @var float Right x-coordinate. */
        public readonly float $x1,
        /** @var float Top y-coordinate. */
        public readonly float $y1,
    ) {
    }
}
/**
 * Metadata attached to a semantic element.
 */
final class ElementMetadata
{
    public function __construct(
        /** @var array<string, string> Additional custom metadata. */
        public readonly array $additional,
        /** @var ?int 1-indexed page number. */
        public readonly ?int $pageNumber = null,
        /** @var ?string Source filename or document name. */
        public readonly ?string $filename = null,
        /** @var ?BoundingBox Bounding box coordinates, if available. */
        public readonly ?BoundingBox $coordinates = null,
        /** @var ?int Position index within the element sequence. */
        public readonly ?int $elementIndex = null,
    ) {
    }
}
/**
 * Semantic element extracted from a document.
 *
 * A logical unit of content carrying a semantic classification, a unique
 * identifier, and metadata for tracking origin and position.
 */
final class Element
{
    public function __construct(
        /** @var string Unique identifier of the element. */
        public readonly string $elementId,
        /** @var ElementType Semantic type of the element. */
        public readonly ElementType $elementType,
        /** @var string Text content of the element. */
        public readonly string $text,
        /** @var ElementMetadata Metadata about the element. */
        public readonly ElementMetadata $metadata,
    ) {
    }
}
/**
 * Representation of an Excel workbook.
 *
 * Holds every sheet of an Excel file (.xlsx, .xls, etc.) together with
 * the extracted content and metadata.
 */
final class ExcelWorkbook
{
    public function __construct(
        /** @var array<ExcelSheet> Every sheet in the workbook. */
        public readonly array $sheets,
        /** @var array<string, string> Workbook-level metadata (author, creation date, etc.). */
        public readonly array $metadata,
        /**
         * Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
         *
         * Populated for legacy shared-workbook `.xlsx` files that contain the
         * `xl/revisions/` directory. Each `<header>` element maps to one
         * `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
         * (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
         * `anchor` and `delta` are `None`/empty for v1 (per-cell log parsing is a
         * follow-up). `None` when `xl/revisions/revisionHeaders.xml` is absent.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
    ) {
    }
}
/**
 * A single Excel worksheet.
 *
 * One sheet of an Excel workbook, with its content converted to Markdown
 * format plus dimensional statistics.
 */
final class ExcelSheet
{
    public function __construct(
        /** @var string Sheet name as shown in Excel. */
        public readonly string $name,
        /** @var string Sheet content converted to Markdown tables. */
        public readonly string $markdown,
        /** @var int Row count. */
        public readonly int $rowCount,
        /** @var int Column count. */
        public readonly int $colCount,
        /** @var int Total number of non-empty cells. */
        public readonly int $cellCount,
        /**
         * Pre-extracted table cells (2D vector of cell values)
         * Populated during markdown generation to avoid re-parsing markdown.
         * None for empty sheets.
         *
         * @var ?array<array<string>>
         */
        public readonly ?array $tableCells = null,
    ) {
    }
}
/**
 * Result of an XML extraction.
 *
 * Holds the text content extracted from an XML file together with
 * structural statistics about the XML document.
 */
final class XmlExtractionResult
{
    public function __construct(
        /** @var string Extracted text content (XML structure filtered out). */
        public readonly string $content,
        /** @var int Total number of XML elements processed. */
        public readonly int $elementCount,
        /** @var array<string> Unique element names found (sorted). */
        public readonly array $uniqueElements,
    ) {
    }
}
/**
 * Result of a plain-text or Markdown extraction.
 *
 * Holds the extracted text plus statistics and, for Markdown files,
 * structural elements such as headers and links.
 */
final class TextExtractionResult
{
    public function __construct(
        /** @var string Extracted text content. */
        public readonly string $content,
        /** @var int Line count. */
        public readonly int $lineCount,
        /** @var int Word count. */
        public readonly int $wordCount,
        /** @var int Character count. */
        public readonly int $characterCount,
        /** @var ?array<string> Markdown headers (text only, Markdown files only). */
        public readonly ?array $headers = null,
        /** @var ?array<array<string>> Markdown links as (text, URL) tuples (Markdown files only). */
        public readonly ?array $links = null,
        /** @var ?array<array<string>> Code blocks as (language, code) tuples (Markdown files only). */
        public readonly ?array $codeBlocks = null,
    ) {
    }
}
/**
 * Result of a PowerPoint (PPTX) extraction.
 *
 * Holds the extracted slide content, metadata, and embedded images/tables.
 */
final class PptxExtractionResult
{
    public function __construct(
        /** @var string Text content extracted from all slides. */
        public readonly string $content,
        /** @var PptxMetadata Presentation metadata. */
        public readonly PptxMetadata $metadata,
        /** @var int Total number of slides. */
        public readonly int $slideCount,
        /** @var int Total number of embedded images. */
        public readonly int $imageCount,
        /** @var int Total number of tables. */
        public readonly int $tableCount,
        /** @var array<ExtractedImage> Images extracted from the presentation. */
        public readonly array $images,
        /**
         * Hyperlinks discovered in slides as (url, optional_label) pairs.
         *
         * NOTE(review): the description says pairs, while the declared element
         * type is a plain `array<string>` — confirm against the generator.
         *
         * @var array<string>
         */
        public readonly array $hyperlinks,
        /**
         * Office metadata extracted from docProps/core.xml and docProps/app.xml.
         *
         * Contains keys like "title", "author", "created_by", "subject", "keywords",
         * "modified_by", "created_at", "modified_at", etc.
         *
         * @var array<string, string>
         */
        public readonly array $officeMetadata,
        /** @var ?PageStructure Slide structure with boundaries (when page tracking is enabled). */
        public readonly ?PageStructure $pageStructure = null,
        /** @var ?array<PageContent> Per-slide content (when page tracking is enabled). */
        public readonly ?array $pageContents = null,
        /** @var ?DocumentStructure Structured document representation. */
        public readonly ?DocumentStructure $document = null,
        /**
         * Slide comments as revisions.
         *
         * Each `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
         * `DocumentRevision { kind: Comment }` with author (resolved from
         * `ppt/commentAuthors.xml`), ISO-8601 timestamp, and
         * `RevisionAnchor::Slide { index }`. `None` when no comment XML parts exist.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
    ) {
    }
}
/**
 * Result of extracting an email message.
 *
 * Full representation of an extracted email (.eml or .msg): headers,
 * body content, and attachments.
 */
final class EmailExtractionResult
{
    public function __construct(
        /** @var array<string> Email addresses of the primary recipients */
        public readonly array $toEmails,
        /** @var array<string> Email addresses of the CC recipients */
        public readonly array $ccEmails,
        /** @var array<string> Email addresses of the BCC recipients */
        public readonly array $bccEmails,
        /** @var string Cleaned/processed text content. Aliased as `cleaned_text` for back-compat. */
        public readonly string $content,
        /** @var array<EmailAttachment> Attachments of the email */
        public readonly array $attachments,
        /** @var array<string, string> Additional email headers and metadata */
        public readonly array $metadata,
        /** @var ?string Subject line of the email */
        public readonly ?string $subject = null,
        /** @var ?string Email address of the sender */
        public readonly ?string $fromEmail = null,
        /** @var ?string Email date/timestamp */
        public readonly ?string $date = null,
        /** @var ?string Value of the Message-ID header */
        public readonly ?string $messageId = null,
        /** @var ?string Plain-text version of the email body */
        public readonly ?string $plainText = null,
        /** @var ?string HTML version of the email body */
        public readonly ?string $htmlContent = null,
    ) {}
}
/**
 * A single email attachment.
 *
 * Carries the attachment's metadata and, optionally, its content.
 */
final class EmailAttachment
{
    public function __construct(
        /** @var bool Whether the attachment is an image */
        public readonly bool $isImage,
        /** @var ?string Attachment name (from the Content-Disposition header) */
        public readonly ?string $name = null,
        /** @var ?string Filename of the attachment */
        public readonly ?string $filename = null,
        /** @var ?string MIME type of the attachment */
        public readonly ?string $mimeType = null,
        /** @var ?int Size in bytes */
        public readonly ?int $size = null,
        /**
         * Attachment data (if extracted).
         * The native core uses `bytes::Bytes` for cheap cloning of large buffers.
         *
         * @var ?string
         */
        public readonly ?string $data = null,
    ) {}
}
/**
 * Result of an OCR extraction.
 *
 * Produced by running OCR on an image or scanned document; includes the
 * recognized text and any detected tables.
 */
final class OcrExtractionResult
{
    public function __construct(
        /** @var string Recognized text content */
        public readonly string $content,
        /** @var string Original MIME type of the processed image */
        public readonly string $mimeType,
        /** @var array<string, string> OCR processing metadata (confidence scores, language, etc.) */
        public readonly array $metadata,
        /** @var array<OcrTable> Tables detected and extracted via OCR */
        public readonly array $tables,
        /**
         * Structured OCR elements with bounding boxes and confidence scores.
         * Present when TSV output is requested or table detection is enabled.
         *
         * @var ?array<OcrElement>
         */
        public readonly ?array $ocrElements = null,
        /**
         * Structured document produced from hOCR parsing.
         * Keeps the paragraph structure, bounding boxes, and confidence scores
         * that the flattened `content` string discards.
         *
         * @var ?string
         */
        public readonly ?string $internalDocument = null,
    ) {}
}
/**
 * A table detected via OCR.
 *
 * Describes a table structure recognized during OCR processing.
 */
final class OcrTable
{
    public function __construct(
        /** @var array<array<string>> Table cells as a 2D list (rows × columns) */
        public readonly array $cells,
        /** @var string Markdown rendering of the table */
        public readonly string $markdown,
        /** @var int Page number on which the table was found (1-indexed) */
        public readonly int $pageNumber,
        /** @var ?OcrTableBoundingBox Bounding box of the table in pixel coordinates (from OCR word positions). */
        public readonly ?OcrTableBoundingBox $boundingBox = null,
    ) {}
}
/**
 * Pixel-coordinate bounding box of an OCR-detected table.
 */
final class OcrTableBoundingBox
{
    public function __construct(
        /** @var int Left x-coordinate (pixels) */
        public readonly int $left,
        /** @var int Top y-coordinate (pixels) */
        public readonly int $top,
        /** @var int Right x-coordinate (pixels) */
        public readonly int $right,
        /** @var int Bottom y-coordinate (pixels) */
        public readonly int $bottom,
    ) {}
}
/**
 * Configuration of image preprocessing for OCR.
 *
 * Governs how images are preprocessed ahead of OCR in order to improve
 * text recognition quality. Different preprocessing strategies suit
 * different document types.
 */
final class ImagePreprocessingConfig
{
    public function __construct(
        /** @var int Target DPI for the image (300 is standard, 600 for small text). */
        public readonly int $targetDpi,
        /** @var bool Auto-detect and correct image rotation. */
        public readonly bool $autoRotate,
        /** @var bool Correct skew (tilted images). */
        public readonly bool $deskew,
        /** @var bool Remove noise from the image. */
        public readonly bool $denoise,
        /** @var bool Enhance contrast for better text visibility. */
        public readonly bool $contrastEnhance,
        /** @var string Binarization method: "otsu", "sauvola", "adaptive". */
        public readonly string $binarizationMethod,
        /** @var bool Invert colors (white text on black → black on white). */
        public readonly bool $invertColors,
    ) {}

    /**
     * Stub only: the body always throws; the real implementation is
     * supplied by the native extension at runtime.
     */
    public static function default(): ImagePreprocessingConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration of the Tesseract OCR engine.
 *
 * Gives fine-grained control over Tesseract engine parameters.
 * The defaults suit most users; these settings allow tuning for
 * specific document types (invoices, handwriting, etc.).
 */
final class TesseractConfig
{
    public function __construct(
        /** @var string Language code (e.g., "eng", "deu", "fra") */
        public readonly string $language,
        /**
         * Page Segmentation Mode (0-13).
         *
         * Common values:
         * - 3: Fully automatic page segmentation (native default)
         * - 6: Assume a single uniform block of text (WASM default avoids layout-analysis hang)
         * - 11: Sparse text with no particular order
         *
         * @var int
         */
        public readonly int $psm,
        /** @var string Output format ("text" or "markdown") */
        public readonly string $outputFormat,
        /**
         * OCR Engine Mode (0-3).
         *
         * - 0: Legacy engine only
         * - 1: Neural nets (LSTM) only (usually best)
         * - 2: Legacy + LSTM
         * - 3: Default (based on what's available)
         *
         * @var int
         */
        public readonly int $oem,
        /**
         * Minimum confidence threshold (0.0-100.0).
         *
         * Words whose confidence falls below this threshold may be rejected or flagged.
         *
         * @var float
         */
        public readonly float $minConfidence,
        /** @var bool Enable automatic table detection and reconstruction */
        public readonly bool $enableTableDetection,
        /** @var float Minimum confidence threshold for table detection (0.0-1.0) */
        public readonly float $tableMinConfidence,
        /** @var int Column threshold for table detection (pixels) */
        public readonly int $tableColumnThreshold,
        /** @var float Row threshold ratio for table detection (0.0-1.0) */
        public readonly float $tableRowThresholdRatio,
        /** @var bool Enable OCR result caching */
        public readonly bool $useCache,
        /** @var bool Use pre-adapted templates for character classification */
        public readonly bool $classifyUsePreAdaptedTemplates,
        /** @var bool Enable N-gram language model */
        public readonly bool $languageModelNgramOn,
        /** @var bool Don't reject good words during block-level processing */
        public readonly bool $tesseditDontBlkrejGoodWds,
        /** @var bool Don't reject good words during row-level processing */
        public readonly bool $tesseditDontRowrejGoodWds,
        /** @var bool Enable dictionary correction */
        public readonly bool $tesseditEnableDictCorrection,
        /** @var string Whitelist of allowed characters (empty = all allowed) */
        public readonly string $tesseditCharWhitelist,
        /** @var string Blacklist of forbidden characters (empty = none forbidden) */
        public readonly string $tesseditCharBlacklist,
        /** @var bool Use primary language params model */
        public readonly bool $tesseditUsePrimaryParamsModel,
        /** @var bool Variable-width space detection */
        public readonly bool $textordSpaceSizeIsVariable,
        /** @var bool Use adaptive thresholding method */
        public readonly bool $thresholdingMethod,
        /**
         * Image preprocessing configuration.
         *
         * Determines how images are preprocessed before OCR. Can significantly
         * improve quality for scanned documents or low-quality images.
         *
         * @var ?ImagePreprocessingConfig
         */
        public readonly ?ImagePreprocessingConfig $preprocessing = null,
    ) {}

    /**
     * Stub only: the body always throws; the real implementation is
     * supplied by the native extension at runtime.
     */
    public static function default(): TesseractConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Metadata about image preprocessing.
 *
 * Records the transformations applied to an image during OCR preprocessing,
 * including DPI normalization, resizing, and resampling.
 */
final class ImagePreprocessingMetadata
{
    public function __construct(
        /** @var array<int> Original image dimensions (width, height) in pixels */
        public readonly array $originalDimensions,
        /** @var array<float> Original image DPI (horizontal, vertical) */
        public readonly array $originalDpi,
        /** @var int Target DPI taken from configuration */
        public readonly int $targetDpi,
        /** @var float Scaling factor applied to the image */
        public readonly float $scaleFactor,
        /** @var bool Whether DPI was auto-adjusted based on content */
        public readonly bool $autoAdjusted,
        /** @var int Final DPI after processing */
        public readonly int $finalDpi,
        /** @var string Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.) */
        public readonly string $resampleMethod,
        /** @var bool Whether dimensions were clamped to max_image_dimension */
        public readonly bool $dimensionClamped,
        /** @var bool Whether resize was skipped (dimensions already optimal) */
        public readonly bool $skippedResize,
        /** @var ?array<int> New dimensions after resizing (if resized) */
        public readonly ?array $newDimensions = null,
        /** @var ?int Calculated optimal DPI (if auto_adjust_dpi enabled) */
        public readonly ?int $calculatedDpi = null,
        /** @var ?string Error message if resize failed */
        public readonly ?string $resizeError = null,
    ) {}
}
/**
 * Metadata attached to an extraction result.
 *
 * Combines fields common to every format, format-specific metadata
 * (a discriminated union), and additional custom fields from postprocessors.
 */
final class Metadata
{
    public function __construct(
        /**
         * Whether OCR was used during extraction.
         *
         * `true` whenever the extraction pipeline ran an OCR backend
         * (Tesseract, PaddleOCR, VLM, etc.) and used its output as the primary
         * or fallback text. `false` means native text extraction was used exclusively.
         *
         * @var bool
         */
        public readonly bool $ocrUsed,
        /**
         * Additional custom fields from postprocessors.
         *
         * Serialized as a nested `"additional"` object (not flattened at root level).
         * The native core uses `Cow<'static, str>` keys so static string keys avoid allocation.
         *
         * @var array<string, string>
         */
        public readonly array $additional,
        /** @var ?string Document title */
        public readonly ?string $title = null,
        /** @var ?string Document subject or description */
        public readonly ?string $subject = null,
        /** @var ?array<string> Primary author(s) - always a list for consistency */
        public readonly ?array $authors = null,
        /** @var ?array<string> Keywords/tags - always a list for consistency */
        public readonly ?array $keywords = null,
        /** @var ?string Primary language (ISO 639 code) */
        public readonly ?string $language = null,
        /** @var ?string Creation timestamp (ISO 8601 format) */
        public readonly ?string $createdAt = null,
        /** @var ?string Last modification timestamp (ISO 8601 format) */
        public readonly ?string $modifiedAt = null,
        /** @var ?string User who created the document */
        public readonly ?string $createdBy = null,
        /** @var ?string User who last modified the document */
        public readonly ?string $modifiedBy = null,
        /** @var ?PageStructure Page/slide/sheet structure with boundaries */
        public readonly ?PageStructure $pages = null,
        /**
         * Format-specific metadata (discriminated union).
         *
         * Detailed metadata specific to the document format.
         * Serialized as a nested `"format"` object with a `format_type` discriminator field.
         *
         * @var ?FormatMetadata
         */
        public readonly ?FormatMetadata $format = null,
        /** @var ?ImagePreprocessingMetadata Image preprocessing metadata (when OCR preprocessing was applied) */
        public readonly ?ImagePreprocessingMetadata $imagePreprocessing = null,
        /** @var ?string JSON schema (for structured data extraction) */
        public readonly ?string $jsonSchema = null,
        /** @var ?ErrorMetadata Error metadata (for batch operations) */
        public readonly ?ErrorMetadata $error = null,
        /**
         * Extraction duration in milliseconds (for benchmarking).
         *
         * Populated by batch extraction to provide per-file timing information.
         * `null` for single-file extraction (which uses external timing).
         *
         * @var ?int
         */
        public readonly ?int $extractionDurationMs = null,
        /** @var ?string Document category (from frontmatter or classification). */
        public readonly ?string $category = null,
        /** @var ?array<string> Document tags (from frontmatter). */
        public readonly ?array $tags = null,
        /** @var ?string Document version string (from frontmatter). */
        public readonly ?string $documentVersion = null,
        /** @var ?string Abstract or summary text (from frontmatter). */
        public readonly ?string $abstractText = null,
        /**
         * Output format identifier (e.g., "markdown", "html", "text").
         *
         * Set by the output format pipeline stage when format conversion is applied.
         * Previously stored in `metadata.additional["output_format"]`.
         *
         * @var ?string
         */
        public readonly ?string $outputFormat = null,
    ) {}

    /**
     * Stub only: the body always throws; the real implementation is
     * supplied by the native extension at runtime.
     */
    public function isEmpty(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Metadata for Excel/spreadsheet formats.
 *
 * Marks the document as a spreadsheet source via the `FormatMetadata.Excel`
 * discriminant. Sheet count and sheet names live inside this struct.
 */
final class ExcelMetadata
{
    public function __construct(
        /** @var ?int Number of sheets in the workbook. */
        public readonly ?int $sheetCount = null,
        /** @var ?array<string> Names of all sheets in the workbook. */
        public readonly ?array $sheetNames = null,
    ) {}
}
/**
 * Metadata extracted from .eml and .msg email files.
 *
 * Covers sender/recipient information, the message ID, and the attachment list.
 */
final class EmailMetadata
{
    public function __construct(
        /** @var array<string> Primary recipients */
        public readonly array $toEmails,
        /** @var array<string> CC recipients */
        public readonly array $ccEmails,
        /** @var array<string> BCC recipients */
        public readonly array $bccEmails,
        /** @var array<string> Filenames of the attachments */
        public readonly array $attachments,
        /** @var ?string Email address of the sender */
        public readonly ?string $fromEmail = null,
        /** @var ?string Display name of the sender */
        public readonly ?string $fromName = null,
        /** @var ?string Value of the Message-ID header */
        public readonly ?string $messageId = null,
    ) {}
}
/**
 * Metadata for archives (ZIP/TAR/7Z).
 *
 * Extracted from compressed archive files; contains file lists and size information.
 */
final class ArchiveMetadata
{
    public function __construct(
        /** @var string Archive format ("ZIP", "TAR", "7Z", etc.) */
        public readonly string $format,
        /** @var int Total count of files in the archive */
        public readonly int $fileCount,
        /** @var array<string> File paths contained in the archive */
        public readonly array $fileList,
        /** @var int Total uncompressed size in bytes */
        public readonly int $totalSize,
        /** @var ?int Compressed size in bytes (if available) */
        public readonly ?int $compressedSize = null,
    ) {}
}
/**
 * Metadata extracted from image files.
 *
 * Covers dimensions, format, and EXIF data.
 */
final class ImageMetadata
{
    public function __construct(
        /** @var int Width of the image in pixels */
        public readonly int $width,
        /** @var int Height of the image in pixels */
        public readonly int $height,
        /** @var string Image format (e.g., "PNG", "JPEG", "TIFF") */
        public readonly string $format,
        /** @var array<string, string> EXIF metadata tags */
        public readonly array $exif,
    ) {}
}
/**
 * Metadata gathered during XML parsing.
 *
 * Statistics about the structure of the XML document.
 */
final class XmlMetadata
{
    public function __construct(
        /** @var int Total count of XML elements processed */
        public readonly int $elementCount,
        /** @var array<string> Unique element tag names (sorted) */
        public readonly array $uniqueElements,
    ) {}
}
/**
 * Metadata for text/Markdown documents.
 *
 * Extracted from plain text and Markdown files. Covers word counts and,
 * for Markdown, structural elements such as headers and links.
 */
final class TextMetadata
{
    public function __construct(
        /** @var int Line count of the document */
        public readonly int $lineCount,
        /** @var int Word count */
        public readonly int $wordCount,
        /** @var int Character count */
        public readonly int $characterCount,
        /** @var ?array<string> Markdown headers (headings text only, for Markdown files) */
        public readonly ?array $headers = null,
        /** @var ?array<array<string>> Markdown links as (text, url) tuples (for Markdown files) */
        public readonly ?array $links = null,
        /** @var ?array<array<string>> Code blocks as (language, code) tuples (for Markdown files) */
        public readonly ?array $codeBlocks = null,
    ) {}
}
/**
 * Metadata describing a header/heading element.
 */
final class HeaderMetadata
{
    public function __construct(
        /** @var int Header level: 1 (h1) through 6 (h6) */
        public readonly int $level,
        /** @var string Normalized text content of the header */
        public readonly string $text,
        /** @var int Document tree depth at the header element */
        public readonly int $depth,
        /** @var int Byte offset in the original HTML document */
        public readonly int $htmlOffset,
        /** @var ?string HTML id attribute, if present */
        public readonly ?string $id = null,
    ) {}
}
/**
 * Metadata describing a link element.
 */
final class LinkMetadata
{
    public function __construct(
        /** @var string Value of the href URL */
        public readonly string $href,
        /** @var string Link text content (normalized) */
        public readonly string $text,
        /** @var LinkType Classification of the link type */
        public readonly LinkType $linkType,
        /** @var array<string> Values of the rel attribute */
        public readonly array $rel,
        /** @var array<array<string>> Additional attributes as key-value pairs */
        public readonly array $attributes,
        /** @var ?string Optional title attribute */
        public readonly ?string $title = null,
    ) {}
}
/**
 * Metadata describing an image element.
 */
final class ImageMetadataType
{
    public function __construct(
        /** @var string Image source (URL, data URI, or SVG content) */
        public readonly string $src,
        /** @var ImageType Classification of the image type */
        public readonly ImageType $imageType,
        /** @var array<array<string>> Additional attributes as key-value pairs */
        public readonly array $attributes,
        /** @var ?string Alternative text taken from the alt attribute */
        public readonly ?string $alt = null,
        /** @var ?string Title attribute */
        public readonly ?string $title = null,
        /** @var ?array<int> Image dimensions as (width, height) if available */
        public readonly ?array $dimensions = null,
    ) {}
}
/**
 * A block of structured data (Schema.org, microdata, RDFa).
 */
final class StructuredData
{
    public function __construct(
        /** @var StructuredDataType Kind of structured data */
        public readonly StructuredDataType $dataType,
        /** @var string Raw JSON string representation */
        public readonly string $rawJson,
        /** @var ?string Schema type if detectable (e.g., "Article", "Event", "Product") */
        public readonly ?string $schemaType = null,
    ) {}
}
/**
 * Metadata extracted from HTML documents.
 *
 * Covers document-level metadata, Open Graph data, Twitter Card metadata,
 * and extracted structural elements (headers, links, images, structured data).
 */
final class HtmlMetadata
{
    public function __construct(
        /** @var array<string> Document keywords from the `<meta name="keywords">` tag, split on commas */
        public readonly array $keywords,
        /**
         * Open Graph metadata (og:* properties) for social media.
         * Keys such as "title", "description", "image", "url", etc.
         *
         * @var array<string, string>
         */
        public readonly array $openGraph,
        /**
         * Twitter Card metadata (twitter:* properties).
         * Keys such as "card", "site", "creator", "title", "description", "image", etc.
         *
         * @var array<string, string>
         */
        public readonly array $twitterCard,
        /**
         * Additional meta tags not covered by specific fields.
         * Keys are meta name/property attributes, values are content.
         *
         * @var array<string, string>
         */
        public readonly array $metaTags,
        /** @var array<HeaderMetadata> Extracted header elements with hierarchy */
        public readonly array $headers,
        /** @var array<LinkMetadata> Extracted hyperlinks with type classification */
        public readonly array $links,
        /** @var array<ImageMetadataType> Extracted images with source and dimensions */
        public readonly array $images,
        /** @var array<StructuredData> Extracted structured data blocks */
        public readonly array $structuredData,
        /** @var ?string Document title from the `<title>` tag */
        public readonly ?string $title = null,
        /** @var ?string Document description from the `<meta name="description">` tag */
        public readonly ?string $description = null,
        /** @var ?string Document author from the `<meta name="author">` tag */
        public readonly ?string $author = null,
        /** @var ?string Canonical URL from the `<link rel="canonical">` tag */
        public readonly ?string $canonicalUrl = null,
        /** @var ?string Base URL from the `<base href="">` tag for resolving relative URLs */
        public readonly ?string $baseHref = null,
        /** @var ?string Document language from the `lang` attribute */
        public readonly ?string $language = null,
        /** @var ?TextDirection Document text direction from the `dir` attribute */
        public readonly ?TextDirection $textDirection = null,
    ) {}
}
/**
 * Metadata about OCR processing.
 *
 * Records information about the OCR processing configuration and results.
 */
final class OcrMetadata
{
    public function __construct(
        /** @var string OCR language code(s) used */
        public readonly string $language,
        /** @var int Tesseract Page Segmentation Mode (PSM) */
        public readonly int $psm,
        /** @var string Output format (e.g., "text", "hocr") */
        public readonly string $outputFormat,
        /** @var int Count of tables detected */
        public readonly int $tableCount,
        /** @var ?int */
        public readonly ?int $tableRows = null,
        /** @var ?int */
        public readonly ?int $tableCols = null,
    ) {}
}
/**
 * Metadata describing an error (used for batch operations).
 */
final class ErrorMetadata
{
    public function __construct(
        /** @var string */
        public readonly string $errorType,
        /** @var string */
        public readonly string $message,
    ) {}
}
/**
 * Metadata for PowerPoint presentations.
 *
 * Extracted from PPTX files; contains slide counts and presentation details.
 */
final class PptxMetadata
{
    public function __construct(
        /** @var int Total slide count of the presentation */
        public readonly int $slideCount,
        /** @var array<string> Slide names (if available) */
        public readonly array $slideNames,
        /** @var ?int Count of embedded images */
        public readonly ?int $imageCount = null,
        /** @var ?int Count of tables */
        public readonly ?int $tableCount = null,
    ) {}
}
/**
 * Metadata for Word documents.
 *
 * Extracted from DOCX files via the shared Office Open XML metadata extraction.
 * Integrates with the `office_metadata` module for core/app/custom properties.
 */
final class DocxMetadata
{
    public function __construct(
        /**
         * Core properties from docProps/core.xml (Dublin Core metadata).
         *
         * Holds title, creator, subject, keywords, dates, etc.
         * The format is shared across DOCX/PPTX/XLSX documents.
         *
         * @var ?CoreProperties
         */
        public readonly ?CoreProperties $coreProperties = null,
        /**
         * Application properties from docProps/app.xml (Word-specific statistics).
         *
         * Holds word count, page count, paragraph count, editing time, etc.
         * DOCX-specific variant of Office application properties.
         *
         * @var ?DocxAppProperties
         */
        public readonly ?DocxAppProperties $appProperties = null,
        /**
         * Custom properties from docProps/custom.xml (user-defined properties).
         *
         * Holds key-value pairs defined by users or applications.
         * Values can be strings, numbers, booleans, or dates.
         *
         * @var ?array<string, string>
         */
        public readonly ?array $customProperties = null,
    ) {}
}
/**
 * Metadata for CSV/TSV files.
 */
final class CsvMetadata
{
    public function __construct(
        /** @var int */
        public readonly int $rowCount,
        /** @var int */
        public readonly int $columnCount,
        /** @var bool */
        public readonly bool $hasHeader,
        /** @var ?string */
        public readonly ?string $delimiter = null,
        /** @var ?array<string> */
        public readonly ?array $columnTypes = null,
    ) {}
}
/**
 * Metadata for BibTeX bibliographies.
 */
final class BibtexMetadata
{
    public function __construct(
        /** @var int Number of entries in the bibliography. */
        public readonly int $entryCount,
        /** @var array<string> */
        public readonly array $citationKeys,
        /** @var array<string> */
        public readonly array $authors,
        /** @var ?YearRange */
        public readonly ?YearRange $yearRange = null,
        /** @var ?array<string, int> */
        public readonly ?array $entryTypes = null,
    ) {}
}
/**
 * Metadata for citation files (RIS, PubMed, EndNote).
 */
final class CitationMetadata
{
    public function __construct(
        /** @var int */
        public readonly int $citationCount,
        /** @var array<string> */
        public readonly array $authors,
        /** @var array<string> */
        public readonly array $dois,
        /** @var array<string> */
        public readonly array $keywords,
        /** @var ?string */
        public readonly ?string $format = null,
        /** @var ?YearRange */
        public readonly ?YearRange $yearRange = null,
    ) {}
}
/**
 * Range of years used in bibliographic metadata.
 */
final class YearRange
{
    public function __construct(
        /** @var array<int> */
        public readonly array $years,
        /** @var ?int */
        public readonly ?int $min = null,
        /** @var ?int */
        public readonly ?int $max = null,
    ) {}
}
/**
 * Metadata for FictionBook (FB2) files.
 */
final class FictionBookMetadata
{
    public function __construct(
        /** @var array<string> */
        public readonly array $genres,
        /** @var array<string> */
        public readonly array $sequences,
        /** @var ?string */
        public readonly ?string $annotation = null,
    ) {}
}
/**
 * Metadata for dBASE (DBF) files.
 */
final class DbfMetadata
{
    public function __construct(
        /** @var int */
        public readonly int $recordCount,
        /** @var int */
        public readonly int $fieldCount,
        /** @var array<DbfFieldInfo> */
        public readonly array $fields,
    ) {}
}
/**
 * Information about a single dBASE field.
 */
final class DbfFieldInfo
{
    public function __construct(
        /** @var string */
        public readonly string $name,
        /** @var string */
        public readonly string $fieldType,
    ) {}
}
/**
 * Metadata for JATS (Journal Article Tag Suite) documents.
 */
final class JatsMetadata
{
    public function __construct(
        /** @var array<string, string> */
        public readonly array $historyDates,
        /** @var array<ContributorRole> */
        public readonly array $contributorRoles,
        /** @var ?string */
        public readonly ?string $copyright = null,
        /** @var ?string */
        public readonly ?string $license = null,
    ) {}
}
/**
 * A JATS contributor together with their role.
 */
final class ContributorRole
{
    public function __construct(
        /** @var string */
        public readonly string $name,
        /** @var ?string */
        public readonly ?string $role = null,
    ) {}
}
/**
 * Metadata for EPUB files (Dublin Core extensions).
 */
final class EpubMetadata
{
    public function __construct(
        /** @var ?string */
        public readonly ?string $coverage = null,
        /** @var ?string */
        public readonly ?string $dcFormat = null,
        /** @var ?string */
        public readonly ?string $relation = null,
        /** @var ?string */
        public readonly ?string $source = null,
        /** @var ?string */
        public readonly ?string $dcType = null,
        /** @var ?string */
        public readonly ?string $coverImage = null,
    ) {}
}
/**
 * Metadata for Outlook PST archives.
 */
final class PstMetadata
{
    public function __construct(
        /** @var int */
        public readonly int $messageCount,
    ) {}
}
/**
 * Confidence scores attached to an OCR element.
 *
 * Keeps detection confidence (how sure that text exists at this location)
 * separate from recognition confidence (how sure about the actual text content).
 */
final class OcrConfidence
{
    public function __construct(
        /**
         * Recognition confidence: how sure the engine is about the text content.
         *
         * Range: 0.0 to 1.0.
         *
         * @var float
         */
        public readonly float $recognition,
        /**
         * Detection confidence: how sure the OCR engine is that text exists here.
         *
         * PaddleOCR reports this as `box_score`; Tesseract has no direct equivalent.
         * Range: 0.0 to 1.0 (or `null` if not available).
         *
         * @var ?float
         */
        public readonly ?float $detection = null,
    ) {}
}
/**
 * Rotation details for an OCR element.
 */
final class OcrRotation
{
    public function __construct(
        /** @var float Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR). */
        public readonly float $angleDegrees,
        /** @var ?float Confidence score for the rotation detection. */
        public readonly ?float $confidence = null,
    ) {}
}
/**
 * Unified OCR element: a piece of detected text with its full metadata.
 *
 * Primary type for structured OCR output; it preserves all information
 * from both the Tesseract and PaddleOCR backends.
 */
final class OcrElement
{
    public function __construct(
        /** @var string The recognized text content. */
        public readonly string $text,
        /** @var OcrBoundingGeometry Bounding geometry (rectangle or quadrilateral). */
        public readonly OcrBoundingGeometry $geometry,
        /** @var OcrConfidence Confidence scores for detection and recognition. */
        public readonly OcrConfidence $confidence,
        /** @var OcrElementLevel Hierarchical level (word, line, block, page). */
        public readonly OcrElementLevel $level,
        /** @var int Page number (1-indexed). */
        public readonly int $pageNumber,
        /** @var array<string, string> Backend-specific metadata that doesn't fit the unified schema. */
        public readonly array $backendMetadata,
        /** @var ?OcrRotation Rotation information (if detected). */
        public readonly ?OcrRotation $rotation = null,
        /**
         * ID of the parent element, for hierarchical relationships.
         *
         * Only used for Tesseract output, which has a word -> line -> block hierarchy.
         *
         * @var ?string
         */
        public readonly ?string $parentId = null,
    ) {}
}
/**
 * Configuration of OCR element extraction.
 *
 * Governs how OCR elements are extracted and filtered.
 */
final class OcrElementConfig
{
    public function __construct(
        /**
         * Whether OCR elements are included in the extraction result.
         *
         * When true, the `ocr_elements` field in `ExtractionResult` will be populated.
         *
         * @var bool
         */
        public readonly bool $includeElements,
        /**
         * Minimum hierarchical level to include.
         *
         * Elements below this level (e.g., words when min_level is Line) are excluded.
         *
         * @var OcrElementLevel
         */
        public readonly OcrElementLevel $minLevel,
        /**
         * Minimum recognition confidence threshold (0.0-1.0).
         *
         * Elements whose confidence falls below this threshold are filtered out.
         *
         * @var float
         */
        public readonly float $minConfidence,
        /**
         * Whether hierarchical relationships between elements are built.
         *
         * When true, `parent_id` fields will be populated based on spatial containment.
         * Only meaningful for Tesseract output.
         *
         * @var bool
         */
        public readonly bool $buildHierarchy,
    ) {}
}
/**
 * Page structure shared by all paginated document kinds.
 *
 * Covers PDF pages, PPTX slides and Excel sheets, and carries offset
 * boundaries used to map chunks back to pages.
 */
final class PageStructure
{
    public function __construct(
        /** @var int Total count of pages/slides/sheets. */
        public readonly int $totalCount,
        /** @var PageUnitType Kind of paginated unit. */
        public readonly PageUnitType $unitType,
        /**
         * Offset boundaries for each page.
         *
         * Maps ranges of the extracted content to page numbers; used when
         * computing the page range of a chunk.
         *
         * NOTE(review): upstream describes these as character offsets, but
         * PageBoundary exposes `byteStart`/`byteEnd` — presumably these are
         * byte offsets; confirm.
         *
         * @var ?array<PageBoundary>
         */
        public readonly ?array $boundaries = null,
        /** @var ?array<PageInfo> Detailed per-page metadata (optional, only when needed). */
        public readonly ?array $pages = null,
    ) {
    }
}
/**
 * Byte range occupied by one page inside the main content string.
 *
 * Records where a page's content begins and ends so that byte positions can be
 * mapped to page numbers. Offsets are guaranteed to sit on valid UTF-8
 * character boundaries when standard String methods (push_str, push, etc.)
 * are used.
 */
final class PageBoundary
{
    public function __construct(
        /** @var int Inclusive byte offset at which the page starts (valid UTF-8 boundary). */
        public readonly int $byteStart,
        /** @var int Exclusive byte offset at which the page ends (valid UTF-8 boundary). */
        public readonly int $byteEnd,
        /** @var int 1-indexed page number. */
        public readonly int $pageNumber,
    ) {
    }
}
/**
 * Per-page (or per-slide / per-sheet) metadata.
 *
 * Holds dimensions, content counts and, for presentations, visibility state.
 */
final class PageInfo
{
    public function __construct(
        /** @var int 1-indexed page number. */
        public readonly int $number,
        /**
         * Flags pages that contain non-trivial vector graphics (paths, shapes, curves).
         *
         * Signals vector-drawn content such as charts, diagrams or geometric
         * shapes (e.g., from Adobe InDesign, LaTeX TikZ). Such content does not
         * show up in `ExtractionResult.images` because it is not embedded as
         * raster XObjects. The flag is `true` once the path count exceeds a
         * heuristic threshold, hinting that downstream consumers may want to
         * rasterize the page to capture it.
         *
         * NOTE(review): upstream docs say this is only populated for PDFs and is
         * `None` for other document types, yet this stub declares a required,
         * non-nullable bool — confirm what non-PDF documents report.
         *
         * @var bool
         */
        public readonly bool $hasVectorGraphics,
        /** @var ?string Title of the page (usually for presentations). */
        public readonly ?string $title = null,
        /** @var ?array<float> (width, height) in points for PDFs or pixels for images. */
        public readonly ?array $dimensions = null,
        /** @var ?int Count of images on the page. */
        public readonly ?int $imageCount = null,
        /** @var ?int Count of tables on the page. */
        public readonly ?int $tableCount = null,
        /** @var ?bool Whether the page is hidden (e.g., in presentations). */
        public readonly ?bool $hidden = null,
        /**
         * Whether the page is blank (no meaningful text, no images, no tables).
         *
         * A page counts as blank when it has fewer than 3 non-whitespace
         * characters and holds no tables or images. Handy for filtering empty
         * pages in scanned documents or PDFs with blank separator pages.
         *
         * @var ?bool
         */
        public readonly ?bool $isBlank = null,
    ) {
    }
}
/**
 * Extracted content of one page or slide.
 *
 * With page extraction enabled, a document is split into per-page content, and
 * tables and images are mapped to the page they belong to.
 *
 * # Performance
 *
 * Tables and images are Arc-wrapped on the native side for memory efficiency:
 * - `Vec<Arc<Table>>` enables zero-copy sharing of table data
 * - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
 * - Exact JSON compatibility is kept via custom Serialize/Deserialize
 *
 * This avoids redundant copies during serialization and so lowers memory
 * overhead for documents with shared tables/images.
 */
final class PageContent
{
    public function __construct(
        /** @var int 1-indexed page number. */
        public readonly int $pageNumber,
        /** @var string Text content of the page. */
        public readonly string $content,
        /**
         * Tables located on the page (Arc-backed for memory efficiency).
         *
         * Serialized as Vec<Table> for JSON compatibility while keeping Arc
         * semantics in memory for zero-copy sharing.
         *
         * @var array<Table>
         */
        public readonly array $tables,
        /**
         * Indices into `ExtractionResult.images` for the images on this page.
         *
         * Every value is a zero-based index into the top-level `images`
         * collection. Only populated when `extract_images = true` in the
         * extraction config.
         *
         * @var array<int>
         */
        public readonly array $imageIndices,
        /**
         * Hierarchy information for the page (when hierarchy extraction is enabled).
         *
         * Holds the text hierarchy levels (H1-H6) extracted from the page content.
         *
         * @var ?PageHierarchy
         */
        public readonly ?PageHierarchy $hierarchy = null,
        /**
         * Whether the page is blank (no meaningful text content).
         *
         * Decided during extraction from text content analysis: a page is blank
         * when it has fewer than 3 non-whitespace characters and holds no
         * tables or images.
         *
         * @var ?bool
         */
        public readonly ?bool $isBlank = null,
        /**
         * Layout detection regions for the page (when layout detection is enabled).
         *
         * Holds the detected regions with class, confidence, bounding box and
         * area fraction. Only populated when layout detection is configured.
         *
         * @var ?array<LayoutRegion>
         */
        public readonly ?array $layoutRegions = null,
        /**
         * Speaker notes of the slide (PPTX only).
         *
         * Text taken from the slide's notes pane
         * (`ppt/notesSlides/notesSlide{N}.xml`). Only populated when the source
         * is a PPTX file and notes are present.
         *
         * @var ?string
         */
        public readonly ?string $speakerNotes = null,
        /**
         * Name of the section the slide belongs to (PPTX only).
         *
         * PowerPoint sections group slides into logical chapters
         * (`<p:sectionLst>` in `ppt/presentation.xml`). Only populated when the
         * source is a PPTX file and the slide belongs to a named section.
         *
         * @var ?string
         */
        public readonly ?string $sectionName = null,
        /**
         * Sheet name of the page (XLSX/ODS only).
         *
         * Every spreadsheet sheet maps to one `PageContent` entry, and this
         * field carries the sheet's display name as shown in the workbook.
         * `None` for all non-spreadsheet formats and for sheets with an empty
         * name.
         *
         * @var ?string
         */
        public readonly ?string $sheetName = null,
    ) {
    }
}
/**
 * One layout region detected on a page.
 *
 * With layout detection enabled, a page may carry regions that identify
 * different content types (text, pictures, tables, etc.), each with a
 * confidence score and a spatial position.
 */
final class LayoutRegion
{
    public function __construct(
        /** @var string Name of the layout class (e.g. "picture", "table", "text", "section_header"). */
        public readonly string $className,
        /** @var float Layout detection model confidence (0.0 to 1.0). */
        public readonly float $confidence,
        /** @var BoundingBox Bounding box expressed in document coordinate space. */
        public readonly BoundingBox $boundingBox,
        /** @var float Share of the page area this region covers (0.0 to 1.0). */
        public readonly float $areaFraction,
    ) {
    }
}
/**
 * Heading levels and block information for a page.
 *
 * Produced when PDF text hierarchy extraction is enabled; holds hierarchical
 * blocks with heading levels (H1-H6) describing the semantic document
 * structure.
 */
final class PageHierarchy
{
    public function __construct(
        /** @var int Count of hierarchy blocks on the page. */
        public readonly int $blockCount,
        /** @var array<HierarchicalBlock> Blocks together with their heading levels. */
        public readonly array $blocks,
    ) {
    }
}
/**
 * Block of text that has been assigned a hierarchy level.
 *
 * The semantic heading information comes from font size clustering and
 * hierarchical analysis.
 */
final class HierarchicalBlock
{
    public function __construct(
        /** @var string Text content of the block. */
        public readonly string $text,
        /** @var float Font size of the block's text. */
        public readonly float $fontSize,
        /**
         * Hierarchy level of the block (H1-H6 or Body).
         *
         * The levels mirror HTML heading tags:
         * - "h1": Top-level heading
         * - "h2": Secondary heading
         * - "h3": Tertiary heading
         * - "h4": Quaternary heading
         * - "h5": Quinary heading
         * - "h6": Senary heading
         * - "body": Body text (no heading level)
         *
         * @var string
         */
        public readonly string $level,
        /**
         * Bounding box of the block.
         *
         * Coordinates are given as (left, top, right, bottom) in PDF units.
         *
         * @var ?array<float>
         */
        public readonly ?array $bbox = null,
    ) {
    }
}
/**
 * One changed cell inside a table.
 *
 * Lives here (and not only in `crate.diff`) so that `RevisionDelta` can refer
 * to it unconditionally, without the `diff` Cargo feature. `crate.diff`
 * re-exports the type verbatim.
 */
final class CellChange
{
    public function __construct(
        /** @var int Row index, zero-based. */
        public readonly int $row,
        /** @var int Column index, zero-based. */
        public readonly int $col,
        /** @var string Cell value prior to the change. */
        public readonly string $from,
        /** @var string Cell value following the change. */
        public readonly string $to,
    ) {
    }
}
/**
 * One tracked change embedded in a document.
 *
 * Filled in by per-format extractors that understand change-tracking metadata
 * (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, etc.). Until a
 * format-specific implementation exists, every extractor defaults to
 * `ExtractionResult.revisions = None`.
 */
final class DocumentRevision
{
    public function __construct(
        /**
         * Revision identifier, specific to the source format.
         *
         * For DOCX it is the value of the `w:id` attribute on the change
         * element (e.g. `"42"`). If that attribute is missing, a synthetic
         * fallback is generated (`"docx-ins-0"`, `"docx-del-3"`, etc.).
         *
         * @var string
         */
        public readonly string $revisionId,
        /** @var RevisionKind Semantic kind of the revision. */
        public readonly RevisionKind $kind,
        /** @var RevisionDelta Content changes this revision consists of. */
        public readonly RevisionDelta $delta,
        /** @var ?string Display name of the change's author, when available. */
        public readonly ?string $author = null,
        /**
         * ISO-8601 timestamp of the change, when available.
         *
         * Kept as a plain string so the type stays FFI-friendly and
         * unconditionally available without the `chrono` optional dep. DOCX
         * fills it from the `w:date` attribute (e.g. `"2024-03-15T10:30:00Z"`).
         *
         * @var ?string
         */
        public readonly ?string $timestamp = null,
        /**
         * Best-effort location of the revision within the document.
         *
         * How it is resolved depends on the format; it may be `None` when the
         * location cannot be determined (e.g. changes inside table cells before
         * table-cell anchor support is added).
         *
         * @var ?RevisionAnchor
         */
        public readonly ?RevisionAnchor $anchor = null,
    ) {
    }
}
/**
 * Content changes belonging to one revision.
 *
 * For insertions and deletions, `content` holds the added/removed lines as
 * `DiffLine.Added` / `DiffLine.Removed` entries. For format changes, `content`
 * is empty; the property diff is left as a TODO for a later enrichment pass.
 */
final class RevisionDelta
{
    public function __construct(
        /** @var array<DiffLine> Line-level content changes of the revision. */
        public readonly array $content,
        /** @var array<CellChange> Cell-level table changes of the revision. */
        public readonly array $tableChanges,
    ) {
    }
}
/**
 * Structure of an extracted table.
 *
 * Describes a table that was detected in and extracted from a document (PDF,
 * image, etc.). Each table is available both as structured cell data and as
 * Markdown.
 */
final class Table
{
    public function __construct(
        /** @var array<array<string>> Cells as a 2D vector (rows × columns). */
        public readonly array $cells,
        /** @var string Table rendered as Markdown. */
        public readonly string $markdown,
        /** @var int 1-indexed number of the page the table was found on. */
        public readonly int $pageNumber,
        /**
         * Bounding box of the table on its page, in PDF coordinates
         * (x0=left, y0=bottom, x1=right, y1=top).
         * Only populated for PDF-extracted tables when position data is available.
         *
         * @var ?BoundingBox
         */
        public readonly ?BoundingBox $boundingBox = null,
    ) {
    }
}
/**
 * Single table cell: content plus optional styling.
 *
 * Acts as a future extension point for rich table support with cell-level
 * metadata.
 */
final class TableCell
{
    public function __construct(
        /** @var string Textual content of the cell. */
        public readonly string $content,
        /** @var int Number of rows the cell spans. */
        public readonly int $rowSpan,
        /** @var int Number of columns the cell spans. */
        public readonly int $colSpan,
        /** @var bool True when the cell is a header cell. */
        public readonly bool $isHeader,
    ) {
    }
}
/**
 * URI pulled out of a document.
 *
 * Stands for any link, reference or resource pointer encountered during
 * extraction. `kind` gives the semantic classification, and `label` carries
 * optional human-readable display text.
 */
final class ExtractedUri
{
    public function __construct(
        /** @var string URL or path string. */
        public readonly string $url,
        /** @var UriKind Semantic classification of the URI. */
        public readonly UriKind $kind,
        /** @var ?string Display text / label of the link, if any. */
        public readonly ?string $label = null,
        /** @var ?int 1-indexed page number where the URI was found, if known. */
        public readonly ?int $page = null,
    ) {
    }
}
/**
 * Response of a MIME type detection.
 */
final class DetectResponse
{
    public function __construct(
        /** @var string MIME type that was detected. */
        public readonly string $mimeType,
        /** @var ?string Original filename, when one was provided. */
        public readonly ?string $filename = null,
    ) {
    }
}
/**
 * Options that steer the comparison of two `ExtractionResult` values.
 */
final class DiffOptions
{
    public function __construct(
        /** @var bool Whether metadata changes are part of the diff. Default: `true`. */
        public readonly bool $includeMetadata,
        /** @var bool Whether embedded-children changes are part of the diff. Default: `true`. */
        public readonly bool $includeEmbedded,
        /**
         * Character count to which content is truncated before diffing.
         *
         * Helps with very large documents where only the first N characters
         * matter. `None` means no truncation.
         *
         * @var ?int
         */
        public readonly ?int $maxContentChars = null,
    ) {
    }

    /**
     * Stub placeholder for the native `default()` factory.
     *
     * This stub body always throws.
     *
     * @throws \RuntimeException Always, in this stub.
     */
    public static function default(): DiffOptions
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Full diff between two `ExtractionResult` values.
 */
final class ExtractionDiff
{
    public function __construct(
        /**
         * Unified-diff hunks of the `content` field.
         *
         * Empty when both contents are identical.
         *
         * @var array<DiffHunk>
         */
        public readonly array $contentDiff,
        /** @var array<Table> Tables in `b` but not in `a` (by index position, excess right-side tables). */
        public readonly array $tablesAdded,
        /** @var array<Table> Tables in `a` but not in `b` (by index position, excess left-side tables). */
        public readonly array $tablesRemoved,
        /** @var array<TableDiff> Cell-level changes for table pairs sharing the same index and dimensions. */
        public readonly array $tablesChanged,
        /**
         * Metadata difference, encoded as a JSON object with three top-level
         * keys: `added` (keys present in `b` but not `a`), `removed` (keys
         * present in `a` but not `b`), and `changed` (keys whose values
         * differ; each entry is `{ "from": <value-in-a>, "to": <value-in-b> }`).
         *
         * This is NOT RFC 6902 JSON Patch; a flatter shape was chosen
         * deliberately to avoid pulling in a json-patch crate. If RFC 6902
         * semantics (with JSON Pointer paths) are needed, feed `a.metadata`
         * and `b.metadata` to a json-patch implementation directly.
         *
         * @var string
         */
        public readonly string $metadataChanged,
        /** @var EmbeddedChanges Changes to the embedded archive children. */
        public readonly EmbeddedChanges $embeddedChanges,
    ) {
    }
}
/**
 * One contiguous hunk of a unified diff.
 */
final class DiffHunk
{
    public function __construct(
        /** @var int First line of the hunk in the old content (0-indexed). */
        public readonly int $fromLine,
        /** @var int How many lines of the old content the hunk covers. */
        public readonly int $fromCount,
        /** @var int First line of the hunk in the new content (0-indexed). */
        public readonly int $toLine,
        /** @var int How many lines of the new content the hunk covers. */
        public readonly int $toCount,
        /** @var array<DiffLine> Lines the hunk consists of. */
        public readonly array $lines,
    ) {
    }
}
/**
 * Cell-level changes between two tables that share the same index.
 */
final class TableDiff
{
    public function __construct(
        /** @var int Zero-based index of the table in both `a.tables` and `b.tables`. */
        public readonly int $fromIndex,
        /** @var int Zero-based index in `b.tables` (equal to `from_index` for same-dimension tables). */
        public readonly int $toIndex,
        /** @var array<CellChange> Changes to individual cells of the table. */
        public readonly array $cellChanges,
    ) {
    }
}
/**
 * Differences in embedded archive children between two results.
 */
final class EmbeddedChanges
{
    public function __construct(
        /** @var array<ArchiveEntry> Children found in `b` but not in `a` (matched by `path`). */
        public readonly array $added,
        /** @var array<ArchiveEntry> Children found in `a` but not in `b` (matched by `path`). */
        public readonly array $removed,
        /**
         * Children found in both results whose content differs (matched by `path`).
         *
         * Every entry carries the diff of the nested `ExtractionResult`.
         *
         * @var array<EmbeddedDiff>
         */
        public readonly array $changed,
    ) {
    }
}
/**
 * Diff of one embedded archive entry that is present in both results.
 */
final class EmbeddedDiff
{
    public function __construct(
        /** @var string Archive-relative path that identifies the entry. */
        public readonly string $path,
        /** @var ExtractionDiff Recursive diff of the entry's extraction result. */
        public readonly ExtractionDiff $diff,
    ) {
    }
}
/**
 * Ready-made configuration for a common RAG use case.
 *
 * A preset bundles chunk size, overlap and embedding model into a
 * configuration tuned for a specific scenario.
 *
 * All string fields are owned `String` for FFI compatibility; instances are
 * safe to clone and pass across language boundaries.
 */
final class EmbeddingPreset
{
    public function __construct(
        /** @var string */
        public readonly string $name,
        /** @var int */
        public readonly int $chunkSize,
        /** @var int */
        public readonly int $overlap,
        /** @var string Name of the model's HuggingFace repository. */
        public readonly string $modelRepo,
        /** @var string Pooling strategy, either "cls" or "mean". */
        public readonly string $pooling,
        /** @var string Location of the ONNX model file inside the repo. */
        public readonly string $modelFile,
        /** @var int */
        public readonly int $dimensions,
        /** @var string */
        public readonly string $description,
    ) {
    }
}
/**
 * Parameters specific to YAKE.
 */
final class YakeParams
{
    public function __construct(
        /**
         * Size of the window used for co-occurrence analysis (default: 2).
         *
         * Sets the context window over which co-occurrence statistics are computed.
         *
         * @var int
         */
        public readonly int $windowSize,
    ) {
    }

    /**
     * Stub placeholder for the native `default()` factory.
     *
     * This stub body always throws.
     *
     * @throws \RuntimeException Always, in this stub.
     */
    public static function default(): YakeParams
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Parameters specific to RAKE.
 */
final class RakeParams
{
    public function __construct(
        /** @var int Shortest word length that is considered (default: 1). */
        public readonly int $minWordLength,
        /** @var int Upper bound on words in a keyword phrase (default: 3). */
        public readonly int $maxWordsPerPhrase,
    ) {
    }

    /**
     * Stub placeholder for the native `default()` factory.
     *
     * This stub body always throws.
     *
     * @throws \RuntimeException Always, in this stub.
     */
    public static function default(): RakeParams
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration of keyword extraction.
 */
final class KeywordConfig
{
    public function __construct(
        /** @var KeywordAlgorithm Extraction algorithm to use. */
        public readonly KeywordAlgorithm $algorithm,
        /** @var int Upper bound on the number of extracted keywords (default: 10). */
        public readonly int $maxKeywords,
        /**
         * Score floor (0.0-1.0, default: 0.0).
         *
         * Keywords scoring below this value are filtered out.
         * Note: score ranges differ between algorithms.
         *
         * @var float
         */
        public readonly float $minScore,
        /**
         * N-gram range used for keyword extraction, as (min, max).
         *
         * (1, 1) = unigrams only
         * (1, 2) = unigrams and bigrams
         * (1, 3) = unigrams, bigrams, and trigrams (default)
         *
         * @var array<int>
         */
        public readonly array $ngramRange,
        /**
         * Language code used for stopword filtering (e.g., "en", "de", "fr").
         *
         * When None, no stopword filtering takes place.
         *
         * @var ?string
         */
        public readonly ?string $language = null,
        /** @var ?YakeParams Tuning parameters specific to YAKE. */
        public readonly ?YakeParams $yakeParams = null,
        /** @var ?RakeParams Tuning parameters specific to RAKE. */
        public readonly ?RakeParams $rakeParams = null,
    ) {
    }

    /**
     * Stub placeholder for the native `default()` factory.
     *
     * This stub body always throws.
     *
     * @throws \RuntimeException Always, in this stub.
     */
    public static function default(): KeywordConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Keyword produced by extraction, with its metadata.
 */
final class Keyword
{
    public function __construct(
        /** @var string Text of the keyword. */
        public readonly string $text,
        /** @var float Relevance score (higher is better; the range is algorithm-specific). */
        public readonly float $score,
        /** @var KeywordAlgorithm Algorithm by which the keyword was extracted. */
        public readonly KeywordAlgorithm $algorithm,
        /** @var ?array<int> Character offsets at which the keyword occurs in the text, if available. */
        public readonly ?array $positions = null,
    ) {
    }
}
/**
 * Configuration of the PaddleOCR backend.
 *
 * Sets up PaddleOCR text detection and recognition with multi-language
 * support. Each `with*()` method is declared to return a `PaddleOcrConfig`;
 * in this stub file every method body is a placeholder that throws.
 */
final class PaddleOcrConfig
{
    public function __construct(
        /** @var string Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra"). */
        public readonly string $language,
        /**
         * Turns on angle classification for rotated text (default: false).
         * May misfire on short text regions and rotate crops incorrectly before recognition.
         *
         * @var bool
         */
        public readonly bool $useAngleCls,
        /** @var bool Turns on table structure detection (default: false). */
        public readonly bool $enableTableDetection,
        /**
         * Database threshold used for text detection (default: 0.3).
         * Range: 0.0-1.0; higher values require more confident detections.
         *
         * @var float
         */
        public readonly float $detDbThresh,
        /**
         * Box threshold used when refining text bounding boxes (default: 0.5).
         * Range: 0.0-1.0.
         *
         * @var float
         */
        public readonly float $detDbBoxThresh,
        /**
         * Unclip ratio by which text bounding boxes are expanded (default: 1.6).
         * Governs how far detected text regions are grown.
         *
         * @var float
         */
        public readonly float $detDbUnclipRatio,
        /**
         * Upper bound on the side length of the detection image (default: 960).
         * Larger images may be resized down to this limit for faster inference.
         *
         * @var int
         */
        public readonly int $detLimitSideLen,
        /**
         * Batch size used for recognition inference (default: 6).
         * Number of text regions processed simultaneously.
         *
         * @var int
         */
        public readonly int $recBatchNum,
        /**
         * Pixels of padding added around the image before detection (default: 10).
         * Large values can pull in surrounding content such as table gridlines.
         *
         * @var int
         */
        public readonly int $padding,
        /**
         * Recognition confidence floor for text lines (default: 0.5).
         * Text regions whose recognition confidence is below this value are discarded.
         * Corresponds to the `drop_score` parameter of PaddleOCR Python.
         * Range: 0.0-1.0.
         *
         * @var float
         */
        public readonly float $dropScore,
        /**
         * Model tier; trades detection/recognition model size against accuracy.
         * - `"mobile"` (default): Lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
         * - `"server"`: Large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
         *
         * @var string
         */
        public readonly string $modelTier,
        /** @var ?string Custom cache directory for model files, if any. */
        public readonly ?string $cacheDir = null,
    ) {
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withCacheDir(string $path): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withTableDetection(bool $enable): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withAngleCls(bool $enable): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withDetDbThresh(float $threshold): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withDetDbBoxThresh(float $threshold): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withDetDbUnclipRatio(float $ratio): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withDetLimitSideLen(int $length): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withRecBatchNum(int $batch_size): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withDropScore(float $score): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withPadding(int $padding): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** @throws \RuntimeException Always, in this stub. */
    public function withModelTier(string $tier): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub placeholder for the native `default()` factory.
     *
     * @throws \RuntimeException Always, in this stub.
     */
    public static function default(): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Paths to every model required for OCR, bundled together (backward compatibility).
 */
final class ModelPaths
{
    public function __construct(
        /** @var string Directory holding the detection model. */
        public readonly string $detModel,
        /** @var string Directory holding the classification model. */
        public readonly string $clsModel,
        /** @var string Directory holding the recognition model. */
        public readonly string $recModel,
        /** @var string Location of the character dictionary file. */
        public readonly string $dictFile,
    ) {
    }
}
/**
 * Outcome of document orientation detection.
 */
final class OrientationResult
{
    public function __construct(
        /** @var int Orientation that was detected, in degrees (0, 90, 180, or 270). */
        public readonly int $degrees,
        /** @var float Confidence of the detection (0.0-1.0). */
        public readonly float $confidence,
    ) {
    }
}
/**
 * Bounding box in original image coordinates: (x1, y1) is the top-left corner
 * and (x2, y2) the bottom-right corner.
 */
final class BBox
{
    public function __construct(
        /** @var float */
        public readonly float $x1,
        /** @var float */
        public readonly float $y1,
        /** @var float */
        public readonly float $x2,
        /** @var float */
        public readonly float $y2,
    ) {
    }
}
/**
 * One result of layout detection.
 */
final class LayoutDetection
{
    public function __construct(
        /** @var LayoutClass */
        public readonly LayoutClass $className,
        /** @var float */
        public readonly float $confidence,
        /** @var BBox */
        public readonly BBox $bbox,
    ) {
    }
}
/**
 * Table markdown computed ahead of time for a table detection region.
 *
 * Emitted by the TATR-based table structure recognizer and exposed as part of
 * layout-aware OCR results. The struct is placed here (under `layout-types`,
 * pure-Rust) so that consumers who do not enable `layout-detection` (ORT) can
 * still refer to the type in their own code.
 */
final class RecognizedTable
{
    public function __construct(
        /** @var BBox Detection bbox this table belongs to (used for matching). */
        public readonly BBox $detectionBbox,
        /** @var array<array<string>> Cells as a 2D vector (rows × columns). */
        public readonly array $cells,
        /** @var string Table rendered as markdown. */
        public readonly string $markdown,
    ) {
    }
}
/**
 * Detection result for a whole page: every detection plus page metadata.
 */
final class DetectionResult
{
    public function __construct(
        /** @var int */
        public readonly int $pageWidth,
        /** @var int */
        public readonly int $pageHeight,
        /** @var array<LayoutDetection> */
        public readonly array $detections,
    ) {
    }
}
/**
 * Descriptor of an embedded file taken from the PDF name tree.
 */
final class EmbeddedFile
{
    public function __construct(
        /** @var string Filename exactly as stored in the PDF name tree. */
        public readonly string $name,
        /** @var string Raw bytes of the embedded stream (already decompressed by lopdf). */
        public readonly string $data,
        /**
         * Byte count of the original stream while still compressed (before decompression).
         *
         * Lets callers compute the decompression ratio and spot zip-bomb-style
         * attacks, where a tiny compressed stream expands to gigabytes of data.
         *
         * @var int
         */
        public readonly int $compressedSize,
        /** @var ?string MIME type when the filespec specifies one, otherwise `None`. */
        public readonly ?string $mimeType = null,
    ) {
    }
}
/**
 * Metadata that only applies to PDFs.
 *
 * Holds the PDF-specific fields that are absent from the common `Metadata`
 * structure. Shared fields such as title, authors, keywords and dates live at
 * the `Metadata` level.
 */
final class PdfMetadata
{
    public function __construct(
        /** @var ?string Version of the PDF (e.g., "1.7", "2.0"). */
        public readonly ?string $pdfVersion = null,
        /** @var ?string Producer, i.e. the application that created the PDF. */
        public readonly ?string $producer = null,
        /** @var ?bool Whether the PDF is encrypted/password-protected. */
        public readonly ?bool $isEncrypted = null,
        /** @var ?int Width of the first page in points (1/72 inch). */
        public readonly ?int $width = null,
        /** @var ?int Height of the first page in points (1/72 inch). */
        public readonly ?int $height = null,
        /** @var ?int Total page count of the PDF document. */
        public readonly ?int $pageCount = null,
    ) {
    }
}
/**
 * String-backed enum of execution provider identifiers.
 *
 * Every backing value is spelled exactly like its case name.
 */
enum ExecutionProviderType: string
{
    case Auto = 'Auto';
    case Cpu = 'Cpu';
    case CoreMl = 'CoreMl';
    case Cuda = 'Cuda';
    case TensorRt = 'TensorRt';
}
/**
 * String-backed enum of output format identifiers.
 *
 * Every backing value is spelled exactly like its case name.
 */
enum OutputFormat: string
{
    case Plain = 'Plain';
    case Markdown = 'Markdown';
    case Djot = 'Djot';
    case Html = 'Html';
    case Json = 'Json';
    case Structured = 'Structured';
    case Custom = 'Custom';
}
/**
 * String-backed enum of HTML theme identifiers.
 *
 * Every backing value is spelled exactly like its case name.
 */
enum HtmlTheme: string
{
    case Default = 'Default';
    case GitHub = 'GitHub';
    case Dark = 'Dark';
    case Light = 'Light';
    case Unstyled = 'Unstyled';
}
/**
 * String-backed enum of table model identifiers.
 *
 * Backing values are lowercase snake_case strings.
 */
enum TableModel: string
{
    case Tatr = 'tatr';
    case SlanetWired = 'slanet_wired';
    case SlanetWireless = 'slanet_wireless';
    case SlanetPlus = 'slanet_plus';
    case SlanetAuto = 'slanet_auto';
    case Disabled = 'disabled';
}
/**
 * String-backed enum of chunker identifiers.
 *
 * Every backing value is spelled exactly like its case name.
 */
enum ChunkerType: string
{
    case Text = 'Text';
    case Markdown = 'Markdown';
    case Yaml = 'Yaml';
    case Semantic = 'Semantic';
}
/**
 * How chunk size is measured.
 *
 * Defaults to `Characters` (Unicode character count). When using token-based sizing,
 * chunks are sized by token count according to the specified tokenizer.
 *
 * Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
 * available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
 * (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 *
 * NOTE(review): this stub declares no constructor, properties, or methods, so
 * variant construction and access are presumably supplied by the native
 * extension at runtime — confirm before relying on any member in PHP code.
 */
final class ChunkSizing
{
}
/**
 * Embedding model types supported by Kreuzberg.
 *
 * NOTE(review): this stub declares no constructor, properties, or methods, so
 * variant construction and access are presumably supplied by the native
 * extension at runtime — confirm before relying on any member in PHP code.
 */
final class EmbeddingModelType
{
}
/**
 * String-backed enum of code content mode identifiers.
 *
 * Backing values are lowercase strings.
 */
enum CodeContentMode: string
{
    case Chunks = 'chunks';
    case Raw = 'raw';
    case Structure = 'structure';
}
/**
 * String-backed enum of list type identifiers.
 *
 * Every backing value is spelled exactly like its case name.
 */
enum ListType: string
{
    case Bullet = 'Bullet';
    case Numbered = 'Numbered';
    case Lettered = 'Lettered';
    case Indented = 'Indented';
}
/**
 * String-backed enum of OCR backend identifiers.
 *
 * Every backing value is spelled exactly like its case name.
 */
enum OcrBackendType: string
{
    case Tesseract = 'Tesseract';
    case EasyOCR = 'EasyOCR';
    case PaddleOCR = 'PaddleOCR';
    case Custom = 'Custom';
}
/**
 * Processing stages: Early, Middle and Late.
 *
 * Every backing string is identical to its case name.
 */
enum ProcessingStage: string
{
    case Early = 'Early';
    case Middle = 'Middle';
    case Late = 'Late';
}
/**
 * Reduction levels, declared from Off through Maximum.
 *
 * Every backing string is identical to its case name.
 */
enum ReductionLevel: string
{
    case Off = 'Off';
    case Light = 'Light';
    case Moderate = 'Moderate';
    case Aggressive = 'Aggressive';
    case Maximum = 'Maximum';
}
/**
 * PDF annotation types.
 *
 * Backed by snake_case strings (for example StrikeOut => 'strike_out').
 */
enum PdfAnnotationType: string
{
    case Text = 'text';
    case Highlight = 'highlight';
    case Link = 'link';
    case Stamp = 'stamp';
    case Underline = 'underline';
    case StrikeOut = 'strike_out';
    case Other = 'other';
}
/**
 * Block-level element types.
 *
 * Backed by snake_case strings (for example CodeBlock => 'code_block').
 */
enum BlockType: string
{
    case Paragraph = 'paragraph';
    case Heading = 'heading';
    case Blockquote = 'blockquote';
    case CodeBlock = 'code_block';
    case ListItem = 'list_item';
    case OrderedList = 'ordered_list';
    case BulletList = 'bullet_list';
    case TaskList = 'task_list';
    case DefinitionList = 'definition_list';
    case DefinitionTerm = 'definition_term';
    case DefinitionDescription = 'definition_description';
    case Div = 'div';
    case Section = 'section';
    case ThematicBreak = 'thematic_break';
    case RawBlock = 'raw_block';
    case MathDisplay = 'math_display';
}
/**
 * Inline element types.
 *
 * Backed by snake_case strings (for example FootnoteRef => 'footnote_ref').
 */
enum InlineType: string
{
    case Text = 'text';
    case Strong = 'strong';
    case Emphasis = 'emphasis';
    case Highlight = 'highlight';
    case Subscript = 'subscript';
    case Superscript = 'superscript';
    case Insert = 'insert';
    case Delete = 'delete';
    case Code = 'code';
    case Link = 'link';
    case Image = 'image';
    case Span = 'span';
    case Math = 'math';
    case RawInline = 'raw_inline';
    case FootnoteRef = 'footnote_ref';
    case Symbol = 'symbol';
}
/**
 * Relationship kinds.
 *
 * Backed by snake_case strings (for example TocEntry => 'toc_entry').
 */
enum RelationshipKind: string
{
    case FootnoteReference = 'footnote_reference';
    case CitationReference = 'citation_reference';
    case InternalLink = 'internal_link';
    case Caption = 'caption';
    case Label = 'label';
    case TocEntry = 'toc_entry';
    case CrossReference = 'cross_reference';
}
/**
 * Content layers: body, header, footer and footnote.
 *
 * Backed by the lowercase form of each case name.
 */
enum ContentLayer: string
{
    case Body = 'body';
    case Header = 'header';
    case Footer = 'footer';
    case Footnote = 'footnote';
}
/**
 * Tagged enum describing the content of a node; every variant carries only
 * the data specific to its type.
 *
 * The native definition uses `#[serde(tag = "node_type")]` so that the tag
 * does not collide with the "type" keyword in Go/Java/TypeScript bindings.
 *
 * This stub declares no members.
 */
final class NodeContent {}
/**
 * Kinds of inline text annotation.
 *
 * This stub declares no members.
 */
final class AnnotationKind {}
/**
 * Extraction methods: native, OCR, or a mix of both.
 *
 * Backed by the lowercase form of each case name.
 */
enum ExtractionMethod: string
{
    case Native = 'native';
    case Ocr = 'ocr';
    case Mixed = 'mixed';
}
/**
 * Chunk classification labels.
 *
 * Backed by snake_case strings (for example OperativeClause =>
 * 'operative_clause'); Unknown => 'unknown' is the last declared case.
 */
enum ChunkType: string
{
    case Heading = 'heading';
    case PartyList = 'party_list';
    case Definitions = 'definitions';
    case OperativeClause = 'operative_clause';
    case SignatureBlock = 'signature_block';
    case Schedule = 'schedule';
    case TableLike = 'table_like';
    case Formula = 'formula';
    case CodeBlock = 'code_block';
    case Image = 'image';
    case OrgChart = 'org_chart';
    case Diagram = 'diagram';
    case Unknown = 'unknown';
}
/**
 * Image classification labels.
 *
 * Backed by snake_case strings (for example PageRaster => 'page_raster');
 * Unknown => 'unknown' is the last declared case.
 */
enum ImageKind: string
{
    case Photograph = 'photograph';
    case Diagram = 'diagram';
    case Chart = 'chart';
    case Drawing = 'drawing';
    case TextBlock = 'text_block';
    case Decoration = 'decoration';
    case Logo = 'logo';
    case Icon = 'icon';
    case TileFragment = 'tile_fragment';
    case Mask = 'mask';
    case PageRaster = 'page_raster';
    case Unknown = 'unknown';
}
/**
 * Result formats: unified or element-based.
 *
 * Backed by snake_case strings (ElementBased => 'element_based').
 */
enum ResultFormat: string
{
    case Unified = 'unified';
    case ElementBased = 'element_based';
}
/**
 * Element types.
 *
 * Backed by snake_case strings. Note that BlockQuote maps to 'block_quote'
 * (two words), unlike BlockType::Blockquote => 'blockquote'.
 */
enum ElementType: string
{
    case Title = 'title';
    case NarrativeText = 'narrative_text';
    case Heading = 'heading';
    case ListItem = 'list_item';
    case Table = 'table';
    case Image = 'image';
    case PageBreak = 'page_break';
    case CodeBlock = 'code_block';
    case BlockQuote = 'block_quote';
    case Footer = 'footer';
    case Header = 'header';
}
/**
 * Format-specific metadata, modelled as a discriminated union.
 *
 * An extraction result carries at most one format type, which gives
 * type-safe, clean metadata without nested optionals.
 *
 * This stub declares no members.
 */
final class FormatMetadata {}
/**
 * Text direction.
 *
 * Backing strings are abbreviations, not case names:
 * LeftToRight => 'ltr', RightToLeft => 'rtl', Auto => 'auto'.
 */
enum TextDirection: string
{
    case LeftToRight = 'ltr';
    case RightToLeft = 'rtl';
    case Auto = 'auto';
}
/**
 * Link types.
 *
 * Every backing string is identical to its case name.
 */
enum LinkType: string
{
    case Anchor = 'Anchor';
    case Internal = 'Internal';
    case External = 'External';
    case Email = 'Email';
    case Phone = 'Phone';
    case Other = 'Other';
}
/**
 * Image source types.
 *
 * NOTE(review): the backing strings mix kebab-case ('data-uri',
 * 'inline-svg') with PascalCase ('External', 'Relative'). Presumably this
 * mirrors the native serialization — confirm before normalizing, because
 * from()/tryFrom() callers depend on these exact values.
 */
enum ImageType: string
{
    case DataUri = 'data-uri';
    case InlineSvg = 'inline-svg';
    case External = 'External';
    case Relative = 'Relative';
}
/**
 * Structured data types.
 *
 * NOTE(review): the backing strings are not uniformly cased ('json-ld',
 * 'Microdata', 'rdfa'). Presumably this mirrors the native serialization —
 * confirm before normalizing, because from()/tryFrom() callers depend on
 * these exact values.
 */
enum StructuredDataType: string
{
    case JsonLd = 'json-ld';
    case Microdata = 'Microdata';
    case RDFa = 'rdfa';
}
/**
 * Geometry that bounds an OCR element.
 *
 * Covers axis-aligned rectangles (as produced by Tesseract) as well as
 * 4-point quadrilaterals (as produced by PaddleOCR and rotated text
 * detection).
 *
 * This stub declares no members.
 */
final class OcrBoundingGeometry {}
/**
 * OCR element levels: word, line, block and page.
 *
 * Backed by the lowercase form of each case name.
 */
enum OcrElementLevel: string
{
    case Word = 'word';
    case Line = 'line';
    case Block = 'block';
    case Page = 'page';
}
/**
 * Page unit types: page, slide and sheet.
 *
 * Backed by the lowercase form of each case name.
 */
enum PageUnitType: string
{
    case Page = 'page';
    case Slide = 'slide';
    case Sheet = 'sheet';
}
/**
 * One line of a unified-diff hunk.
 *
 * The native type is defined outside `crate.diff` so that `RevisionDelta`
 * can always reference it, even when the `diff` Cargo feature is disabled;
 * `crate.diff` re-exports the type verbatim.
 *
 * This stub declares no members.
 */
final class DiffLine {}
/**
 * Revision kinds.
 *
 * Backed by snake_case strings (for example FormatChange => 'format_change').
 */
enum RevisionKind: string
{
    case Insertion = 'insertion';
    case Deletion = 'deletion';
    case FormatChange = 'format_change';
    case Comment = 'comment';
}
/**
 * Best-effort location of a revision within the document.
 *
 * This stub declares no members.
 */
final class RevisionAnchor {}
/**
 * URI kinds.
 *
 * Backed by the lowercase form of each case name.
 */
enum UriKind: string
{
    case Hyperlink = 'hyperlink';
    case Image = 'image';
    case Anchor = 'anchor';
    case Citation = 'citation';
    case Reference = 'reference';
    case Email = 'email';
}
/**
 * Keyword algorithms: Yake and Rake.
 *
 * Every backing string is identical to its case name.
 */
enum KeywordAlgorithm: string
{
    case Yake = 'Yake';
    case Rake = 'Rake';
}
/**
 * PSM modes.
 *
 * Every backing string is identical to its case name.
 * NOTE(review): presumably these are Tesseract page-segmentation modes —
 * confirm against the OCR configuration that consumes them.
 */
enum PSMMode: string
{
    case OsdOnly = 'OsdOnly';
    case AutoOsd = 'AutoOsd';
    case AutoOnly = 'AutoOnly';
    case Auto = 'Auto';
    case SingleColumn = 'SingleColumn';
    case SingleBlockVertical = 'SingleBlockVertical';
    case SingleBlock = 'SingleBlock';
    case SingleLine = 'SingleLine';
    case SingleWord = 'SingleWord';
    case CircleWord = 'CircleWord';
    case SingleChar = 'SingleChar';
}
/**
 * Language and script families.
 *
 * Every backing string is identical to its case name.
 * NOTE(review): presumably the languages accepted by the PaddleOCR
 * backend — confirm against the backend configuration.
 */
enum PaddleLanguage: string
{
    case English = 'English';
    case Chinese = 'Chinese';
    case Japanese = 'Japanese';
    case Korean = 'Korean';
    case German = 'German';
    case French = 'French';
    case Latin = 'Latin';
    case Cyrillic = 'Cyrillic';
    case TraditionalChinese = 'TraditionalChinese';
    case Thai = 'Thai';
    case Greek = 'Greek';
    case EastSlavic = 'EastSlavic';
    case Arabic = 'Arabic';
    case Devanagari = 'Devanagari';
    case Tamil = 'Tamil';
    case Telugu = 'Telugu';
}
/**
 * Layout region classes.
 *
 * Backed by snake_case strings (for example KeyValueRegion =>
 * 'key_value_region').
 */
enum LayoutClass: string
{
    case Caption = 'caption';
    case Footnote = 'footnote';
    case Formula = 'formula';
    case ListItem = 'list_item';
    case PageFooter = 'page_footer';
    case PageHeader = 'page_header';
    case Picture = 'picture';
    case SectionHeader = 'section_header';
    case Table = 'table';
    case Text = 'text';
    case Title = 'title';
    case DocumentIndex = 'document_index';
    case Code = 'code';
    case CheckboxSelected = 'checkbox_selected';
    case CheckboxUnselected = 'checkbox_unselected';
    case Form = 'form';
    case KeyValueRegion = 'key_value_region';
}
/**
 * Static entry points of the native extension, declared for static analysis.
 *
 * The bodies are placeholders only: methods that return a value throw
 * \RuntimeException('Not implemented.') and `void` methods are empty. Per
 * the file header, the real implementations are provided at runtime by the
 * compiled extension.
 *
 * Every method is declared exactly once. The six clearX() methods were
 * previously emitted twice (once beside listX() and again beside
 * registerX() / unregisterX()), which PHP rejects at compile time with
 * "Cannot redeclare"; only the first declaration of each is kept.
 *
 * NOTE(review): in this stub the `Sync` variants have the same signatures
 * as their unsuffixed counterparts; any behavioural difference lives in the
 * native implementation — confirm there.
 */
class KreuzbergApi
{
    // --- Extraction -------------------------------------------------------

    public static function extractBytes(string $content, string $mime_type, \Kreuzberg\ExtractionConfig $config): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }

    public static function extractFile(string $path, ?string $mime_type = null, ?\Kreuzberg\ExtractionConfig $config = null): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }

    public static function extractFileSync(string $path, ?string $mime_type = null, ?\Kreuzberg\ExtractionConfig $config = null): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }

    public static function extractBytesSync(string $content, string $mime_type, \Kreuzberg\ExtractionConfig $config): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<\Kreuzberg\BatchFileItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractFilesSync(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<\Kreuzberg\BatchBytesItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractBytesSync(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<\Kreuzberg\BatchFileItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractFiles(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<\Kreuzberg\BatchBytesItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractBytes(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    // --- MIME helpers -----------------------------------------------------

    public static function detectMimeTypeFromBytes(string $content): string { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param string $mime_type
     * @return array<string>
     */
    public static function getExtensionsForMime(string $mime_type): array { throw new \RuntimeException('Not implemented.'); }

    // --- Registry listing / clearing --------------------------------------

    public static function clearEmbeddingBackends(): void { }

    /**
     * @return array<string>
     */
    public static function listEmbeddingBackends(): array { throw new \RuntimeException('Not implemented.'); }

    /**
     * @return array<string>
     */
    public static function listDocumentExtractors(): array { throw new \RuntimeException('Not implemented.'); }

    public static function clearDocumentExtractors(): void { }

    /**
     * @return array<string>
     */
    public static function listOcrBackends(): array { throw new \RuntimeException('Not implemented.'); }

    public static function clearOcrBackends(): void { }

    /**
     * @return array<string>
     */
    public static function listPostProcessors(): array { throw new \RuntimeException('Not implemented.'); }

    public static function clearPostProcessors(): void { }

    /**
     * @return array<string>
     */
    public static function listRenderers(): array { throw new \RuntimeException('Not implemented.'); }

    public static function clearRenderers(): void { }

    /**
     * @return array<string>
     */
    public static function listValidators(): array { throw new \RuntimeException('Not implemented.'); }

    public static function clearValidators(): void { }

    // --- Comparison, embeddings, rendering --------------------------------

    public static function compare(\Kreuzberg\ExtractionResult $a, \Kreuzberg\ExtractionResult $b, \Kreuzberg\DiffOptions $opts): \Kreuzberg\ExtractionDiff { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<string> $texts
     * @param \Kreuzberg\EmbeddingConfig $config
     * @return array<array<float>>
     */
    public static function embedTextsAsync(array $texts, \Kreuzberg\EmbeddingConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    public static function renderPdfPageToPng(string $pdf_bytes, int $page_index, ?int $dpi = null, ?string $password = null): string { throw new \RuntimeException('Not implemented.'); }

    public static function detectMimeType(string $path, bool $check_exists): string { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<string> $texts
     * @param \Kreuzberg\EmbeddingConfig $config
     * @return array<array<float>>
     */
    public static function embedTexts(array $texts, \Kreuzberg\EmbeddingConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    public static function getEmbeddingPreset(string $name): ?\Kreuzberg\EmbeddingPreset { throw new \RuntimeException('Not implemented.'); }

    /**
     * @return array<string>
     */
    public static function listEmbeddingPresets(): array { throw new \RuntimeException('Not implemented.'); }

    // --- Plugin registration ----------------------------------------------
    // The matching clearX() methods are declared once, in the registry
    // section above; repeating them here is a fatal redeclaration.

    public static function registerOcrBackend(\Kreuzberg\OcrBackend $backend): void { }

    public static function unregisterOcrBackend(string $name): void { }

    public static function registerPostProcessor(\Kreuzberg\PostProcessor $backend): void { }

    public static function unregisterPostProcessor(string $name): void { }

    public static function registerValidator(\Kreuzberg\Validator $backend): void { }

    public static function unregisterValidator(string $name): void { }

    public static function registerEmbeddingBackend(\Kreuzberg\EmbeddingBackend $backend): void { }

    public static function unregisterEmbeddingBackend(string $name): void { }

    public static function registerDocumentExtractor(\Kreuzberg\DocumentExtractor $backend): void { }

    public static function unregisterDocumentExtractor(string $name): void { }

    public static function registerRenderer(\Kreuzberg\Renderer $backend): void { }

    public static function unregisterRenderer(string $name): void { }
}
} // end namespace