Files
fil/packages/php/stubs/kreuzberg_extension.php
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

5397 lines
199 KiB
PHP
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
// Type stubs for the native PHP extension — declares classes
// provided at runtime by the compiled Rust extension (.so/.dll).
// Include this in phpstan.neon scanFiles for static analysis.
declare(strict_types=1);
namespace Kreuzberg {
/**
 * Exception type declared for the Kreuzberg native extension.
 *
 * Stub only: the real class is provided at runtime by the compiled extension.
 */
class KreuzbergException extends \RuntimeException
{
    /**
     * Stub for the error-code accessor.
     *
     * @return int Never actually returned by this stub.
     *
     * @throws \RuntimeException Always, because the stub carries no implementation.
     */
    public function getErrorCode(): int
    {
        throw new \RuntimeException('Not implemented.');
    }
}
/**
 * Read-only value object holding cache statistics.
 *
 * Every value is passed to the constructor and exposed as a public readonly
 * property. Units are only hinted at by the property names (`Mb`, `Days`);
 * this stub does not document them further.
 */
final class CacheStats
{
    public function __construct(
        public readonly int $totalFiles,
        public readonly float $totalSizeMb,
        public readonly float $availableSpaceMb,
        public readonly float $oldestFileAgeDays,
        public readonly float $newestFileAgeDays,
    ) {}
}
/**
 * Hardware acceleration settings for ONNX Runtime models.
 *
 * Selects which execution provider (CPU, CoreML, CUDA, TensorRT) performs
 * inference in layout detection and embedding generation.
 */
final class AccelerationConfig
{
    public function __construct(
        /** @var ExecutionProviderType Execution provider to use for ONNX inference. */
        public readonly ExecutionProviderType $provider,
        /** @var int GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto. */
        public readonly int $deviceId,
    ) {}
}
/**
 * Cross-extractor content filtering configuration.
 *
 * Decides whether "furniture" content (headers, footers, page numbers,
 * watermarks, repeating text) is kept in or stripped from extraction
 * results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
 * with format-specific implementation.
 *
 * When `None` on `ExtractionConfig`, each extractor uses its current
 * default behavior unchanged.
 *
 * NOTE(review): every constructor argument is required in this stub; the
 * "Default:" values quoted per field presumably describe what `default()`
 * yields — confirm against the native extension.
 */
final class ContentFilterConfig
{
    public function __construct(
        /**
         * Include running headers in extraction output.
         *
         * - PDF: Disables top-margin furniture stripping and prevents the layout
         *   model from treating `PageHeader`-classified regions as furniture.
         * - DOCX: Includes document headers in text output.
         * - RTF/ODT: Headers already included; this is a no-op when true.
         * - HTML/EPUB: Keeps `<header>` element content.
         *
         * Default: `false` (headers are stripped or excluded).
         *
         * @var bool
         */
        public readonly bool $includeHeaders,
        /**
         * Include running footers in extraction output.
         *
         * - PDF: Disables bottom-margin furniture stripping and prevents the layout
         *   model from treating `PageFooter`-classified regions as furniture.
         * - DOCX: Includes document footers in text output.
         * - RTF/ODT: Footers already included; this is a no-op when true.
         * - HTML/EPUB: Keeps `<footer>` element content.
         *
         * Default: `false` (footers are stripped or excluded).
         *
         * @var bool
         */
        public readonly bool $includeFooters,
        /**
         * Enable the heuristic cross-page repeating text detector.
         *
         * When `true` (default), text that repeats verbatim across a supermajority
         * of pages is classified as furniture and stripped. Disable this if brand
         * names or repeated headings are being incorrectly removed by the heuristic.
         *
         * Note: when a layout-detection model is active, the model may independently
         * classify page-header / page-footer regions as furniture on a per-page basis.
         * To preserve those regions, set `$includeHeaders`, `$includeFooters`, or both
         * to `true`, in addition to disabling this flag.
         *
         * Primarily affects PDF extraction.
         *
         * Default: `true`.
         *
         * @var bool
         */
        public readonly bool $stripRepeatingText,
        /**
         * Include watermark text in extraction output.
         *
         * - PDF: Keeps watermark artifacts and arXiv identifiers.
         * - Other formats: No effect currently.
         *
         * Default: `false` (watermarks are stripped).
         *
         * @var bool
         */
        public readonly bool $includeWatermarks,
    ) {}

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always in this stub; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): ContentFilterConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration for email extraction.
 */
final class EmailConfig
{
    public function __construct(
        /**
         * Windows codepage number to use when an MSG file contains no codepage property.
         * Defaults to `None`, which falls back to windows-1252.
         *
         * If an unrecognized or invalid codepage number is supplied (including 0),
         * the behavior silently falls back to windows-1252 — the same as when the
         * MSG file itself contains an unrecognized codepage. No error or warning is
         * emitted, so verify the output when supplying unusual values.
         *
         * Common values:
         * - 1250: Central European (Polish, Czech, Hungarian, etc.)
         * - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
         * - 1252: Western European (default)
         * - 1253: Greek
         * - 1254: Turkish
         * - 1255: Hebrew
         * - 1256: Arabic
         * - 932: Japanese (Shift-JIS)
         * - 936: Simplified Chinese (GBK)
         *
         * @var ?int
         */
        public readonly ?int $msgFallbackCodepage = null,
    ) {}
}
/**
 * Main extraction configuration.
 *
 * Bundles every configuration option for the extraction process.
 * It can be loaded from TOML, YAML, or JSON files, or created programmatically.
 *
 * The first nine constructor arguments are required; all remaining ones are
 * nullable and default to `null` in this stub.
 *
 * NOTE(review): `None` / `Some(_)` in the field descriptions are upstream
 * terms; `None` presumably corresponds to passing `null` here — confirm.
 */
final class ExtractionConfig
{
    public function __construct(
        /** @var bool Enable caching of extraction results */
        public readonly bool $useCache,
        /** @var bool Enable quality post-processing */
        public readonly bool $enableQualityProcessing,
        /** @var bool Force OCR even for searchable PDFs */
        public readonly bool $forceOcr,
        /**
         * Disable OCR entirely, even for images.
         *
         * When `true`, OCR is skipped for all document types. Images return metadata
         * only (dimensions, format, EXIF) without text extraction. PDFs use only
         * native text extraction without OCR fallback.
         *
         * Cannot be `true` simultaneously with `$forceOcr`.
         *
         * *Added in v4.7.0.*
         *
         * @var bool
         */
        public readonly bool $disableOcr,
        /**
         * Result structure format
         *
         * Controls whether results are returned in unified format (default) with all
         * content in the `content` field, or element-based format with semantic
         * elements (for Unstructured-compatible output).
         *
         * @var ResultFormat
         */
        public readonly ResultFormat $resultFormat,
        /**
         * Content text format (default: Plain).
         *
         * Controls the format of the extracted content:
         * - `Plain`: Raw extracted text (default)
         * - `Markdown`: Markdown formatted output
         * - `Djot`: Djot markup format (requires djot feature)
         * - `Html`: HTML formatted output
         *
         * When set to a structured format, extraction results will include
         * formatted output. The `formatted_content` field may be populated
         * when format conversion is applied.
         *
         * @var OutputFormat
         */
        public readonly OutputFormat $outputFormat,
        /**
         * Run layout detection on the non-OCR PDF markdown path.
         *
         * When `true` and `layout` is `Some(_)`, layout regions inform heading,
         * table, list, and figure detection in the structure pipeline that would
         * otherwise rely on font-clustering heuristics alone. Significantly
         * improves SF1 (structural F1) at the cost of inference latency
         * (~150-300ms/page CPU, ~20-50ms/page GPU). Default: `false`.
         * Requires the `layout-detection` feature.
         *
         * @var bool
         */
        public readonly bool $useLayoutForMarkdown,
        /**
         * Enable structured document tree output.
         *
         * When true, populates the `document` field on `ExtractionResult` with a
         * hierarchical `DocumentStructure` containing heading-driven section nesting,
         * table grids, content layer classification, and inline annotations.
         *
         * Independent of `$resultFormat` — can be combined with Unified or ElementBased.
         *
         * @var bool
         */
        public readonly bool $includeDocumentStructure,
        /**
         * Maximum recursion depth for archive extraction (default: 3).
         * Set to 0 to disable recursive extraction (legacy behavior).
         *
         * @var int
         */
        public readonly int $maxArchiveDepth,
        /** @var ?OcrConfig OCR configuration (None = OCR disabled) */
        public readonly ?OcrConfig $ocr = null,
        /**
         * Force OCR on specific pages only (1-indexed page numbers, must be >= 1).
         *
         * When set, only the listed pages are OCR'd regardless of text layer quality.
         * Unlisted pages use native text extraction. Ignored when `$forceOcr` is `true`.
         * Only applies to PDF documents. Duplicates are automatically deduplicated.
         * An `ocr` config is recommended for backend/language selection; defaults are used if absent.
         *
         * @var ?array<int>
         */
        public readonly ?array $forceOcrPages = null,
        /** @var ?ChunkingConfig Text chunking configuration (None = chunking disabled) */
        public readonly ?ChunkingConfig $chunking = null,
        /**
         * Content filtering configuration (None = use extractor defaults).
         *
         * Controls whether document "furniture" (headers, footers, watermarks,
         * repeating text) is included in or stripped from extraction results.
         * See `ContentFilterConfig` for per-field documentation.
         *
         * @var ?ContentFilterConfig
         */
        public readonly ?ContentFilterConfig $contentFilter = null,
        /** @var ?ImageExtractionConfig Image extraction configuration (None = no image extraction) */
        public readonly ?ImageExtractionConfig $images = null,
        /** @var ?PdfConfig PDF-specific options (None = use defaults) */
        public readonly ?PdfConfig $pdfOptions = null,
        /** @var ?TokenReductionOptions Token reduction configuration (None = no token reduction) */
        public readonly ?TokenReductionOptions $tokenReduction = null,
        /** @var ?LanguageDetectionConfig Language detection configuration (None = no language detection) */
        public readonly ?LanguageDetectionConfig $languageDetection = null,
        /** @var ?PageConfig Page extraction configuration (None = no page tracking) */
        public readonly ?PageConfig $pages = null,
        /** @var ?KeywordConfig Keyword extraction configuration (None = no keyword extraction) */
        public readonly ?KeywordConfig $keywords = null,
        /** @var ?PostProcessorConfig Post-processor configuration (None = use defaults) */
        public readonly ?PostProcessorConfig $postprocessor = null,
        /**
         * HTML to Markdown conversion options (None = use defaults)
         *
         * Configure how HTML documents are converted to Markdown, including heading styles,
         * list formatting, code block styles, and preprocessing options.
         *
         * @var ?string
         */
        public readonly ?string $htmlOptions = null,
        /**
         * Styled HTML output configuration.
         *
         * When set alongside `output_format = OutputFormat::Html`, the extraction
         * pipeline uses [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer)
         * which emits stable `kb-*` CSS class hooks on every structural element
         * and optionally embeds theme CSS or user-supplied CSS in a `<style>` block.
         *
         * When `None`, the existing plain comrak-based HTML renderer is used.
         *
         * @var ?HtmlOutputConfig
         */
        public readonly ?HtmlOutputConfig $htmlOutput = null,
        /**
         * Default per-file timeout in seconds for batch extraction.
         *
         * When set, each file in a batch will be canceled after this duration
         * unless overridden by [`FileExtractionConfig::timeout_secs`].
         *
         * Defaults to `Some(60)` to prevent pathological files (e.g. deeply
         * nested archives, documents with millions of cells) from running
         * indefinitely and exhausting caller resources. Set to `None` to
         * disable the timeout for trusted input or long-running workloads.
         *
         * NOTE(review): this stub declares `null` as the PHP default while the
         * text above describes a 60-second default — confirm how an omitted
         * argument is treated by the native extension.
         *
         * @var ?int
         */
        public readonly ?int $extractionTimeoutSecs = null,
        /**
         * Maximum concurrent extractions in batch operations (None = (num_cpus × 1.5).ceil()).
         *
         * Limits parallelism to prevent resource exhaustion when processing
         * large batches. Defaults to (num_cpus × 1.5).ceil() when not set.
         *
         * @var ?int
         */
        public readonly ?int $maxConcurrentExtractions = null,
        /**
         * Security limits for archive extraction.
         *
         * Controls maximum archive size, compression ratio, file count, and other
         * security thresholds to prevent decompression bomb attacks. Also caps
         * nesting depth, iteration count, entity / token length, total
         * content size, and table cell count for every extraction path that
         * ingests user-controlled bytes.
         * When `None`, default limits are used.
         *
         * @var ?SecurityLimits
         */
        public readonly ?SecurityLimits $securityLimits = null,
        /**
         * Maximum uncompressed size in bytes for a single embedded file before
         * recursive extraction is attempted (default: 50 MiB).
         *
         * Applies to embedded objects inside OOXML containers (DOCX, PPTX) and
         * to email attachments processed via recursive extraction. Files that
         * exceed this limit are skipped with a `ProcessingWarning` rather than
         * passed to the extraction pipeline, preventing a single oversized
         * embedded object from consuming unbounded memory or time.
         *
         * Set to `None` to disable the per-embedded-file cap (falls back to
         * `security_limits.max_archive_size` as the only guard).
         *
         * @var ?int
         */
        public readonly ?int $maxEmbeddedFileBytes = null,
        /**
         * Layout detection configuration (None = layout detection disabled).
         *
         * When set, PDF pages and images are analyzed for document structure
         * (headings, code, formulas, tables, figures, etc.) using RT-DETR models
         * via ONNX Runtime. For PDFs, layout hints override paragraph classification
         * in the markdown pipeline. For images, per-region OCR is performed with
         * markdown formatting based on detected layout classes.
         * Requires the `layout-detection` feature to run inference; the field is
         * present whenever the `layout-types` feature is active (which includes
         * `layout-detection` as well as the no-ORT target groups).
         *
         * @var ?LayoutDetectionConfig
         */
        public readonly ?LayoutDetectionConfig $layout = null,
        /**
         * Hardware acceleration configuration for ONNX Runtime models.
         *
         * Controls execution provider selection for layout detection and embedding
         * models. When `None`, uses platform defaults (CoreML on macOS, CUDA on
         * Linux, CPU on Windows).
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Cache namespace for tenant isolation.
         *
         * When set, cache entries are stored under `{cache_dir}/{namespace}/`.
         * Must be alphanumeric, hyphens, or underscores only (max 64 chars).
         * Different namespaces have isolated cache spaces on the same filesystem.
         *
         * @var ?string
         */
        public readonly ?string $cacheNamespace = null,
        /**
         * Per-request cache TTL in seconds.
         *
         * Overrides the global `max_age_days` for this specific extraction.
         * When `0`, caching is completely skipped (no read or write).
         * When `None`, the global TTL applies.
         *
         * @var ?int
         */
        public readonly ?int $cacheTtlSecs = null,
        /**
         * Email extraction configuration (None = use defaults).
         *
         * Currently supports configuring the fallback codepage for MSG files
         * that do not specify one. See `EmailConfig` for details.
         *
         * @var ?EmailConfig
         */
        public readonly ?EmailConfig $email = null,
        /**
         * Concurrency limits for constrained environments (None = use defaults).
         *
         * Controls Rayon thread pool size, ONNX Runtime intra-op threads, and
         * (when `$maxConcurrentExtractions` is unset) the batch concurrency
         * semaphore. See `ConcurrencyConfig` for details.
         *
         * @var ?string
         */
        public readonly ?string $concurrency = null,
        /**
         * Tree-sitter language pack configuration (None = tree-sitter disabled).
         *
         * When set, enables code file extraction using tree-sitter parsers.
         * Controls grammar download behavior and code analysis options.
         *
         * @var ?TreeSitterConfig
         */
        public readonly ?TreeSitterConfig $treeSitter = null,
        /**
         * Structured extraction via LLM (None = disabled).
         *
         * When set, the extracted document content is sent to an LLM with the
         * provided JSON schema. The structured response is stored in
         * `ExtractionResult::structured_output`.
         *
         * @var ?StructuredExtractionConfig
         */
        public readonly ?StructuredExtractionConfig $structuredExtraction = null,
        /**
         * Cancellation token for this extraction (None = no external cancellation).
         *
         * Pass a [`CancellationToken`] clone here and call [`CancellationToken::cancel`]
         * from another thread / task to abort the extraction in progress. The extractor
         * checks the token at safe checkpoints (before lock acquisition, between pages,
         * between batch items) and returns [`KreuzbergError::Cancelled`] when set.
         *
         * The field is excluded from serialization because `CancellationToken` is a
         * runtime handle, not a configuration value.
         *
         * @var ?string
         */
        public readonly ?string $cancelToken = null,
    ) {}

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always in this stub; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): ExtractionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub for the image-processing check.
     *
     * @throws \RuntimeException Always in this stub; the real implementation
     *                           is provided by the native extension.
     */
    public function needsImageProcessing(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Per-file extraction configuration overrides for batch processing.
 *
 * All fields are `Option<T>` — `None` means "use the batch-level default."
 * In this stub every constructor argument is nullable and defaults to `null`.
 * This type is used with `batch_extract_files` and
 * `batch_extract_bytes` to allow heterogeneous
 * extraction settings within a single batch.
 *
 * # Excluded Fields
 *
 * The following `ExtractionConfig` fields are batch-level only and
 * cannot be overridden per file:
 * - `max_concurrent_extractions` — controls batch parallelism
 * - `use_cache` — global caching policy
 * - `acceleration` — shared ONNX execution provider
 * - `security_limits` — global archive security policy
 */
final class FileExtractionConfig
{
    public function __construct(
        /** @var ?bool Override quality post-processing for this file. */
        public readonly ?bool $enableQualityProcessing = null,
        /** @var ?OcrConfig Override OCR configuration for this file (None in the Option = use batch default). */
        public readonly ?OcrConfig $ocr = null,
        /** @var ?bool Override force OCR for this file. */
        public readonly ?bool $forceOcr = null,
        /** @var ?array<int> Override force OCR pages for this file (1-indexed page numbers). */
        public readonly ?array $forceOcrPages = null,
        /** @var ?bool Override disable OCR for this file. */
        public readonly ?bool $disableOcr = null,
        /** @var ?ChunkingConfig Override chunking configuration for this file. */
        public readonly ?ChunkingConfig $chunking = null,
        /** @var ?ContentFilterConfig Override content filtering configuration for this file. */
        public readonly ?ContentFilterConfig $contentFilter = null,
        /** @var ?ImageExtractionConfig Override image extraction configuration for this file. */
        public readonly ?ImageExtractionConfig $images = null,
        /** @var ?PdfConfig Override PDF options for this file. */
        public readonly ?PdfConfig $pdfOptions = null,
        /** @var ?TokenReductionOptions Override token reduction for this file. */
        public readonly ?TokenReductionOptions $tokenReduction = null,
        /** @var ?LanguageDetectionConfig Override language detection for this file. */
        public readonly ?LanguageDetectionConfig $languageDetection = null,
        /** @var ?PageConfig Override page extraction for this file. */
        public readonly ?PageConfig $pages = null,
        /** @var ?KeywordConfig Override keyword extraction for this file. */
        public readonly ?KeywordConfig $keywords = null,
        /** @var ?PostProcessorConfig Override post-processor for this file. */
        public readonly ?PostProcessorConfig $postprocessor = null,
        /** @var ?string Override HTML conversion options for this file. */
        public readonly ?string $htmlOptions = null,
        /** @var ?ResultFormat Override result format for this file. */
        public readonly ?ResultFormat $resultFormat = null,
        /** @var ?OutputFormat Override output content format for this file. */
        public readonly ?OutputFormat $outputFormat = null,
        /** @var ?bool Override document structure output for this file. */
        public readonly ?bool $includeDocumentStructure = null,
        /** @var ?LayoutDetectionConfig Override layout detection for this file. */
        public readonly ?LayoutDetectionConfig $layout = null,
        /**
         * Override per-file extraction timeout in seconds.
         *
         * When set, the extraction for this file will be canceled after the
         * specified duration. A timed-out file produces an error result without
         * affecting other files in the batch.
         *
         * @var ?int
         */
        public readonly ?int $timeoutSecs = null,
        /** @var ?TreeSitterConfig Override tree-sitter configuration for this file. */
        public readonly ?TreeSitterConfig $treeSitter = null,
        /**
         * Override structured extraction configuration for this file.
         *
         * When set, enables LLM-based structured extraction with a JSON schema
         * for this specific file. The extracted content is sent to a VLM/LLM
         * and the response is parsed according to the provided schema.
         *
         * @var ?StructuredExtractionConfig
         */
        public readonly ?StructuredExtractionConfig $structuredExtraction = null,
    ) {}
}
/**
 * Batch item for byte array extraction.
 *
 * Represents one item of a batch extraction job; used with
 * `batch_extract_bytes` and `batch_extract_bytes_sync`.
 */
final class BatchBytesItem
{
    public function __construct(
        /** @var string The content bytes to extract from */
        public readonly string $content,
        /** @var string MIME type of the content (e.g., "application/pdf", "text/html") */
        public readonly string $mimeType,
        /** @var ?FileExtractionConfig Per-item configuration overrides (None uses batch-level defaults) */
        public readonly ?FileExtractionConfig $config = null,
    ) {}
}
/**
 * Batch item for file extraction.
 *
 * Represents one file of a batch extraction job; used with
 * `batch_extract_files` and `batch_extract_files_sync`.
 */
final class BatchFileItem
{
    public function __construct(
        /** @var string Path to the file to extract from */
        public readonly string $path,
        /** @var ?FileExtractionConfig Per-file configuration overrides (None uses batch-level defaults) */
        public readonly ?FileExtractionConfig $config = null,
    ) {}
}
/**
 * Image extraction configuration.
 *
 * NOTE(review): all arguments except `$maxImagesPerPage` are required in this
 * stub; the "(default)" remarks per field presumably describe what
 * `default()` yields — confirm against the native extension.
 */
final class ImageExtractionConfig
{
    public function __construct(
        /** @var bool Extract images from documents */
        public readonly bool $extractImages,
        /** @var int Target DPI for image normalization */
        public readonly int $targetDpi,
        /** @var int Maximum dimension for images (width or height) */
        public readonly int $maxImageDimension,
        /**
         * Whether to inject image reference placeholders into markdown output.
         * When `true` (default), image references like `![Image 1](embedded:p1_i0)`
         * are appended to the markdown. Set to `false` to extract images as data
         * without polluting the markdown output.
         *
         * @var bool
         */
        public readonly bool $injectPlaceholders,
        /** @var bool Automatically adjust DPI based on image content */
        public readonly bool $autoAdjustDpi,
        /** @var int Minimum DPI threshold */
        public readonly int $minDpi,
        /** @var int Maximum DPI threshold */
        public readonly int $maxDpi,
        /**
         * When `true` (default), extracted images are classified by kind and grouped
         * into clusters where they appear to belong to one figure.
         *
         * @var bool
         */
        public readonly bool $classify,
        /**
         * When `true`, full-page renders produced during OCR preprocessing are captured
         * and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`.
         *
         * **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
         * document-level OCR bypass is active (whole-document backend). When OCR is
         * enabled and this flag is set but the active backend skips per-page rendering,
         * a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
         *
         * Defaults to `false`. Enable when downstream consumers need page thumbnails
         * (e.g. citation previews, visual grounding).
         *
         * @var bool
         */
        public readonly bool $includePageRasters,
        /**
         * Run OCR on extracted images and include the recognized text in the document content.
         *
         * When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
         * are processed with the configured OCR backend. Set to `false` to extract images
         * without OCR processing, even when OCR is enabled.
         *
         * @var bool
         */
        public readonly bool $runOcrOnImages,
        /**
         * When `true`, image OCR results are rendered as plain text without the
         * `![...](...)` markdown placeholder. Only takes effect when `$runOcrOnImages`
         * is also `true`.
         *
         * @var bool
         */
        public readonly bool $ocrTextOnly,
        /**
         * When `true` and `$ocrTextOnly` is `false`, append the OCR text after
         * the image placeholder in the rendered output.
         *
         * @var bool
         */
        public readonly bool $appendOcrText,
        /**
         * Maximum number of image objects to extract per PDF page.
         *
         * Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
         * can trigger extremely long or indefinite extraction times when every image
         * object on a dense page is decoded individually via the PDF extractor. Setting this
         * limit causes kreuzberg to stop collecting individual images once the count
         * per page reaches the cap and emit a warning instead.
         *
         * `None` (default) means no limit — all images are extracted.
         *
         * @var ?int
         */
        public readonly ?int $maxImagesPerPage = null,
    ) {}

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always in this stub; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): ImageExtractionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Token reduction configuration.
 */
final class TokenReductionOptions
{
    public function __construct(
        /** @var string Reduction mode: "off", "light", "moderate", "aggressive", "maximum" */
        public readonly string $mode,
        /** @var bool Preserve important words (capitalized, technical terms) */
        public readonly bool $preserveImportantWords,
    ) {}

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always in this stub; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): TokenReductionOptions
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Language detection configuration.
 */
final class LanguageDetectionConfig
{
    public function __construct(
        /** @var bool Enable language detection */
        public readonly bool $enabled,
        /** @var float Minimum confidence threshold (0.0-1.0) */
        public readonly float $minConfidence,
        /** @var bool Detect multiple languages in the document */
        public readonly bool $detectMultiple,
    ) {}

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always in this stub; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): LanguageDetectionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration for styled HTML output.
 *
 * When set on `ExtractionConfig.html_output` alongside
 * `output_format = OutputFormat.Html`, the pipeline builds a
 * `StyledHtmlRenderer`(crate.rendering.StyledHtmlRenderer) instead of
 * the plain comrak-based renderer.
 *
 * NOTE(review): `$theme`, `$classPrefix` and `$embedCss` are required in this
 * stub; the "Default:" remarks per field presumably describe what
 * `default()` yields — confirm against the native extension.
 */
final class HtmlOutputConfig
{
    public function __construct(
        /** @var HtmlTheme Built-in colour/typography theme. Default: [`HtmlTheme::Unstyled`]. */
        public readonly HtmlTheme $theme,
        /**
         * CSS class prefix applied to every emitted class name.
         *
         * Default: `"kb-"`. Change this if your host application already uses
         * classes that start with `kb-`.
         *
         * @var string
         */
        public readonly string $classPrefix,
        /**
         * When `true` (default), write the resolved CSS into a `<style>` block
         * immediately after the opening `<div class="{prefix}doc">`.
         *
         * Set to `false` to emit only the structural markup and wire up your
         * own stylesheet targeting the `kb-*` class names.
         *
         * @var bool
         */
        public readonly bool $embedCss,
        /**
         * Inline CSS string injected into the output after the theme stylesheet.
         * Concatenated after `$cssFile` content when both are set.
         *
         * @var ?string
         */
        public readonly ?string $css = null,
        /**
         * Path to a CSS file loaded once at renderer construction time.
         * Concatenated before `$css` when both are set.
         *
         * @var ?string
         */
        public readonly ?string $cssFile = null,
    ) {}

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always in this stub; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): HtmlOutputConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Layout detection configuration.
 *
 * Steers layout detection behavior in the extraction pipeline.
 * When set on `ExtractionConfig`(super.ExtractionConfig), layout detection
 * is enabled for PDF extraction.
 */
final class LayoutDetectionConfig
{
    public function __construct(
        /** @var bool Whether to apply postprocessing heuristics (default: true). */
        public readonly bool $applyHeuristics,
        /**
         * Table structure recognition model.
         *
         * Controls which model is used for table cell detection within layout-detected
         * table regions. Defaults to [`TableModel::Tatr`].
         *
         * @var TableModel
         */
        public readonly TableModel $tableModel,
        /** @var ?float Confidence threshold override (None = use model default). */
        public readonly ?float $confidenceThreshold = null,
        /**
         * Hardware acceleration for ONNX models (layout detection + table structure).
         *
         * When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
         * is used for inference. Defaults to `None` (auto-select per platform).
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
    ) {}

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always in this stub; the real implementation
     *                           is provided by the native extension.
     */
    public static function default(): LayoutDetectionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration for an LLM provider/model via liter-llm.
 *
 * Each feature (VLM OCR, VLM embeddings, structured extraction) carries
 * its own `LlmConfig`, so a different provider can be used per feature.
 *
 * # Example
 *
 * ```toml
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * api_key = "sk-..." # or use KREUZBERG_LLM_API_KEY env var
 * ```
 */
final class LlmConfig
{
    public function __construct(
        /**
         * Provider/model string using liter-llm routing format.
         *
         * Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
         * `"groq/llama-3.1-70b-versatile"`.
         *
         * @var string
         */
        public readonly string $model,
        /**
         * API key for the provider. When `None`, liter-llm falls back to
         * the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
         *
         * @var ?string
         */
        public readonly ?string $apiKey = null,
        /** @var ?string Custom base URL override for the provider endpoint. */
        public readonly ?string $baseUrl = null,
        /** @var ?int Request timeout in seconds (default: 60). */
        public readonly ?int $timeoutSecs = null,
        /** @var ?int Maximum retry attempts (default: 3). */
        public readonly ?int $maxRetries = null,
        /** @var ?float Sampling temperature for generation tasks. */
        public readonly ?float $temperature = null,
        /** @var ?int Maximum tokens to generate. */
        public readonly ?int $maxTokens = null,
    ) {}
}
/**
 * Configuration for LLM-based structured data extraction.
 *
 * Sends extracted document content to a VLM together with a JSON schema and
 * returns structured data that conforms to the schema.
 *
 * # Example
 *
 * ```toml
 * [structured_extraction]
 * schema_name = "invoice_data"
 * strict = true
 *
 * [structured_extraction.schema]
 * type = "object"
 * properties.vendor = { type = "string" }
 * properties.total = { type = "number" }
 * required = ["vendor", "total"]
 *
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * ```
 */
final class StructuredExtractionConfig
{
    public function __construct(
        /** @var string JSON Schema defining the desired output structure. */
        public readonly string $schema,
        /** @var string Schema name passed to the LLM's structured output mode. */
        public readonly string $schemaName,
        /** @var bool Enable strict mode — output must exactly match the schema. */
        public readonly bool $strict,
        /** @var LlmConfig LLM configuration for the extraction. */
        public readonly LlmConfig $llm,
        /** @var ?string Optional schema description for the LLM. */
        public readonly ?string $schemaDescription = null,
        /**
         * Custom Jinja2 extraction prompt template. When `None`, a default template is used.
         *
         * Available template variables:
         * - `{{ content }}` — The extracted document text.
         * - `{{ schema }}` — The JSON schema as a formatted string.
         * - `{{ schema_name }}` — The schema name.
         * - `{{ schema_description }}` — The schema description (may be empty).
         *
         * @var ?string
         */
        public readonly ?string $prompt = null,
    ) {}
}
/**
 * Thresholds that drive OCR fallback decisions and pipeline quality gating.
 *
 * Every field defaults to the value matching the previous hardcoded behavior,
 * so `OcrQualityThresholds::default()` preserves existing semantics exactly.
 */
final class OcrQualityThresholds
{
    public function __construct(
        /** @var int Minimum total of non-whitespace characters for text to count as substantive. */
        public readonly int $minTotalNonWhitespace,
        /** @var float Minimum average number of non-whitespace characters per page. */
        public readonly float $minNonWhitespacePerPage,
        /** @var int Minimum number of characters for a word to be "meaningful". */
        public readonly int $minMeaningfulWordLen,
        /** @var int Minimum number of meaningful words before text is accepted. */
        public readonly int $minMeaningfulWords,
        /** @var float Minimum alphanumeric ratio (share of non-whitespace chars that are alphanumeric). */
        public readonly float $minAlnumRatio,
        /** @var int Minimum number of Unicode replacement characters (U+FFFD) that triggers OCR fallback. */
        public readonly int $minGarbageChars,
        /** @var float Maximum fraction of short (1-2 char) words before text is considered fragmented. */
        public readonly float $maxFragmentedWordRatio,
        /**
         * Critical fragmentation threshold — triggers OCR regardless of meaningful words.
         * Normal English text has ~20-30% short words; 80%+ is definitive garbage.
         *
         * @var float
         */
        public readonly float $criticalFragmentedWordRatio,
        /** @var float Minimum average word length; a lower value with enough words indicates garbled extraction. */
        public readonly float $minAvgWordLength,
        /** @var int Minimum word count before the average word length check applies. */
        public readonly int $minWordsForAvgLengthCheck,
        /** @var float Minimum consecutive word repetition ratio used to detect column scrambling. */
        public readonly float $minConsecutiveRepeatRatio,
        /** @var int Minimum word count before the consecutive repetition check is applied. */
        public readonly int $minWordsForRepeatCheck,
        /** @var int Minimum character count for the "substantive markdown" OCR skip gate. */
        public readonly int $substantiveMinChars,
        /** @var int Minimum character count for the "non-text content" OCR skip gate. */
        public readonly int $nonTextMinChars,
        /** @var float Alphanumeric+whitespace ratio threshold used for skip decisions. */
        public readonly float $alnumWsRatioThreshold,
        /**
         * Minimum quality score (0.0-1.0) a pipeline stage result needs to be accepted.
         * When a backend's result scores below this, the next backend is tried.
         *
         * @var float
         */
        public readonly float $pipelineMinQuality,
    ) {
    }

    /**
     * Default thresholds. This stub body always throws; the real
     * implementation is provided by the native extension.
     */
    public static function default(): OcrQualityThresholds
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * One backend stage of the OCR pipeline.
 */
final class OcrPipelineStage
{
    public function __construct(
        /** @var string Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name. */
        public readonly string $backend,
        /** @var int Priority weight (higher = tried first). Stages are sorted by priority, descending. */
        public readonly int $priority,
        /** @var ?string Language override for this stage (`null` = use the parent OcrConfig language). */
        public readonly ?string $language = null,
        /** @var ?TesseractConfig Tesseract-specific config override for this stage. */
        public readonly ?TesseractConfig $tesseractConfig = null,
        /** @var ?string PaddleOCR-specific config for this stage. */
        public readonly ?string $paddleOcrConfig = null,
        /** @var ?LlmConfig VLM config override for this pipeline stage. */
        public readonly ?LlmConfig $vlmConfig = null,
        /**
         * Arbitrary per-call options handed to the backend unchanged.
         *
         * Backends supporting runtime tuning (mode switching, preprocessing
         * flags, inference parameters, etc.) read this value and deserialize
         * the keys they care about. Keys a backend does not know are silently
         * ignored, so options for different backends can coexist in the same
         * config without conflict.
         *
         * Example (custom backend):
         * ```json
         * { "mode": "fast", "enable_layout": true }
         * ```
         *
         * @var ?string
         */
        public readonly ?string $backendOptions = null,
    ) {
    }
}
/**
 * OCR pipeline spanning several backends, with quality-based fallback.
 *
 * Backends are tried in priority order (highest first). Once a backend has
 * produced output, its quality is evaluated; if it meets the
 * `pipeline_min_quality` threshold (`$qualityThresholds->pipelineMinQuality`),
 * the result is accepted. Otherwise the next backend is tried.
 */
final class OcrPipelineConfig
{
    public function __construct(
        /** @var array<OcrPipelineStage> Backends to try, in order. Sorted by priority (descending) at runtime. */
        public readonly array $stages,
        /** @var OcrQualityThresholds Thresholds deciding whether a result is accepted or the next backend is tried. */
        public readonly OcrQualityThresholds $qualityThresholds,
    ) {
    }
}
/**
 * Configuration for OCR.
 */
final class OcrConfig
{
    public function __construct(
        /**
         * Whether OCR is enabled.
         *
         * `enabled: false` is shorthand for `disable_ocr: true` on the parent
         * `ExtractionConfig`. Images then return metadata only, and PDFs use
         * native text extraction without OCR fallback.
         *
         * Defaults to `true`. When `false`, all other OCR settings are ignored.
         *
         * @var bool
         */
        public readonly bool $enabled,
        /**
         * OCR backend: tesseract, easyocr, paddleocr.
         * The `$vlmConfig` documentation additionally refers to a `"vlm"` backend value.
         *
         * @var string
         */
        public readonly string $backend,
        /** @var string Language code (e.g., "eng", "deu") */
        public readonly string $language,
        /**
         * Turn on automatic page rotation driven by orientation detection.
         *
         * When enabled, Tesseract's `DetectOrientationScript()` is used to detect
         * the page orientation (0/90/180/270 degrees) before OCR. If the page is
         * rotated with high confidence, the image is corrected before recognition.
         * This is critical for handling rotated scanned documents.
         *
         * @var bool
         */
        public readonly bool $autoRotate,
        /** @var ?TesseractConfig Tesseract-specific configuration (optional) */
        public readonly ?TesseractConfig $tesseractConfig = null,
        /** @var ?OutputFormat Output format for OCR results (optional, for format conversion) */
        public readonly ?OutputFormat $outputFormat = null,
        /** @var ?string PaddleOCR-specific configuration (optional, JSON passthrough) */
        public readonly ?string $paddleOcrConfig = null,
        /**
         * Arbitrary per-call options handed to the backend unchanged.
         *
         * Custom OCR backends, and built-in backends that support runtime
         * tuning, can read this value and deserialize the keys they care about.
         * Keys a backend does not know are silently ignored.
         *
         * This is the recommended extension point for per-call parameters not
         * covered by the typed fields above (e.g. mode switching,
         * preprocessing flags, inference batch size).
         *
         * **Scope:** when `$pipeline` is `null`, this value is propagated to the
         * primary stage of the auto-constructed pipeline. When `$pipeline` is
         * explicitly set, this field has **no effect** — the caller must set
         * `OcrPipelineStage::$backendOptions` directly on the relevant stage(s)
         * instead.
         *
         * Example:
         * ```json
         * { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
         * ```
         *
         * @var ?string
         */
        public readonly ?string $backendOptions = null,
        /** @var ?OcrElementConfig OCR element extraction configuration */
        public readonly ?OcrElementConfig $elementConfig = null,
        /**
         * Quality thresholds for the native-text-to-OCR fallback decision.
         * When `null`, compiled defaults are used (matching previous hardcoded behavior).
         *
         * @var ?OcrQualityThresholds
         */
        public readonly ?OcrQualityThresholds $qualityThresholds = null,
        /**
         * Multi-backend OCR pipeline configuration. When set, enables weighted
         * fallback across multiple OCR backends based on output quality.
         * When `null`, the single `$backend` field is used.
         *
         * @var ?OcrPipelineConfig
         */
        public readonly ?OcrPipelineConfig $pipeline = null,
        /**
         * VLM (Vision Language Model) OCR configuration.
         *
         * Required when `$backend` is `"vlm"`. Uses liter-llm to send page
         * images to a vision model for text extraction.
         *
         * @var ?LlmConfig
         */
        public readonly ?LlmConfig $vlmConfig = null,
        /**
         * Custom Jinja2 prompt template for VLM OCR.
         *
         * When `null`, the default template is used. Available variables:
         * - `{{ language }}` — The document language code (e.g., "eng", "deu").
         *
         * @var ?string
         */
        public readonly ?string $vlmPrompt = null,
        /**
         * Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
         *
         * Not user-configurable via config files — injected at runtime from
         * `ExtractionConfig::acceleration` before each `process_image` call.
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Caller-supplied Tesseract `traineddata` bytes, keyed by language code.
         *
         * The primary use case is the WASM build, which has no filesystem and
         * cannot download tessdata at runtime. Native builds typically rely on
         * `TessdataManager` and ignore this field. When present, the WASM
         * Tesseract backend prefers these bytes over its compile-time-bundled
         * English data.
         *
         * Skipped by serde to keep config files small — supply via the typed
         * API at runtime.
         *
         * @var ?array<string, string>
         */
        public readonly ?array $tessdataBytes = null,
    ) {
    }

    /**
     * Default OCR configuration. This stub body always throws; the real
     * implementation is provided by the native extension.
     */
    public static function default(): OcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration for page extraction and tracking.
 *
 * Governs how pages are extracted, tracked, and represented in the extraction
 * results. When the config is `null`, page tracking is disabled.
 *
 * Page range tracking in chunk metadata (first_page/last_page) is enabled
 * automatically when page boundaries are available and chunking is configured.
 */
final class PageConfig
{
    public function __construct(
        /** @var bool Extract pages as a separate array (ExtractionResult.pages) */
        public readonly bool $extractPages,
        /** @var bool Insert page markers into the main content string */
        public readonly bool $insertPageMarkers,
        /**
         * Page marker format (use the {page_num} placeholder)
         * Default: "\n\n<!-- PAGE {page_num} -->\n\n"
         *
         * @var string
         */
        public readonly string $markerFormat,
    ) {
    }

    /**
     * Default page configuration. This stub body always throws; the real
     * implementation is provided by the native extension.
     */
    public static function default(): PageConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration specific to PDF extraction.
 */
final class PdfConfig
{
    public function __construct(
        /** @var bool Extract images from the PDF */
        public readonly bool $extractImages,
        /**
         * Extract tables from the PDF.
         *
         * When `true` (default), pdf_oxide's native grid detector runs first and,
         * if it finds nothing, the heuristic text-layer reconstruction in
         * `pdf::oxide::table::extract_tables_heuristic` is used as a fallback.
         * Set to `false` to skip both passes — `tables` will then be empty in
         * the result.
         *
         * @var bool
         */
        public readonly bool $extractTables,
        /** @var bool Extract PDF metadata */
        public readonly bool $extractMetadata,
        /**
         * Extract PDF annotations (text notes, highlights, links, stamps).
         * Default: false
         *
         * @var bool
         */
        public readonly bool $extractAnnotations,
        /**
         * Allow single-column pseudo tables in extraction results.
         *
         * By default, tables with fewer than 2 columns (layout-guided) or 3
         * columns (heuristic) are rejected. When `true`, the minimum column count
         * is relaxed to 1, so single-column structured data (glossaries,
         * itemized lists) can be emitted as tables. Other quality filters
         * (density, sparsity, prose detection) still apply.
         *
         * @var bool
         */
        public readonly bool $allowSingleColumnTables,
        /**
         * Run OCR on inline images extracted from PDF pages and attach the
         * recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
         * to be available; if `ExtractionConfig.ocr` is unset the extractor
         * falls back to `TesseractConfig::default()`. Per-image failures degrade
         * gracefully (the image is returned without OCR text rather than failing
         * the whole extraction). Default: `false`.
         *
         * @var bool
         */
        public readonly bool $ocrInlineImages,
        /** @var ?array<string> Passwords to try when opening encrypted PDFs */
        public readonly ?array $passwords = null,
        /** @var ?HierarchyConfig Hierarchy extraction configuration (`null` = hierarchy extraction disabled) */
        public readonly ?HierarchyConfig $hierarchy = null,
        /**
         * Top margin fraction (0.0-1.0) of page height to exclude headers/running heads.
         * Default: 0.06 (6%)
         *
         * @var ?float
         */
        public readonly ?float $topMarginFraction = null,
        /**
         * Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers.
         * Default: 0.05 (5%)
         *
         * @var ?float
         */
        public readonly ?float $bottomMarginFraction = null,
    ) {
    }

    /**
     * Default PDF configuration. This stub body always throws; the real
     * implementation is provided by the native extension.
     */
    public static function default(): PdfConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration of hierarchy extraction for PDF text structure analysis.
 *
 * Enables extraction of document hierarchy levels (H1-H6) based on font size
 * clustering and semantic analysis. When enabled, hierarchical blocks are
 * included in the page content.
 */
final class HierarchyConfig
{
    public function __construct(
        /** @var bool Turn hierarchy extraction on */
        public readonly bool $enabled,
        /**
         * Number of font size clusters used for hierarchy levels (1-7)
         *
         * Default: 6, which provides H1-H6 heading levels with body text.
         * Larger values create more fine-grained hierarchy levels.
         *
         * @var int
         */
        public readonly int $kClusters,
        /** @var bool Include bounding box information in hierarchy blocks */
        public readonly bool $includeBbox,
        /**
         * OCR coverage threshold for smart OCR triggering (0.0-1.0)
         *
         * Decides when OCR is triggered, based on text block coverage: OCR runs
         * when text blocks cover less than this fraction of the page.
         * Default: 0.5 (trigger OCR if less than 50% of the page has text)
         *
         * @var ?float
         */
        public readonly ?float $ocrCoverageThreshold = null,
    ) {
    }

    /**
     * Default hierarchy configuration. This stub body always throws; the real
     * implementation is provided by the native extension.
     */
    public static function default(): HierarchyConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration for post-processors.
 */
final class PostProcessorConfig
{
    public function __construct(
        /** @var bool Turn post-processors on */
        public readonly bool $enabled,
        /** @var ?array<string> Whitelist of processor names to run (`null` = all enabled) */
        public readonly ?array $enabledProcessors = null,
        /** @var ?array<string> Blacklist of processor names to skip (`null` = none disabled) */
        public readonly ?array $disabledProcessors = null,
        /** @var ?array<string> Pre-computed AHashSet for O(1) lookup of enabled processors */
        public readonly ?array $enabledSet = null,
        /** @var ?array<string> Pre-computed AHashSet for O(1) lookup of disabled processors */
        public readonly ?array $disabledSet = null,
    ) {
    }

    /**
     * Default post-processor configuration. This stub body always throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): PostProcessorConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration for chunking.
 *
 * Controls text chunking of document content: chunk size, overlap, trimming
 * behavior, and optional embeddings.
 *
 * The upstream (Rust) documentation recommends `..Default.default()` when
 * constructing, to allow for future field additions; the accompanying example
 * is not included in this stub.
 */
final class ChunkingConfig
{
    public function __construct(
        /**
         * Maximum size per chunk (in units determined by `$sizing`).
         *
         * When `$sizing` is `Characters` (default), this is the maximum character
         * count. With token-based sizing, this is the maximum token count.
         *
         * Default: 1000
         *
         * @var int
         */
        public readonly int $maxCharacters,
        /**
         * Overlap between chunks (in units determined by `$sizing`).
         *
         * Default: 200
         *
         * @var int
         */
        public readonly int $overlap,
        /**
         * Whether whitespace is trimmed from chunk boundaries.
         *
         * Default: true
         *
         * @var bool
         */
        public readonly bool $trim,
        /**
         * Type of chunker to use (Text or Markdown).
         * The `$topicThreshold` documentation additionally refers to a `Semantic` type.
         *
         * Default: Text
         *
         * @var ChunkerType
         */
        public readonly ChunkerType $chunkerType,
        /**
         * How chunk size is measured.
         *
         * Default: `Characters` (Unicode character count).
         * Enable the `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
         *
         * @var ChunkSizing
         */
        public readonly ChunkSizing $sizing,
        /**
         * When `true` and `$chunkerType` is `Markdown`, the heading hierarchy
         * path (e.g. `"# Title > ## Section\n\n"`) is prepended to each chunk's
         * content string.
         *
         * Useful for RAG pipelines in which each chunk needs self-contained
         * context about its position in the document structure.
         *
         * Default: `false`
         *
         * @var bool
         */
        public readonly bool $prependHeadingContext,
        /** @var ?EmbeddingConfig Optional embedding configuration for chunk embeddings. */
        public readonly ?EmbeddingConfig $embedding = null,
        /** @var ?string Use a preset configuration (overrides individual settings if provided). */
        public readonly ?string $preset = null,
        /**
         * Optional cosine similarity threshold for semantic topic boundary detection.
         *
         * Only used when `$chunkerType` is `Semantic` and an `EmbeddingConfig` is
         * provided. You almost never need to set this. When omitted, it defaults
         * to `0.75`, which works well for most documents. Lower values detect
         * more topic boundaries (more, smaller chunks); higher values detect fewer.
         * Range: `0.0..=1.0`.
         *
         * @var ?float
         */
        public readonly ?float $topicThreshold = null,
    ) {
    }

    /**
     * Default chunking configuration. This stub body always throws; the real
     * implementation is provided by the native extension.
     */
    public static function default(): ChunkingConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration of embeddings for text chunks.
 *
 * Controls embedding generation with ONNX models via the vendored embedding
 * engine. Requires the `embeddings` feature to be enabled.
 */
final class EmbeddingConfig
{
    public function __construct(
        /** @var EmbeddingModelType The embedding model to use (defaults to "balanced" preset if not specified) */
        public readonly EmbeddingModelType $model,
        /** @var bool Whether embedding vectors are normalized (recommended for cosine similarity) */
        public readonly bool $normalize,
        /** @var int Batch size used for embedding generation */
        public readonly int $batchSize,
        /** @var bool Show model download progress */
        public readonly bool $showDownloadProgress,
        /**
         * Custom cache directory for model files
         *
         * Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
         * Allows full customization of the model download location.
         *
         * @var ?string
         */
        public readonly ?string $cacheDir = null,
        /**
         * Hardware acceleration for the embedding ONNX model.
         *
         * When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
         * is used for inference. Defaults to `null` (auto-select per platform).
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Maximum wall-clock duration (in seconds) for a single `embed()` call
         * when using `EmbeddingModelType::Plugin`.
         *
         * Applies only to the in-process plugin path — protects against hung
         * host-language backends (e.g. a Python callback deadlocked on the GIL,
         * a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
         * returns `Plugin` (presumably a plugin error — confirm) instead of
         * blocking forever.
         *
         * `null` disables the timeout. The default (60 seconds) is conservative
         * for common in-process inference; increase it for large batches on
         * slow hardware.
         *
         * NOTE(review): this stub's parameter default is `null`; confirm whether
         * the 60-second default only applies to `EmbeddingConfig::default()`.
         *
         * @var ?int
         */
        public readonly ?int $maxEmbedDurationSecs = null,
    ) {
    }

    /**
     * Default embedding configuration. This stub body always throws; the real
     * implementation is provided by the native extension.
     */
    public static function default(): EmbeddingConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Settings for the tree-sitter language pack integration.
 *
 * Governs grammar download behavior and code analysis options.
 *
 * # Example (TOML)
 *
 * ```toml
 * [tree_sitter]
 * languages = ["python", "rust"]
 * groups = ["web"]
 *
 * [tree_sitter.process]
 * structure = true
 * comments = true
 * docstrings = true
 * ```
 */
final class TreeSitterConfig
{
    public function __construct(
        /**
         * Turn code intelligence processing on (default: true).
         *
         * When `false`, tree-sitter analysis is skipped completely, even if
         * the config section is present.
         *
         * @var bool
         */
        public readonly bool $enabled,
        /** @var TreeSitterProcessConfig Processing options for code analysis. */
        public readonly TreeSitterProcessConfig $process,
        /**
         * Custom cache directory for downloaded grammars.
         *
         * When `null`, the default is used: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
         *
         * @var ?string
         */
        public readonly ?string $cacheDir = null,
        /** @var ?array<string> Languages to pre-download on init (e.g., `["python", "rust"]`). */
        public readonly ?array $languages = null,
        /** @var ?array<string> Language groups to pre-download (e.g., `["web", "systems", "scripting"]`). */
        public readonly ?array $groups = null,
    ) {
    }

    /**
     * Default tree-sitter configuration. This stub body always throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): TreeSitterConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Processing options for tree-sitter code analysis.
 *
 * Selects which analysis features are active when code files are extracted.
 */
final class TreeSitterProcessConfig
{
    public function __construct(
        /** @var bool Extract structural items (functions, classes, structs, etc.). Default: true. */
        public readonly bool $structure,
        /** @var bool Extract import statements. Default: true. */
        public readonly bool $imports,
        /** @var bool Extract export statements. Default: true. */
        public readonly bool $exports,
        /** @var bool Extract comments. Default: false. */
        public readonly bool $comments,
        /** @var bool Extract docstrings. Default: false. */
        public readonly bool $docstrings,
        /** @var bool Extract symbol definitions. Default: false. */
        public readonly bool $symbols,
        /** @var bool Include parse diagnostics. Default: false. */
        public readonly bool $diagnostics,
        /** @var CodeContentMode Content rendering mode for code extraction. */
        public readonly CodeContentMode $contentMode,
        /** @var ?int Maximum chunk size in bytes. `null` disables chunking. */
        public readonly ?int $chunkMaxSize = null,
    ) {
    }

    /**
     * Default processing options. This stub body always throws; the real
     * implementation is provided by the native extension.
     */
    public static function default(): TreeSitterProcessConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * One entry in the list of supported document formats.
 *
 * Pairs a file extension with the corresponding MIME type that Kreuzberg can process.
 */
final class SupportedFormat
{
    public function __construct(
        /** @var string File extension (no leading dot), e.g., "pdf", "docx" */
        public readonly string $extension,
        /** @var string MIME type string, e.g., "application/pdf" */
        public readonly string $mimeType,
    ) {
    }
}
/**
 * Configuration of the API server.
 *
 * Holds all configuration options for the Kreuzberg API server: host/port
 * settings, CORS configuration, and upload limits.
 *
 * # Defaults
 *
 * - `host`: "127.0.0.1" (localhost only)
 * - `port`: 8000
 * - `cors_origins`: empty vector (allows all origins)
 * - `max_request_body_bytes`: 104_857_600 (100 MB)
 * - `max_multipart_field_bytes`: 104_857_600 (100 MB)
 *
 * All methods in this stub throw unconditionally; the real implementations
 * are provided by the native extension.
 */
final class ServerConfig
{
    public function __construct(
        /** @var string Server host address (e.g., "127.0.0.1", "0.0.0.0") */
        public readonly string $host,
        /** @var int Server port number */
        public readonly int $port,
        /**
         * CORS allowed origins. An empty vector means all origins are allowed.
         *
         * With an empty vector, the server accepts requests from any origin.
         * When populated with specific origins (e.g., `"https://example.com"`),
         * only those origins are allowed.
         *
         * @var array<string>
         */
        public readonly array $corsOrigins,
        /** @var int Maximum size of the request body in bytes (default: 100 MB) */
        public readonly int $maxRequestBodyBytes,
        /** @var int Maximum size of multipart fields in bytes (default: 100 MB) */
        public readonly int $maxMultipartFieldBytes,
    ) {
    }

    /** Default server configuration (see "Defaults" on the class). */
    public static function default(): ServerConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** NOTE(review): presumably the listen address built from `$host` and `$port` — confirm format. */
    public function listenAddr(): string
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** NOTE(review): presumably `true` when `$corsOrigins` is empty (all origins allowed) — confirm. */
    public function corsAllowsAll(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** NOTE(review): presumably checks `$origin` against `$corsOrigins` — confirm matching rules. */
    public function isOriginAllowed(string $origin): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** NOTE(review): presumably `$maxRequestBodyBytes` expressed in megabytes — confirm rounding. */
    public function maxRequestBodyMb(): int
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** NOTE(review): presumably `$maxMultipartFieldBytes` expressed in megabytes — confirm rounding. */
    public function maxMultipartFieldMb(): int
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Result value object for structured data.
 *
 * NOTE(review): upstream provides no field descriptions; the meaning of each
 * field should be confirmed against the native extension.
 */
final class StructuredDataResult
{
    public function __construct(
        /** @var string */
        public readonly string $content,
        /** @var string */
        public readonly string $format,
        /** @var array<string, string> */
        public readonly array $metadata,
        /** @var array<string> */
        public readonly array $textFields,
    ) {
    }
}
/**
 * DOCX application properties read from docProps/app.xml
 *
 * Holds Word-specific document statistics and metadata.
 */
final class DocxAppProperties
{
    public function __construct(
        /** @var ?string Name of the application (e.g., "Microsoft Office Word") */
        public readonly ?string $application = null,
        /** @var ?string Version of the application */
        public readonly ?string $appVersion = null,
        /** @var ?string Template filename */
        public readonly ?string $template = null,
        /** @var ?int Total editing time, in minutes */
        public readonly ?int $totalTime = null,
        /** @var ?int Page count */
        public readonly ?int $pages = null,
        /** @var ?int Word count */
        public readonly ?int $words = null,
        /** @var ?int Character count (excluding spaces) */
        public readonly ?int $characters = null,
        /** @var ?int Character count (including spaces) */
        public readonly ?int $charactersWithSpaces = null,
        /** @var ?int Line count */
        public readonly ?int $lines = null,
        /** @var ?int Paragraph count */
        public readonly ?int $paragraphs = null,
        /** @var ?string Company name */
        public readonly ?string $company = null,
        /** @var ?int Document security level */
        public readonly ?int $docSecurity = null,
        /** @var ?bool Scale crop flag */
        public readonly ?bool $scaleCrop = null,
        /** @var ?bool Links up to date flag */
        public readonly ?bool $linksUpToDate = null,
        /** @var ?bool Shared document flag */
        public readonly ?bool $sharedDoc = null,
        /** @var ?bool Hyperlinks changed flag */
        public readonly ?bool $hyperlinksChanged = null,
    ) {
    }
}
/**
 * XLSX application properties read from docProps/app.xml
 *
 * Holds Excel-specific document metadata.
 */
final class XlsxAppProperties
{
    public function __construct(
        /** @var array<string> Names of the worksheets */
        public readonly array $worksheetNames,
        /** @var ?string Name of the application (e.g., "Microsoft Excel") */
        public readonly ?string $application = null,
        /** @var ?string Version of the application */
        public readonly ?string $appVersion = null,
        /** @var ?int Document security level */
        public readonly ?int $docSecurity = null,
        /** @var ?bool Scale crop flag */
        public readonly ?bool $scaleCrop = null,
        /** @var ?bool Links up to date flag */
        public readonly ?bool $linksUpToDate = null,
        /** @var ?bool Shared document flag */
        public readonly ?bool $sharedDoc = null,
        /** @var ?bool Hyperlinks changed flag */
        public readonly ?bool $hyperlinksChanged = null,
        /** @var ?string Company name */
        public readonly ?string $company = null,
    ) {
    }
}
/**
 * PPTX application properties read from docProps/app.xml
 *
 * Holds PowerPoint-specific document metadata.
 */
final class PptxAppProperties
{
    public function __construct(
        /** @var array<string> Titles of the slides */
        public readonly array $slideTitles,
        /** @var ?string Name of the application (e.g., "Microsoft Office PowerPoint") */
        public readonly ?string $application = null,
        /** @var ?string Version of the application */
        public readonly ?string $appVersion = null,
        /** @var ?int Total editing time, in minutes */
        public readonly ?int $totalTime = null,
        /** @var ?string Company name */
        public readonly ?string $company = null,
        /** @var ?int Document security level */
        public readonly ?int $docSecurity = null,
        /** @var ?bool Scale crop flag */
        public readonly ?bool $scaleCrop = null,
        /** @var ?bool Links up to date flag */
        public readonly ?bool $linksUpToDate = null,
        /** @var ?bool Shared document flag */
        public readonly ?bool $sharedDoc = null,
        /** @var ?bool Hyperlinks changed flag */
        public readonly ?bool $hyperlinksChanged = null,
        /** @var ?int Slide count */
        public readonly ?int $slides = null,
        /** @var ?int Notes count */
        public readonly ?int $notes = null,
        /** @var ?int Hidden slide count */
        public readonly ?int $hiddenSlides = null,
        /** @var ?int Multimedia clip count */
        public readonly ?int $multimediaClips = null,
        /** @var ?string Presentation format (e.g., "Widescreen", "Standard") */
        public readonly ?string $presentationFormat = null,
    ) {
    }
}
/**
 * Dublin Core metadata read from docProps/core.xml
 *
 * Holds the standard metadata fields defined by the Dublin Core standard,
 * plus Office-specific extensions.
 */
final class CoreProperties
{
    public function __construct(
        /** @var ?string Title of the document */
        public readonly ?string $title = null,
        /** @var ?string Subject/topic of the document */
        public readonly ?string $subject = null,
        /** @var ?string Creator/author of the document */
        public readonly ?string $creator = null,
        /** @var ?string Keywords or tags */
        public readonly ?string $keywords = null,
        /** @var ?string Description/abstract of the document */
        public readonly ?string $description = null,
        /** @var ?string User who last modified the document */
        public readonly ?string $lastModifiedBy = null,
        /** @var ?string Revision number */
        public readonly ?string $revision = null,
        /** @var ?string Creation timestamp (ISO 8601) */
        public readonly ?string $created = null,
        /** @var ?string Last modification timestamp (ISO 8601) */
        public readonly ?string $modified = null,
        /** @var ?string Category of the document */
        public readonly ?string $category = null,
        /** @var ?string Content status (Draft, Final, etc.) */
        public readonly ?string $contentStatus = null,
        /** @var ?string Language of the document */
        public readonly ?string $language = null,
        /** @var ?string Unique identifier */
        public readonly ?string $identifier = null,
        /** @var ?string Version of the document */
        public readonly ?string $version = null,
        /** @var ?string Last print timestamp (ISO 8601) */
        public readonly ?string $lastPrinted = null,
    ) {
    }
}
/**
 * Security limits applied across extractors.
 *
 * All limits are deliberately conservative, to prevent DoS attacks while
 * still supporting legitimate documents.
 */
final class SecurityLimits
{
    public function __construct(
        /** @var int Maximum uncompressed size for archives (500 MB) */
        public readonly int $maxArchiveSize,
        /** @var int Maximum compression ratio before an archive is flagged as a potential bomb (100:1) */
        public readonly int $maxCompressionRatio,
        /** @var int Maximum number of files in an archive (10,000) */
        public readonly int $maxFilesInArchive,
        /** @var int Maximum nesting depth for structures (100) */
        public readonly int $maxNestingDepth,
        /**
         * Maximum length of any single XML entity / attribute / token (1 MiB).
         * This is a per-token cap, NOT a total cap — billion-laughs class
         * attacks, where a single entity expands to hundreds of MB, are caught
         * here, while normal long text content (a paragraph, a CDATA block) is
         * caught by `max_content_size` (`$maxContentSize`) instead.
         *
         * @var int
         */
        public readonly int $maxEntityLength,
        /** @var int Maximum string growth per document (100 MB) */
        public readonly int $maxContentSize,
        /** @var int Maximum iterations per operation */
        public readonly int $maxIterations,
        /** @var int Maximum XML depth (100 levels) */
        public readonly int $maxXmlDepth,
        /** @var int Maximum cells per table (100,000) */
        public readonly int $maxTableCells,
    ) {
    }

    /**
     * Default security limits. This stub body always throws; the real
     * implementation is provided by the native extension.
     */
    public static function default(): SecurityLimits
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Configuration for token reduction.
 *
 * NOTE(review): upstream provides no field descriptions; field semantics are
 * intentionally left undocumented here rather than guessed from the names.
 */
final class TokenReductionConfig
{
    public function __construct(
        /** @var ReductionLevel */
        public readonly ReductionLevel $level,
        /** @var bool */
        public readonly bool $preserveMarkdown,
        /** @var bool */
        public readonly bool $preserveCode,
        /** @var float */
        public readonly float $semanticThreshold,
        /** @var bool */
        public readonly bool $enableParallel,
        /** @var bool */
        public readonly bool $useSimd,
        /** @var array<string> */
        public readonly array $preservePatterns,
        /** @var bool */
        public readonly bool $enableSemanticClustering,
        /** @var ?string */
        public readonly ?string $languageHint = null,
        /** @var ?array<string, array<string>> */
        public readonly ?array $customStopwords = null,
        /** @var ?float */
        public readonly ?float $targetReduction = null,
    ) {
    }

    /**
     * Default token reduction configuration. This stub body always throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): TokenReductionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * One annotation extracted from a page of a PDF document.
 */
final class PdfAnnotation
{
    public function __construct(
        /** @var PdfAnnotationType Kind of annotation. */
        public readonly PdfAnnotationType $annotationType,
        /** @var int 1-indexed number of the page the annotation appears on. */
        public readonly int $pageNumber,
        /** @var ?string Annotation text, e.g. the comment body or a link URL; `null` by default. */
        public readonly ?string $content = null,
        /** @var ?BoundingBox Position of the annotation on the page; `null` by default. */
        public readonly ?BoundingBox $boundingBox = null,
    ) {}
}
/**
 * Full Djot document structure with its semantics preserved.
 *
 * The type is meant to carry everything Djot markup can express:
 * - block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - attributes (classes, IDs, key-value pairs)
 * - links, images and footnotes
 * - math expressions (inline and display)
 * - tables with their full structure
 *
 * Only available when the `djot` feature is enabled.
 */
final class DjotContent
{
    public function __construct(
        /** @var string Plain-text rendering, kept for backwards compatibility. */
        public readonly string $plainText,
        /** @var array<FormattedBlock> Structured block-level content. */
        public readonly array $blocks,
        /** @var Metadata Metadata read from the YAML frontmatter. */
        public readonly Metadata $metadata,
        /** @var array<Table> Tables extracted as structured data. */
        public readonly array $tables,
        /** @var array<DjotImage> Images extracted together with their metadata. */
        public readonly array $images,
        /** @var array<DjotLink> Links extracted together with their URLs. */
        public readonly array $links,
        /** @var array<Footnote> Footnote definitions. */
        public readonly array $footnotes,
        /**
         * Attributes mapped by element identifier (if present).
         *
         * NOTE(review): the declared element type is `string`; confirm how each
         * attribute set is encoded on the PHP side.
         *
         * @var array<string>
         */
        public readonly array $attributes,
    ) {}
}
/**
 * A block-level element of a Djot document.
 *
 * Covers structural elements such as headings, paragraphs, lists and code
 * blocks. The four trailing fields are nullable and default to `null`.
 */
final class FormattedBlock
{
    public function __construct(
        /** @var BlockType Kind of block element. */
        public readonly BlockType $blockType,
        /** @var array<InlineElement> Inline content contained in the block. */
        public readonly array $inlineContent,
        /** @var array<FormattedBlock> Nested blocks, used by containers (blockquotes, list items, divs). */
        public readonly array $children,
        /** @var ?int Heading level (1-6) for headings, or the nesting level for lists. */
        public readonly ?int $level = null,
        /** @var ?string Element attributes (classes, IDs, key-value pairs), exposed here as a string. */
        public readonly ?string $attributes = null,
        /** @var ?string Language identifier, for code blocks. */
        public readonly ?string $language = null,
        /** @var ?string Raw code content, for code blocks. */
        public readonly ?string $code = null,
    ) {}
}
/**
 * An inline element living inside a block.
 *
 * Models text with formatting, links, images and similar inline content.
 */
final class InlineElement
{
    public function __construct(
        /** @var InlineType Kind of inline element. */
        public readonly InlineType $elementType,
        /** @var string Text content. */
        public readonly string $content,
        /** @var ?string Element attributes; `null` by default. */
        public readonly ?string $attributes = null,
        /** @var ?array<string, string> Extra metadata (e.g. href for links, src/alt for images); `null` by default. */
        public readonly ?array $metadata = null,
    ) {}
}
/**
 * An image element of a Djot document.
 */
final class DjotImage
{
    public function __construct(
        /** @var string URL or path the image is loaded from. */
        public readonly string $src,
        /** @var string Alternative text. */
        public readonly string $alt,
        /** @var ?string Optional title; `null` by default. */
        public readonly ?string $title = null,
        /** @var ?string Element attributes; `null` by default. */
        public readonly ?string $attributes = null,
    ) {}
}
/**
 * A link element of a Djot document.
 */
final class DjotLink
{
    public function __construct(
        /** @var string Target URL of the link. */
        public readonly string $url,
        /** @var string Text content of the link. */
        public readonly string $text,
        /** @var ?string Optional title; `null` by default. */
        public readonly ?string $title = null,
        /** @var ?string Element attributes; `null` by default. */
        public readonly ?string $attributes = null,
    ) {}
}
/**
 * A footnote of a Djot document.
 */
final class Footnote
{
    public function __construct(
        /** @var string Label identifying the footnote. */
        public readonly string $label,
        /** @var array<FormattedBlock> Blocks that make up the footnote content. */
        public readonly array $content,
    ) {}
}
/**
 * Top-level structured document representation.
 *
 * Nodes are kept in one flat array and point at each other by index
 * (parent/child), which together forms a tree. Root-level nodes have a
 * `null` parent.
 *
 * The upstream documentation describes `body_roots()` and `furniture_roots()`
 * for iterating over top-level content by layer, and `validate()` for
 * checking after construction that all node indices are in bounds and that
 * parent-child relationships are bidirectionally consistent. None of those
 * three is declared in this stub; it declares only `finalizeNodeTypes()`,
 * `isEmpty()` and `default()`.
 */
final class DocumentStructure
{
    public function __construct(
        /** @var array<DocumentNode> All nodes in document/reading order. */
        public readonly array $nodes,
        /**
         * Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
         *
         * Populated during derivation from the internal document representation.
         * Empty when no relationships are detected.
         *
         * @var array<DocumentRelationship>
         */
        public readonly array $relationships,
        /**
         * Sorted, deduplicated list of node type names present in this document.
         *
         * Each value is the snake_case `node_type` tag of the corresponding
         * `NodeContent` variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
         *
         * Computed from `$nodes` by `finalizeNodeTypes()` (`finalize_node_types`
         * in the upstream documentation). Empty until that method is called;
         * internal construction paths call it at the end of derivation.
         *
         * @var array<string>
         */
        public readonly array $nodeTypes,
        /**
         * Origin format identifier (e.g. "docx", "pptx", "html", "pdf").
         *
         * Allows renderers to apply format-aware heuristics when converting
         * the document tree to output formats. `null` by default.
         *
         * @var ?string
         */
        public readonly ?string $sourceFormat = null,
    ) {}

    /**
     * Stub for the native method that, per the `$nodeTypes` documentation,
     * computes `$nodeTypes` from `$nodes`.
     *
     * The stub previously had an empty body, which made a call against the
     * stub look like a successful no-op. It now fails the same way as the
     * other stub methods of this class.
     *
     * @throws \RuntimeException Always, whenever this stub body is executed.
     */
    public function finalizeNodeTypes(): void
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub for the native emptiness check.
     *
     * @throws \RuntimeException Always, whenever this stub body is executed.
     */
    public function isEmpty(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub for the factory supplied by the native extension.
     *
     * @throws \RuntimeException Always, whenever this stub body is executed.
     */
    public static function default(): DocumentStructure
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * A resolved link from one node of the document tree to another.
 */
final class DocumentRelationship
{
    public function __construct(
        /** @var int Index of the source node, i.e. the node doing the referencing. */
        public readonly int $source,
        /** @var int Index of the target node, i.e. the node being referenced. */
        public readonly int $target,
        /** @var RelationshipKind Semantic kind of the relationship. */
        public readonly RelationshipKind $kind,
    ) {}
}
/**
 * One node of the document tree.
 *
 * A node carries a deterministic `$id`, typed `$content`, the optional
 * `$parent` / `$children` links that form the tree, and metadata such as
 * page number, bounding box and content layer.
 */
final class DocumentNode
{
    public function __construct(
        /** @var string Deterministic identifier (hash of content + position). */
        public readonly string $id,
        /** @var NodeContent Node content: a tagged enum holding type-specific data only. */
        public readonly NodeContent $content,
        /** @var array<int> Indices of the child nodes, in reading order. */
        public readonly array $children,
        /** @var ContentLayer Content layer classification. */
        public readonly ContentLayer $contentLayer,
        /**
         * Inline annotations (formatting, links) applying to this node's text content.
         *
         * Meaningful only for text-carrying nodes; containers have an empty list.
         *
         * @var array<TextAnnotation>
         */
        public readonly array $annotations,
        /** @var ?int Index of the parent node (`null` = root-level node). */
        public readonly ?int $parent = null,
        /** @var ?int 1-indexed page number on which this node starts. */
        public readonly ?int $page = null,
        /** @var ?int Page number on which this node ends (for multi-page tables/sections). */
        public readonly ?int $pageEnd = null,
        /** @var ?BoundingBox Bounding box in document coordinates. */
        public readonly ?BoundingBox $bbox = null,
        /**
         * Format-specific key-value attributes.
         *
         * An extensible bag for miscellaneous data that has no dedicated typed
         * field: CSS classes, LaTeX environment names, Excel cell formulas,
         * slide layout names, etc.
         *
         * @var ?array<string, string>
         */
        public readonly ?array $attributes = null,
    ) {}
}
/**
 * Table grid with cell-level metadata.
 *
 * Holds the row/column dimensions plus a flat list of cells, each carrying
 * its own position information.
 */
final class TableGrid
{
    public function __construct(
        /** @var int Row count of the table. */
        public readonly int $rows,
        /** @var int Column count of the table. */
        public readonly int $cols,
        /** @var array<GridCell> Every cell, in row-major order. */
        public readonly array $cells,
    ) {}
}
/**
 * A single cell of a table grid, with its position and span metadata.
 */
final class GridCell
{
    public function __construct(
        /** @var string Text content of the cell. */
        public readonly string $content,
        /** @var int Row position, zero-indexed. */
        public readonly int $row,
        /** @var int Column position, zero-indexed. */
        public readonly int $col,
        /** @var int How many rows the cell spans. */
        public readonly int $rowSpan,
        /** @var int How many columns the cell spans. */
        public readonly int $colSpan,
        /** @var bool True when the cell is a header cell. */
        public readonly bool $isHeader,
        /** @var ?BoundingBox Bounding box of the cell, if available; `null` by default. */
        public readonly ?BoundingBox $bbox = null,
    ) {}
}
/**
 * Inline text annotation: formatting or a link, addressed by byte range.
 *
 * The range is expressed as byte offsets into the text content of the node,
 * which pins down the formatted region precisely.
 */
final class TextAnnotation
{
    public function __construct(
        /** @var int Inclusive start byte offset within the node's text content. */
        public readonly int $start,
        /** @var int Exclusive end byte offset within the node's text content. */
        public readonly int $end,
        /** @var AnnotationKind Type of annotation. */
        public readonly AnnotationKind $kind,
    ) {}
}
/**
 * General-purpose extraction result of the core extraction API.
 *
 * This is the main result type returned by all extraction functions. The
 * first five constructor arguments are required; every later field is
 * nullable and defaults to `null`.
 */
final class ExtractionResult
{
    public function __construct(
        /** @var string */
        public readonly string $content,
        /** @var string */
        public readonly string $mimeType,
        /** @var Metadata */
        public readonly Metadata $metadata,
        /** @var array<Table> */
        public readonly array $tables,
        /**
         * Non-fatal warnings gathered while the processing pipeline ran.
         *
         * Records errors from optional pipeline features (embedding, chunking,
         * language detection, output formatting): they do not stop extraction
         * but may signal degraded results.
         * These used to be stored as individual keys in `metadata.additional`.
         *
         * @var array<ProcessingWarning>
         */
        public readonly array $processingWarnings,
        /**
         * Strategy that produced the returned text.
         *
         * Set when the extractor can reliably tell apart native text
         * extraction, OCR-only extraction, and mixed native/OCR output.
         *
         * @var ?ExtractionMethod
         */
        public readonly ?ExtractionMethod $extractionMethod = null,
        /** @var ?array<string> */
        public readonly ?array $detectedLanguages = null,
        /**
         * Text chunks, present when chunking is enabled.
         *
         * With a chunking configuration in place the content is split into
         * overlapping chunks for efficient processing. Every chunk holds its
         * text, optional embeddings (if enabled), and metadata about where it
         * sits in the document.
         *
         * @var ?array<Chunk>
         */
        public readonly ?array $chunks = null,
        /**
         * Images extracted from the document.
         *
         * With image extraction enabled via `ImageExtractionConfig`, this
         * field holds every image found in the document together with its raw
         * data and metadata. An image may carry a nested
         * `ExtractedImage::$ocrResult` if OCR was run on it.
         *
         * @var ?array<ExtractedImage>
         */
        public readonly ?array $images = null,
        /**
         * Per-page content, present when page extraction is enabled.
         *
         * With page extraction configured the document is split into per-page
         * content, and tables and images are mapped to their pages.
         *
         * @var ?array<PageContent>
         */
        public readonly ?array $pages = null,
        /**
         * Semantic elements, present when the element-based result format is enabled.
         *
         * When result_format is set to ElementBased, this field holds semantic
         * elements with type classification, unique identifiers, and metadata
         * for Unstructured-compatible element-based processing.
         *
         * @var ?array<Element>
         */
        public readonly ?array $elements = null,
        /**
         * Rich Djot content structure (when extracting Djot documents).
         *
         * When a Djot document is extracted with structured extraction
         * enabled, this field holds the full semantic structure, including:
         * - Block-level elements with nesting
         * - Inline formatting with attributes
         * - Links, images, footnotes
         * - Math expressions
         * - Complete attribute information
         *
         * `$content` still carries the plain text for backward compatibility.
         *
         * Always `null` for non-Djot documents.
         *
         * @var ?DjotContent
         */
        public readonly ?DjotContent $djotContent = null,
        /**
         * OCR elements with full spatial and confidence metadata.
         *
         * When OCR runs with element extraction enabled, this field holds the
         * structured representation of the detected text, including:
         * - Bounding geometry (rectangles or quadrilaterals)
         * - Confidence scores (detection and recognition)
         * - Rotation information
         * - Hierarchical relationships (Tesseract only)
         *
         * It keeps all the metadata that would otherwise be lost when
         * converting to plain text or markdown output formats.
         *
         * Only populated when `OcrElementConfig.include_elements` is true.
         *
         * @var ?array<OcrElement>
         */
        public readonly ?array $ocrElements = null,
        /**
         * Structured document tree (when document structure extraction is enabled).
         *
         * When `include_document_structure` is true in `ExtractionConfig`,
         * this field holds the full hierarchical representation of the
         * document, including:
         * - Heading-driven section nesting
         * - Table grids with cell-level metadata
         * - Content layer classification (body, header, footer, footnote)
         * - Inline text annotations (formatting, links)
         * - Bounding boxes and page numbers
         *
         * Independent of `result_format`: it can be combined with Unified or
         * ElementBased.
         *
         * @var ?DocumentStructure
         */
        public readonly ?DocumentStructure $document = null,
        /**
         * Keywords, present when keyword extraction is enabled.
         *
         * With keyword extraction (RAKE or YAKE) configured, this field holds
         * the extracted keywords with scores, algorithm info, and position
         * data. Previously stored in `metadata.additional["keywords"]`.
         *
         * @var ?array<Keyword>
         */
        public readonly ?array $extractedKeywords = null,
        /**
         * Document quality score produced by quality analysis.
         *
         * A value between 0.0 and 1.0 describing the overall text quality.
         * Previously stored in `metadata.additional["quality_score"]`.
         *
         * @var ?float
         */
        public readonly ?float $qualityScore = null,
        /**
         * PDF annotations extracted from the document.
         *
         * With annotation extraction enabled via
         * `PdfConfig::extract_annotations`, this field holds text notes,
         * highlights, links, stamps, and other annotations found in PDF
         * documents.
         *
         * @var ?array<PdfAnnotation>
         */
        public readonly ?array $annotations = null,
        /**
         * Nested extraction results for the contents of an archive.
         *
         * When an archive is extracted, each processable file inside it yields
         * its own full extraction result. `null` for non-archive formats.
         * Use `max_archive_depth` in config to control recursion depth.
         *
         * @var ?array<ArchiveEntry>
         */
        public readonly ?array $children = null,
        /**
         * URIs/links discovered during document extraction.
         *
         * Holds hyperlinks, image references, citations, email addresses, and
         * other URI-like references found in the document. Always extracted
         * when present in the source document.
         *
         * @var ?array<ExtractedUri>
         */
        public readonly ?array $uris = null,
        /**
         * Tracked changes embedded in the source document.
         *
         * Filled in by per-format extractors that understand change-tracking
         * metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
         * …). Every extractor leaves this `null` until its format-specific
         * implementation is added. Extractors that do fill it follow the
         * "accepted-changes" convention: inserted text is present in
         * `$content`, deleted text is absent, and the revision list is the
         * separate audit trail.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
        /**
         * Structured extraction output from LLM-based JSON schema extraction.
         *
         * When `structured_extraction` is configured in `ExtractionConfig`,
         * the extracted document content is sent to a VLM with the provided
         * JSON schema. The response is parsed and stored here as a JSON value
         * matching the schema; this stub exposes it as a nullable string.
         *
         * @var ?string
         */
        public readonly ?string $structuredOutput = null,
        /**
         * Code intelligence results from tree-sitter analysis.
         *
         * Set when source code files are extracted with the `tree-sitter`
         * feature. Holds metrics, structural analysis, imports/exports,
         * comments, docstrings, symbols, diagnostics, and optionally chunked
         * code segments.
         *
         * Stored as an opaque JSON value so that all language bindings (Go,
         * Java, C#, …) can deserialize it as a raw JSON object rather than a
         * typed struct. The underlying type is
         * `tree_sitter_language_pack::ProcessResult`.
         *
         * @var ?string
         */
        public readonly ?string $codeIntelligence = null,
        /**
         * LLM token usage and cost data for all LLM calls made during this extraction.
         *
         * One entry per LLM call. Several entries appear when VLM OCR,
         * structured extraction, or LLM embeddings run during the same
         * extraction.
         *
         * `null` when no LLM was used.
         *
         * @var ?array<LlmUsage>
         */
        public readonly ?array $llmUsage = null,
        /**
         * Pre-rendered content in the requested output format.
         *
         * Filled in during `derive_extraction_result`, before tree derivation
         * consumes element data. `apply_output_format` swaps this into
         * `$content` at the end of the pipeline, after post-processors have
         * operated on plain text.
         *
         * @var ?string
         */
        public readonly ?string $formattedContent = null,
        /**
         * Structured hOCR document for the OCR+layout pipeline.
         *
         * When tesseract produces hOCR output, the parsed `InternalDocument`
         * carries paragraph structure with bounding boxes and confidence
         * scores. The layout classification step enriches these elements
         * before final rendering.
         *
         * @var ?string
         */
        public readonly ?string $ocrInternalDocument = null,
    ) {}

    /**
     * Stub for the native factory that takes an `OcrExtractionResult` and
     * returns an `ExtractionResult`.
     *
     * @throws \RuntimeException Always, whenever this stub body is executed.
     */
    public static function fromOcr(OcrExtractionResult $ocr): ExtractionResult
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * One file pulled out of an archive.
 *
 * When an archive (ZIP, TAR, 7Z, GZIP) is extracted with recursive extraction
 * enabled, every processable file yields its own full `ExtractionResult`.
 */
final class ArchiveEntry
{
    public function __construct(
        /** @var string Path of the file relative to the archive (e.g. "folder/document.pdf"). */
        public readonly string $path,
        /** @var string MIME type detected for the file. */
        public readonly string $mimeType,
        /** @var ExtractionResult Complete extraction result of the file. */
        public readonly ExtractionResult $result,
    ) {}
}
/**
 * Non-fatal warning raised by a stage of the processing pipeline.
 *
 * Records an error from an optional feature: extraction is not prevented,
 * but the result may be degraded.
 */
final class ProcessingWarning
{
    public function __construct(
        /**
         * Pipeline stage or feature the warning originates from
         * (e.g., "embedding", "chunking", "language_detection", "output_format").
         *
         * @var string
         */
        public readonly string $source,
        /** @var string Human-readable explanation of what went wrong. */
        public readonly string $message,
    ) {}
}
/**
 * Token usage and cost figures for one LLM call made during extraction.
 *
 * Filled in when VLM OCR, structured extraction, or LLM-based embeddings are
 * used. One extraction can yield several entries when it makes several LLM
 * calls (e.g. VLM OCR + structured extraction).
 */
final class LlmUsage
{
    public function __construct(
        /** @var string Identifier of the LLM model (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514"). */
        public readonly string $model,
        /**
         * Pipeline stage that issued the LLM call
         * (e.g. "vlm_ocr", "structured_extraction", "embeddings").
         *
         * @var string
         */
        public readonly string $source,
        /** @var ?int Count of input/prompt tokens consumed. */
        public readonly ?int $inputTokens = null,
        /** @var ?int Count of output/completion tokens generated. */
        public readonly ?int $outputTokens = null,
        /** @var ?int Total token count (input + output). */
        public readonly ?int $totalTokens = null,
        /** @var ?float Estimated cost in USD, based on the provider's published pricing. */
        public readonly ?float $estimatedCost = null,
        /** @var ?string Reason the model stopped generating (e.g. "stop", "length", "content_filter"). */
        public readonly ?string $finishReason = null,
    ) {}
}
/**
 * A chunk of text with metadata and an optional embedding.
 *
 * Chunks exist when chunking is enabled in `ExtractionConfig`. A chunk holds
 * its text content, an optional embedding vector (if embedding generation is
 * configured), and metadata about its position in the document.
 */
final class Chunk
{
    public function __construct(
        /** @var string Text content of the chunk. */
        public readonly string $content,
        /**
         * Semantic structural classification of the chunk.
         *
         * Assigned by the heuristic classifier from content patterns and
         * heading context. Defaults to `ChunkType::Unknown` when no rule matches.
         *
         * @var ChunkType
         */
        public readonly ChunkType $chunkType,
        /** @var ChunkMetadata Position and properties of the chunk. */
        public readonly ChunkMetadata $metadata,
        /**
         * Optional embedding vector of the chunk.
         *
         * Only set when an `EmbeddingConfig` is supplied in the chunking
         * configuration. Its dimensionality depends on the chosen embedding
         * model. `null` by default.
         *
         * @var ?array<float>
         */
        public readonly ?array $embedding = null,
    ) {}
}
/**
 * Heading context of a chunk inside a Markdown document.
 *
 * Holds the chain of headings leading from the document root down to the
 * section the chunk belongs to.
 */
final class HeadingContext
{
    public function __construct(
        /**
         * Heading hierarchy from the document root to the chunk's section.
         * Index 0 is the outermost heading (h1); the last element is the most specific.
         *
         * @var array<HeadingLevel>
         */
        public readonly array $headings,
    ) {}
}
/**
 * One heading within a heading hierarchy.
 */
final class HeadingLevel
{
    public function __construct(
        /** @var int Depth of the heading (1 = h1, 2 = h2, etc.) */
        public readonly int $level,
        /** @var string Text content of the heading. */
        public readonly string $text,
    ) {}
}
/**
 * Describes where a chunk sits in the original document.
 */
final class ChunkMetadata
{
    public function __construct(
        /** @var int Byte offset in the original text where the chunk starts (UTF-8 valid boundary). */
        public readonly int $byteStart,
        /** @var int Byte offset in the original text where the chunk ends (UTF-8 valid boundary). */
        public readonly int $byteEnd,
        /** @var int Zero-based index of the chunk within the document. */
        public readonly int $chunkIndex,
        /** @var int Total number of chunks the document was split into. */
        public readonly int $totalChunks,
        /**
         * Indices into `ExtractionResult::$images` for images on pages covered by this chunk.
         *
         * Holds the zero-based index, within the top-level images collection,
         * of every image whose `ExtractedImage::$pageNumber` lies within
         * `[$firstPage, $lastPage]`.
         * Empty when image extraction is disabled or when the chunk spans no
         * pages with images.
         *
         * @var array<int>
         */
        public readonly array $imageIndices,
        /**
         * Number of tokens in the chunk (if available).
         *
         * Calculated by the embedding model's tokenizer when embeddings are enabled.
         *
         * @var ?int
         */
        public readonly ?int $tokenCount = null,
        /**
         * First page the chunk spans (1-indexed).
         *
         * Only set when page tracking is enabled in the extraction configuration.
         *
         * @var ?int
         */
        public readonly ?int $firstPage = null,
        /**
         * Last page the chunk spans (1-indexed; equal to `$firstPage` for single-page chunks).
         *
         * Only set when page tracking is enabled in the extraction configuration.
         *
         * @var ?int
         */
        public readonly ?int $lastPage = null,
        /**
         * Heading context, available with the Markdown chunker.
         *
         * Holds the heading hierarchy the chunk falls under.
         * Only set when `ChunkerType::Markdown` is used.
         *
         * @var ?HeadingContext
         */
        public readonly ?HeadingContext $headingContext = null,
    ) {}
}
/**
 * An image extracted from a document.
 *
 * Holds the raw image data, its metadata, and optionally a nested OCR result.
 * Shipping raw bytes keeps the type usable across languages: callers can
 * convert to PIL.Image (Python), Sharp (Node.js), or other formats as needed.
 */
final class ExtractedImage
{
    public function __construct(
        /**
         * Raw image data (PNG, JPEG, WebP, etc. bytes).
         * Upstream note: uses `bytes::Bytes` for cheap cloning of large buffers.
         *
         * @var string
         */
        public readonly string $data,
        /**
         * Image format (e.g., "jpeg", "png", "webp").
         * Upstream note: uses Cow<'static, str> to avoid allocation for static literals.
         *
         * @var string
         */
        public readonly string $format,
        /** @var int Zero-indexed position of the image within the document/page. */
        public readonly int $imageIndex,
        /** @var bool True when the image is a mask image. */
        public readonly bool $isMask,
        /** @var ?int 1-indexed page/slide number the image was found on. */
        public readonly ?int $pageNumber = null,
        /** @var ?int Width of the image in pixels. */
        public readonly ?int $width = null,
        /** @var ?int Height of the image in pixels. */
        public readonly ?int $height = null,
        /** @var ?string Colorspace information (e.g., "RGB", "CMYK", "Gray"). */
        public readonly ?string $colorspace = null,
        /** @var ?int Bits per color component (e.g., 8, 16). */
        public readonly ?int $bitsPerComponent = null,
        /** @var ?string Optional description of the image. */
        public readonly ?string $description = null,
        /**
         * Nested OCR extraction result (if the image was OCRed).
         *
         * When OCR is run on the image, its result is embedded here instead of
         * in a separate collection, which makes the relationship explicit.
         *
         * @var ?ExtractionResult
         */
        public readonly ?ExtractionResult $ocrResult = null,
        /**
         * Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
         * Only set for PDF-extracted images when the PDF extractor provides position data.
         *
         * @var ?BoundingBox
         */
        public readonly ?BoundingBox $boundingBox = null,
        /**
         * Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
         * Used to render image references when the binary data is not extracted.
         *
         * @var ?string
         */
        public readonly ?string $sourcePath = null,
        /**
         * Heuristic classification of what the image likely depicts.
         * `null` if classification was disabled or inconclusive.
         *
         * @var ?ImageKind
         */
        public readonly ?ImageKind $imageKind = null,
        /** @var ?float Confidence score for `$imageKind`, in the range 0.0 to 1.0. */
        public readonly ?float $kindConfidence = null,
        /**
         * Identifier shared by the images that together form one logical figure
         * (e.g. all raster tiles of one technical drawing). `null` for singletons.
         *
         * @var ?int
         */
        public readonly ?int $clusterId = null,
    ) {}
}
/**
 * Bounding box coordinates used to position an element.
 */
final class BoundingBox
{
    public function __construct(
        /** @var float x-coordinate of the left edge. */
        public readonly float $x0,
        /** @var float y-coordinate of the bottom edge. */
        public readonly float $y0,
        /** @var float x-coordinate of the right edge. */
        public readonly float $x1,
        /** @var float y-coordinate of the top edge. */
        public readonly float $y1,
    ) {}
}
/**
 * Metadata attached to a semantic element.
 */
final class ElementMetadata
{
    public function __construct(
        /** @var array<string, string> Additional custom metadata. */
        public readonly array $additional,
        /** @var ?int 1-indexed page number; `null` by default. */
        public readonly ?int $pageNumber = null,
        /** @var ?string Source filename or document name; `null` by default. */
        public readonly ?string $filename = null,
        /** @var ?BoundingBox Bounding box coordinates, if available. */
        public readonly ?BoundingBox $coordinates = null,
        /** @var ?int Position of the element within the element sequence. */
        public readonly ?int $elementIndex = null,
    ) {}
}
/**
 * A semantic element extracted from a document.
 *
 * Stands for one logical unit of content, with a semantic classification, a
 * unique identifier, and metadata that tracks its origin and position.
 */
final class Element
{
    public function __construct(
        /** @var string Unique identifier of the element. */
        public readonly string $elementId,
        /** @var ElementType Semantic type of the element. */
        public readonly ElementType $elementType,
        /** @var string Text content of the element. */
        public readonly string $text,
        /** @var ElementMetadata Metadata describing the element. */
        public readonly ElementMetadata $metadata,
    ) {}
}
/**
 * Representation of an Excel workbook.
 *
 * Holds every sheet of an Excel file (.xlsx, .xls, etc.) along with the
 * extracted content and metadata.
 */
final class ExcelWorkbook
{
    public function __construct(
        /** @var array<ExcelSheet> Every sheet of the workbook. */
        public readonly array $sheets,
        /** @var array<string, string> Workbook-level metadata (author, creation date, etc.). */
        public readonly array $metadata,
        /**
         * Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
         *
         * Set for legacy shared-workbook `.xlsx` files that contain the
         * `xl/revisions/` directory. Each `<header>` element maps to one
         * `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
         * (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
         * `anchor` and `delta` are `None`/empty for v1 (per-cell log parsing is a
         * follow-up). This field is `null` when
         * `xl/revisions/revisionHeaders.xml` is absent.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
    ) {}
}
/**
 * A single worksheet of an Excel workbook.
 *
 * Carries the sheet's content converted to Markdown format together with its
 * dimensional statistics.
 */
final class ExcelSheet
{
    public function __construct(
        /** @var string Name of the sheet as shown in Excel. */
        public readonly string $name,
        /** @var string Content of the sheet, converted to Markdown tables. */
        public readonly string $markdown,
        /** @var int Row count. */
        public readonly int $rowCount,
        /** @var int Column count. */
        public readonly int $colCount,
        /** @var int Total count of non-empty cells. */
        public readonly int $cellCount,
        /**
         * Pre-extracted table cells (2D array of cell values).
         * Filled in during markdown generation so the markdown need not be re-parsed.
         * `null` for empty sheets.
         *
         * @var ?array<array<string>>
         */
        public readonly ?array $tableCells = null,
    ) {}
}
/**
 * Result of extracting an XML file.
 *
 * Holds the text content extracted from the XML along with structural
 * statistics about the document.
 */
final class XmlExtractionResult
{
    public function __construct(
        /** @var string Extracted text content, with the XML structure filtered out. */
        public readonly string $content,
        /** @var int Total count of XML elements processed. */
        public readonly int $elementCount,
        /** @var array<string> Unique element names found, sorted. */
        public readonly array $uniqueElements,
    ) {}
}
/**
 * Result of extracting plain text or Markdown.
 *
 * Holds the extracted text with its statistics and, for Markdown files,
 * structural elements such as headers and links.
 */
final class TextExtractionResult
{
    public function __construct(
        /** @var string Extracted text content. */
        public readonly string $content,
        /** @var int Line count. */
        public readonly int $lineCount,
        /** @var int Word count. */
        public readonly int $wordCount,
        /** @var int Character count. */
        public readonly int $characterCount,
        /** @var ?array<string> Markdown headers, text only (Markdown files only). */
        public readonly ?array $headers = null,
        /** @var ?array<array<string>> Markdown links as (text, URL) tuples (Markdown files only). */
        public readonly ?array $links = null,
        /** @var ?array<array<string>> Code blocks as (language, code) tuples (Markdown files only). */
        public readonly ?array $codeBlocks = null,
    ) {}
}
/**
* PowerPoint (PPTX) extraction result.
*
* Contains extracted slide content, metadata, and embedded images/tables.
*
* Immutable value object: every property is `public readonly` and is
* assigned once through constructor promotion.
*/
final class PptxExtractionResult
{
public function __construct(
/** @var string Extracted text content from all slides */
public readonly string $content,
/** @var PptxMetadata Presentation metadata */
public readonly PptxMetadata $metadata,
/** @var int Total number of slides */
public readonly int $slideCount,
/** @var int Total number of embedded images */
public readonly int $imageCount,
/** @var int Total number of tables */
public readonly int $tableCount,
/** @var array<ExtractedImage> Extracted images from the presentation */
public readonly array $images,
/**
* Hyperlinks discovered in slides as (url, optional_label) pairs.
*
* NOTE(review): the declared element type is `string`, which does not
* match the "pairs" wording (other pair-valued fields in this file use
* `array<array<string>>`) — confirm the actual runtime shape.
*
* @var array<string>
*/
public readonly array $hyperlinks,
/**
* Office metadata extracted from docProps/core.xml and docProps/app.xml.
*
* Contains keys like "title", "author", "created_by", "subject", "keywords",
* "modified_by", "created_at", "modified_at", etc.
*
* @var array<string, string>
*/
public readonly array $officeMetadata,
/** @var ?PageStructure Slide structure with boundaries (when page tracking is enabled) */
public readonly ?PageStructure $pageStructure = null,
/** @var ?array<PageContent> Per-slide content (when page tracking is enabled) */
public readonly ?array $pageContents = null,
/** @var ?DocumentStructure Structured document representation */
public readonly ?DocumentStructure $document = null,
/**
* Slide comments as revisions.
*
* Each `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
* `DocumentRevision { kind: Comment }` with author (resolved from
* `ppt/commentAuthors.xml`), ISO-8601 timestamp, and
* `RevisionAnchor::Slide { index }` (Rust-side notation).
* Null when no comment XML parts exist.
*
* @var ?array<DocumentRevision>
*/
public readonly ?array $revisions = null
) { }
}
/**
* Email extraction result.
*
* Complete representation of an extracted email message (.eml or .msg)
* including headers, body content, and attachments.
*
* The recipient lists, content, attachments and metadata are required;
* the remaining header/body fields are nullable and default to null.
*/
final class EmailExtractionResult
{
public function __construct(
/** @var array<string> Primary recipient email addresses */
public readonly array $toEmails,
/** @var array<string> CC recipient email addresses */
public readonly array $ccEmails,
/** @var array<string> BCC recipient email addresses */
public readonly array $bccEmails,
/**
* Cleaned/processed text content.
*
* NOTE(review): upstream documents a `cleaned_text` alias for back-compat;
* no such alias is declared in this stub, so it presumably applies to
* serialized output only — confirm.
*
* @var string
*/
public readonly string $content,
/** @var array<EmailAttachment> List of email attachments */
public readonly array $attachments,
/** @var array<string, string> Additional email headers and metadata */
public readonly array $metadata,
/** @var ?string Email subject line */
public readonly ?string $subject = null,
/** @var ?string Sender email address */
public readonly ?string $fromEmail = null,
/** @var ?string Email date/timestamp */
public readonly ?string $date = null,
/** @var ?string Message-ID header value */
public readonly ?string $messageId = null,
/** @var ?string Plain text version of the email body */
public readonly ?string $plainText = null,
/** @var ?string HTML version of the email body */
public readonly ?string $htmlContent = null
) { }
}
/**
* Email attachment representation.
*
* Contains metadata and optionally the content of an email attachment.
* Only `$isImage` is required; every other property is nullable and
* defaults to null.
*/
final class EmailAttachment
{
public function __construct(
/** @var bool Whether this attachment is an image */
public readonly bool $isImage,
/** @var ?string Attachment name (from Content-Disposition header) */
public readonly ?string $name = null,
/** @var ?string Filename of the attachment */
public readonly ?string $filename = null,
/** @var ?string MIME type of the attachment */
public readonly ?string $mimeType = null,
/** @var ?int Size in bytes */
public readonly ?int $size = null,
/**
* Attachment data (if extracted); null by default.
*
* The upstream remark about `bytes::Bytes` (cheap cloning of large
* buffers) describes the Rust side; this stub exposes the value as a
* PHP string.
* NOTE(review): presumably raw binary bytes rather than encoded text — confirm.
*
* @var ?string
*/
public readonly ?string $data = null
) { }
}
/**
* OCR extraction result.
*
* Result of performing OCR on an image or scanned document,
* including recognized text and detected tables.
*
* Immutable value object: every property is `public readonly` and is
* assigned once through constructor promotion.
*/
final class OcrExtractionResult
{
public function __construct(
/** @var string Recognized text content */
public readonly string $content,
/** @var string Original MIME type of the processed image */
public readonly string $mimeType,
/** @var array<string, string> OCR processing metadata (confidence scores, language, etc.) */
public readonly array $metadata,
/** @var array<OcrTable> Tables detected and extracted via OCR */
public readonly array $tables,
/**
* Structured OCR elements with bounding boxes and confidence scores.
* Available when TSV output is requested or table detection is enabled;
* null by default.
*
* @var ?array<OcrElement>
*/
public readonly ?array $ocrElements = null,
/**
* Structured document produced from hOCR parsing.
* Carries paragraph structure, bounding boxes, and confidence scores
* that the flattened `content` string discards.
*
* NOTE(review): declared as a string although it is described as a
* structured document — presumably a serialized form; confirm the
* encoding before parsing it.
*
* @var ?string
*/
public readonly ?string $internalDocument = null
) { }
}
/**
* Table detected via OCR.
*
* Represents a table structure recognized during OCR processing.
* The bounding box is optional and defaults to null.
*/
final class OcrTable
{
public function __construct(
/** @var array<array<string>> Table cells as a 2D array (rows × columns) */
public readonly array $cells,
/** @var string Markdown representation of the table */
public readonly string $markdown,
/** @var int Page number where the table was found (1-indexed) */
public readonly int $pageNumber,
/** @var ?OcrTableBoundingBox Bounding box of the table in pixel coordinates (from OCR word positions). */
public readonly ?OcrTableBoundingBox $boundingBox = null
) { }
}
/**
* Bounding box for an OCR-detected table in pixel coordinates.
*
* Stored as four integer edges (left, top, right, bottom) rather than
* as an origin plus width/height.
*/
final class OcrTableBoundingBox
{
public function __construct(
/** @var int Left x-coordinate (pixels) */
public readonly int $left,
/** @var int Top y-coordinate (pixels) */
public readonly int $top,
/** @var int Right x-coordinate (pixels) */
public readonly int $right,
/** @var int Bottom y-coordinate (pixels) */
public readonly int $bottom
) { }
}
/**
* Image preprocessing configuration for OCR.
*
* These settings control how images are preprocessed before OCR to improve
* text recognition quality. Different preprocessing strategies work better
* for different document types.
*
* All seven constructor parameters are required; use `default()` to obtain
* an instance without supplying them.
*/
final class ImagePreprocessingConfig
{
public function __construct(
/** @var int Target DPI for the image (300 is standard, 600 for small text). */
public readonly int $targetDpi,
/** @var bool Auto-detect and correct image rotation. */
public readonly bool $autoRotate,
/** @var bool Correct skew (tilted images). */
public readonly bool $deskew,
/** @var bool Remove noise from the image. */
public readonly bool $denoise,
/** @var bool Enhance contrast for better text visibility. */
public readonly bool $contrastEnhance,
/** @var string Binarization method: "otsu", "sauvola", "adaptive". */
public readonly string $binarizationMethod,
/** @var bool Invert colors (white text on black → black on white). */
public readonly bool $invertColors
) { }
/**
* Build a configuration populated with default values.
*
* Stub only: this body always throws; the exception message states that
* the real implementation is provided by the native extension. The
* actual default values are not visible in this stub.
*
* @return ImagePreprocessingConfig
* @throws \RuntimeException Always, when this stub body is executed.
*/
public static function default(): ImagePreprocessingConfig
{ throw new \RuntimeException('Not implemented — provided by the native extension.'); }
}
/**
* Tesseract OCR configuration.
*
* Provides fine-grained control over Tesseract OCR engine parameters.
* Most users can use the defaults, but these settings allow optimization
* for specific document types (invoices, handwriting, etc.).
*
* Every constructor parameter except `$preprocessing` is required; use
* `default()` to obtain an instance without supplying them all.
*/
final class TesseractConfig
{
public function __construct(
/** @var string Language code (e.g., "eng", "deu", "fra") */
public readonly string $language,
/**
* Page Segmentation Mode (0-13).
*
* Common values:
* - 3: Fully automatic page segmentation (native default)
* - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
* - 11: Sparse text with no particular order
*
* @var int
*/
public readonly int $psm,
/** @var string Output format ("text" or "markdown") */
public readonly string $outputFormat,
/**
* OCR Engine Mode (0-3).
*
* - 0: Legacy engine only
* - 1: Neural nets (LSTM) only (usually best)
* - 2: Legacy + LSTM
* - 3: Default (based on what's available)
*
* @var int
*/
public readonly int $oem,
/**
* Minimum confidence threshold (0.0-100.0).
*
* Words with confidence below this threshold may be rejected or flagged.
* Note the 0-100 scale here, unlike `$tableMinConfidence` (0.0-1.0).
*
* @var float
*/
public readonly float $minConfidence,
/** @var bool Enable automatic table detection and reconstruction */
public readonly bool $enableTableDetection,
/** @var float Minimum confidence threshold for table detection (0.0-1.0) */
public readonly float $tableMinConfidence,
/** @var int Column threshold for table detection (pixels) */
public readonly int $tableColumnThreshold,
/** @var float Row threshold ratio for table detection (0.0-1.0) */
public readonly float $tableRowThresholdRatio,
/** @var bool Enable OCR result caching */
public readonly bool $useCache,
/** @var bool Use pre-adapted templates for character classification */
public readonly bool $classifyUsePreAdaptedTemplates,
/** @var bool Enable N-gram language model */
public readonly bool $languageModelNgramOn,
/** @var bool Don't reject good words during block-level processing */
public readonly bool $tesseditDontBlkrejGoodWds,
/** @var bool Don't reject good words during row-level processing */
public readonly bool $tesseditDontRowrejGoodWds,
/** @var bool Enable dictionary correction */
public readonly bool $tesseditEnableDictCorrection,
/** @var string Whitelist of allowed characters (empty = all allowed) */
public readonly string $tesseditCharWhitelist,
/** @var string Blacklist of forbidden characters (empty = none forbidden) */
public readonly string $tesseditCharBlacklist,
/** @var bool Use primary language params model */
public readonly bool $tesseditUsePrimaryParamsModel,
/** @var bool Variable-width space detection */
public readonly bool $textordSpaceSizeIsVariable,
/** @var bool Use adaptive thresholding method (an on/off flag despite the name, not a method identifier) */
public readonly bool $thresholdingMethod,
/**
* Image preprocessing configuration.
*
* Controls how images are preprocessed before OCR. Can significantly
* improve quality for scanned documents or low-quality images.
* Optional; defaults to null.
*
* @var ?ImagePreprocessingConfig
*/
public readonly ?ImagePreprocessingConfig $preprocessing = null
) { }
/**
* Build a configuration populated with default values.
*
* Stub only: this body always throws; the exception message states that
* the real implementation is provided by the native extension. The
* actual default values are not visible in this stub.
*
* @return TesseractConfig
* @throws \RuntimeException Always, when this stub body is executed.
*/
public static function default(): TesseractConfig
{ throw new \RuntimeException('Not implemented — provided by the native extension.'); }
}
/**
* Image preprocessing metadata.
*
* Tracks the transformations applied to an image during OCR preprocessing,
* including DPI normalization, resizing, and resampling.
*
* The last three properties are nullable and default to null.
*/
final class ImagePreprocessingMetadata
{
public function __construct(
/** @var array<int> Original image dimensions as a (width, height) pair, in pixels */
public readonly array $originalDimensions,
/** @var array<float> Original image DPI as a (horizontal, vertical) pair */
public readonly array $originalDpi,
/** @var int Target DPI from configuration */
public readonly int $targetDpi,
/** @var float Scaling factor applied to the image */
public readonly float $scaleFactor,
/** @var bool Whether DPI was auto-adjusted based on content */
public readonly bool $autoAdjusted,
/** @var int Final DPI after processing */
public readonly int $finalDpi,
/** @var string Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.) */
public readonly string $resampleMethod,
/** @var bool Whether dimensions were clamped to max_image_dimension */
public readonly bool $dimensionClamped,
/** @var bool Whether resize was skipped (dimensions already optimal) */
public readonly bool $skippedResize,
/** @var ?array<int> New dimensions after resizing (if resized); NOTE(review): presumably (width, height) like `$originalDimensions` — confirm */
public readonly ?array $newDimensions = null,
/** @var ?int Calculated optimal DPI (if auto_adjust_dpi enabled) */
public readonly ?int $calculatedDpi = null,
/** @var ?string Error message if resize failed */
public readonly ?string $resizeError = null
) { }
}
/**
* Extraction result metadata.
*
* Contains common fields applicable to all formats, format-specific metadata
* via a discriminated union, and additional custom fields from postprocessors.
*
* Only `$ocrUsed` and `$additional` are required; every other property is
* nullable and defaults to null.
*/
final class Metadata
{
public function __construct(
/**
* Whether OCR was used during extraction.
*
* Set to `true` whenever the extraction pipeline ran an OCR backend
* (Tesseract, PaddleOCR, VLM, etc.) and used that output as the primary
* or fallback text. `false` means native text extraction was used exclusively.
*
* @var bool
*/
public readonly bool $ocrUsed,
/**
* Additional custom fields from postprocessors.
*
* Serialized as a nested `"additional"` object (not flattened at root level).
* (Upstream uses `Cow<'static, str>` keys to avoid allocation; that is a
* Rust implementation detail — in PHP the keys are plain strings.)
*
* @var array<string, string>
*/
public readonly array $additional,
/** @var ?string Document title */
public readonly ?string $title = null,
/** @var ?string Document subject or description */
public readonly ?string $subject = null,
/** @var ?array<string> Primary author(s) - always an array, even for a single author */
public readonly ?array $authors = null,
/** @var ?array<string> Keywords/tags - always an array, even for a single keyword */
public readonly ?array $keywords = null,
/** @var ?string Primary language (ISO 639 code) */
public readonly ?string $language = null,
/** @var ?string Creation timestamp (ISO 8601 format) */
public readonly ?string $createdAt = null,
/** @var ?string Last modification timestamp (ISO 8601 format) */
public readonly ?string $modifiedAt = null,
/** @var ?string User who created the document */
public readonly ?string $createdBy = null,
/** @var ?string User who last modified the document */
public readonly ?string $modifiedBy = null,
/** @var ?PageStructure Page/slide/sheet structure with boundaries */
public readonly ?PageStructure $pages = null,
/**
* Format-specific metadata (discriminated union)
*
* Contains detailed metadata specific to the document format.
* Serialized as a nested `"format"` object with a `format_type` discriminator field.
*
* @var ?FormatMetadata
*/
public readonly ?FormatMetadata $format = null,
/** @var ?ImagePreprocessingMetadata Image preprocessing metadata (when OCR preprocessing was applied) */
public readonly ?ImagePreprocessingMetadata $imagePreprocessing = null,
/** @var ?string JSON schema (for structured data extraction) */
public readonly ?string $jsonSchema = null,
/** @var ?ErrorMetadata Error metadata (for batch operations) */
public readonly ?ErrorMetadata $error = null,
/**
* Extraction duration in milliseconds (for benchmarking).
*
* This field is populated by batch extraction to provide per-file timing
* information. It is null for single-file extraction (which uses external timing).
*
* @var ?int
*/
public readonly ?int $extractionDurationMs = null,
/** @var ?string Document category (from frontmatter or classification). */
public readonly ?string $category = null,
/** @var ?array<string> Document tags (from frontmatter). */
public readonly ?array $tags = null,
/** @var ?string Document version string (from frontmatter). */
public readonly ?string $documentVersion = null,
/** @var ?string Abstract or summary text (from frontmatter). */
public readonly ?string $abstractText = null,
/**
* Output format identifier (e.g., "markdown", "html", "text").
*
* Set by the output format pipeline stage when format conversion is applied.
* Previously stored in `metadata.additional["output_format"]`.
*
* @var ?string
*/
public readonly ?string $outputFormat = null
) { }
/**
* Report whether this metadata object is empty.
*
* Stub only: this body always throws; the exception message states that
* the real implementation is provided by the native extension.
* NOTE(review): which fields count towards "empty" is not visible in this
* stub — confirm against the native implementation.
*
* @return bool
* @throws \RuntimeException Always, when this stub body is executed.
*/
public function isEmpty(): bool
{ throw new \RuntimeException('Not implemented — provided by the native extension.'); }
}
/**
* Excel/spreadsheet format metadata.
*
* Identifies the document as a spreadsheet source via the `FormatMetadata.Excel`
* discriminant. Sheet count and sheet names are stored inside this struct.
*
* Both properties are nullable and default to null, so the object can be
* constructed with no arguments.
*/
final class ExcelMetadata
{
public function __construct(
/** @var ?int Number of sheets in the workbook. */
public readonly ?int $sheetCount = null,
/** @var ?array<string> Names of all sheets in the workbook. */
public readonly ?array $sheetNames = null
) { }
}
/**
* Email metadata extracted from .eml and .msg files.
*
* Includes sender/recipient information, message ID, and attachment list.
* Unlike `EmailExtractionResult`, `$attachments` here holds filenames only,
* not `EmailAttachment` objects.
*/
final class EmailMetadata
{
public function __construct(
/** @var array<string> Primary recipients */
public readonly array $toEmails,
/** @var array<string> CC recipients */
public readonly array $ccEmails,
/** @var array<string> BCC recipients */
public readonly array $bccEmails,
/** @var array<string> List of attachment filenames */
public readonly array $attachments,
/** @var ?string Sender's email address */
public readonly ?string $fromEmail = null,
/** @var ?string Sender's display name */
public readonly ?string $fromName = null,
/** @var ?string Message-ID header value */
public readonly ?string $messageId = null
) { }
}
/**
* Archive (ZIP/TAR/7Z) metadata.
*
* Extracted from compressed archive files containing file lists and size information.
* Only `$compressedSize` is optional (nullable, defaults to null).
*/
final class ArchiveMetadata
{
public function __construct(
/** @var string Archive format ("ZIP", "TAR", "7Z", etc.) */
public readonly string $format,
/** @var int Total number of files in the archive */
public readonly int $fileCount,
/** @var array<string> List of file paths within the archive */
public readonly array $fileList,
/** @var int Total uncompressed size in bytes */
public readonly int $totalSize,
/** @var ?int Compressed size in bytes (if available) */
public readonly ?int $compressedSize = null
) { }
}
/**
* Image metadata extracted from image files.
*
* Includes dimensions, format, and EXIF data. All four properties are
* required. Not to be confused with `ImageMetadataType`, which describes
* an image element.
*/
final class ImageMetadata
{
public function __construct(
/** @var int Image width in pixels */
public readonly int $width,
/** @var int Image height in pixels */
public readonly int $height,
/** @var string Image format (e.g., "PNG", "JPEG", "TIFF") */
public readonly string $format,
/** @var array<string, string> EXIF metadata tags, keyed by tag name with string values */
public readonly array $exif
) { }
}
/**
* XML metadata extracted during XML parsing.
*
* Provides statistics about XML document structure. Carries the same two
* statistics as `XmlExtractionResult`, without the extracted text.
*/
final class XmlMetadata
{
public function __construct(
/** @var int Total number of XML elements processed */
public readonly int $elementCount,
/** @var array<string> Unique element tag names, in sorted order */
public readonly array $uniqueElements
) { }
}
/**
* Text/Markdown metadata.
*
* Extracted from plain text and Markdown files. Includes word counts and,
* for Markdown, structural elements like headers and links.
*
* The three Markdown-specific properties are nullable and default to null.
*/
final class TextMetadata
{
public function __construct(
/** @var int Number of lines in the document */
public readonly int $lineCount,
/** @var int Number of words */
public readonly int $wordCount,
/** @var int Number of characters */
public readonly int $characterCount,
/** @var ?array<string> Markdown headers (headings text only, for Markdown files) */
public readonly ?array $headers = null,
/** @var ?array<array<string>> Markdown links, each entry a (text, url) pair of strings (for Markdown files) */
public readonly ?array $links = null,
/** @var ?array<array<string>> Code blocks, each entry a (language, code) pair of strings (for Markdown files) */
public readonly ?array $codeBlocks = null
) { }
}
/**
* Header/heading element metadata.
*
* Describes a single heading (h1-h6) found in an HTML document; instances
* are collected in `HtmlMetadata::$headers`.
*/
final class HeaderMetadata
{
public function __construct(
/** @var int Header level: 1 (h1) through 6 (h6) */
public readonly int $level,
/** @var string Normalized text content of the header */
public readonly string $text,
/** @var int Document tree depth at the header element */
public readonly int $depth,
/** @var int Byte offset in original HTML document */
public readonly int $htmlOffset,
/** @var ?string HTML id attribute if present; null otherwise */
public readonly ?string $id = null
) { }
}
/**
* Link element metadata.
*
* Describes a single hyperlink; instances are collected in
* `HtmlMetadata::$links`.
*/
final class LinkMetadata
{
public function __construct(
/** @var string The href URL value */
public readonly string $href,
/** @var string Link text content (normalized) */
public readonly string $text,
/** @var LinkType Link type classification */
public readonly LinkType $linkType,
/** @var array<string> Rel attribute values */
public readonly array $rel,
/**
* Additional attributes as key-value pairs.
*
* NOTE(review): typed as a list of string arrays rather than a
* string-keyed map, so each entry is presumably a (name, value)
* pair — confirm.
*
* @var array<array<string>>
*/
public readonly array $attributes,
/** @var ?string Optional title attribute */
public readonly ?string $title = null
) { }
}
/**
* Image element metadata.
*
* Describes a single image element; instances are collected in
* `HtmlMetadata::$images`. Distinct from `ImageMetadata`, which describes
* an image file.
*/
final class ImageMetadataType
{
public function __construct(
/** @var string Image source (URL, data URI, or SVG content) */
public readonly string $src,
/** @var ImageType Image type classification */
public readonly ImageType $imageType,
/**
* Additional attributes as key-value pairs.
*
* NOTE(review): typed as a list of string arrays rather than a
* string-keyed map, so each entry is presumably a (name, value)
* pair — confirm.
*
* @var array<array<string>>
*/
public readonly array $attributes,
/** @var ?string Alternative text from alt attribute */
public readonly ?string $alt = null,
/** @var ?string Title attribute */
public readonly ?string $title = null,
/** @var ?array<int> Image dimensions as a (width, height) pair if available; null otherwise */
public readonly ?array $dimensions = null
) { }
}
/**
* Structured data (Schema.org, microdata, RDFa) block.
*
* The payload is kept as an undecoded JSON string in `$rawJson`; callers
* that need the contents must decode it themselves.
*/
final class StructuredData
{
public function __construct(
/** @var StructuredDataType Type of structured data */
public readonly StructuredDataType $dataType,
/** @var string Raw JSON string representation */
public readonly string $rawJson,
/** @var ?string Schema type if detectable (e.g., "Article", "Event", "Product"); null otherwise */
public readonly ?string $schemaType = null
) { }
}
/**
* HTML metadata extracted from HTML documents.
*
* Includes document-level metadata, Open Graph data, Twitter Card metadata,
* and extracted structural elements (headers, links, images, structured data).
*
* The collection-valued properties are required arrays; the scalar
* document-level fields are nullable and default to null.
*/
final class HtmlMetadata
{
public function __construct(
/** @var array<string> Document keywords from `<meta name="keywords">` tag, split on commas */
public readonly array $keywords,
/**
* Open Graph metadata (og:* properties) for social media.
* Keys like "title", "description", "image", "url", etc.
*
* @var array<string, string>
*/
public readonly array $openGraph,
/**
* Twitter Card metadata (twitter:* properties).
* Keys like "card", "site", "creator", "title", "description", "image", etc.
*
* @var array<string, string>
*/
public readonly array $twitterCard,
/**
* Additional meta tags not covered by specific fields.
* Keys are meta name/property attributes, values are content.
*
* @var array<string, string>
*/
public readonly array $metaTags,
/** @var array<HeaderMetadata> Extracted header elements with hierarchy */
public readonly array $headers,
/** @var array<LinkMetadata> Extracted hyperlinks with type classification */
public readonly array $links,
/** @var array<ImageMetadataType> Extracted images with source and dimensions */
public readonly array $images,
/** @var array<StructuredData> Extracted structured data blocks */
public readonly array $structuredData,
/** @var ?string Document title from `<title>` tag */
public readonly ?string $title = null,
/** @var ?string Document description from `<meta name="description">` tag */
public readonly ?string $description = null,
/** @var ?string Document author from `<meta name="author">` tag */
public readonly ?string $author = null,
/** @var ?string Canonical URL from `<link rel="canonical">` tag */
public readonly ?string $canonicalUrl = null,
/** @var ?string Base URL from `<base href="">` tag for resolving relative URLs */
public readonly ?string $baseHref = null,
/** @var ?string Document language from `lang` attribute */
public readonly ?string $language = null,
/** @var ?TextDirection Document text direction from `dir` attribute */
public readonly ?TextDirection $textDirection = null
) { }
}
/**
* OCR processing metadata.
*
* Captures information about OCR processing configuration and results.
*/
final class OcrMetadata
{
public function __construct(
/** @var string OCR language code(s) used */
public readonly string $language,
/** @var int Tesseract Page Segmentation Mode (PSM) */
public readonly int $psm,
/** @var string Output format (e.g., "text", "hocr") */
public readonly string $outputFormat,
/** @var int Number of tables detected */
public readonly int $tableCount,
/** @var ?int NOTE(review): undocumented upstream; presumably the row count of a detected table — confirm which table when several are detected */
public readonly ?int $tableRows = null,
/** @var ?int NOTE(review): undocumented upstream; presumably the column count of a detected table — confirm which table when several are detected */
public readonly ?int $tableCols = null
) { }
}
/**
* Error metadata (for batch operations).
*
* Referenced from `Metadata::$error`. Both fields are required strings.
*/
final class ErrorMetadata
{
public function __construct(
/** @var string NOTE(review): undocumented upstream; presumably an identifier for the kind of error — confirm the set of possible values */
public readonly string $errorType,
/** @var string NOTE(review): undocumented upstream; presumably the human-readable error message — confirm */
public readonly string $message
) { }
}
/**
* PowerPoint presentation metadata.
*
* Extracted from PPTX files containing slide counts and presentation details.
* Unlike `PptxExtractionResult`, the image and table counts here are
* nullable and default to null.
*/
final class PptxMetadata
{
public function __construct(
/** @var int Total number of slides in the presentation */
public readonly int $slideCount,
/** @var array<string> Names of slides (if available) */
public readonly array $slideNames,
/** @var ?int Number of embedded images */
public readonly ?int $imageCount = null,
/** @var ?int Number of tables */
public readonly ?int $tableCount = null
) { }
}
/**
* Word document metadata.
*
* Extracted from DOCX files using shared Office Open XML metadata extraction.
* Integrates with `office_metadata` module for core/app/custom properties.
*
* All three properties are nullable and default to null, so the object can
* be constructed with no arguments.
*/
final class DocxMetadata
{
public function __construct(
/**
* Core properties from docProps/core.xml (Dublin Core metadata)
*
* Contains title, creator, subject, keywords, dates, etc.
* Shared format across DOCX/PPTX/XLSX documents.
*
* @var ?CoreProperties
*/
public readonly ?CoreProperties $coreProperties = null,
/**
* Application properties from docProps/app.xml (Word-specific statistics)
*
* Contains word count, page count, paragraph count, editing time, etc.
* DOCX-specific variant of Office application properties.
*
* @var ?DocxAppProperties
*/
public readonly ?DocxAppProperties $appProperties = null,
/**
* Custom properties from docProps/custom.xml (user-defined properties)
*
* Contains key-value pairs defined by users or applications.
* Values can be strings, numbers, booleans, or dates.
*
* NOTE(review): this stub types every value as `string`, so non-string
* values are presumably delivered in a string representation — confirm.
*
* @var ?array<string, string>
*/
public readonly ?array $customProperties = null
) { }
}
/**
* CSV/TSV file metadata.
*
* The field descriptions below are not documented upstream and are
* inferred from the property names — treat them as assumptions.
*/
final class CsvMetadata
{
public function __construct(
/** @var int Presumably the number of rows — TODO confirm whether a header row is counted */
public readonly int $rowCount,
/** @var int Presumably the number of columns — TODO confirm */
public readonly int $columnCount,
/** @var bool Presumably whether the first row was treated as a header — TODO confirm */
public readonly bool $hasHeader,
/** @var ?string Presumably the field delimiter (e.g. "," or a tab) — TODO confirm */
public readonly ?string $delimiter = null,
/** @var ?array<string> Presumably one type name per column — TODO confirm the possible values */
public readonly ?array $columnTypes = null
) { }
}
/**
* BibTeX bibliography metadata.
*
* Apart from `$entryCount`, the field descriptions below are not
* documented upstream and are inferred from names and types — treat them
* as assumptions.
*/
final class BibtexMetadata
{
public function __construct(
/** @var int Number of entries in the bibliography. */
public readonly int $entryCount,
/** @var array<string> Presumably the citation keys of the entries — TODO confirm */
public readonly array $citationKeys,
/** @var array<string> Presumably author names collected across entries — TODO confirm */
public readonly array $authors,
/** @var ?YearRange Presumably the span of publication years — TODO confirm */
public readonly ?YearRange $yearRange = null,
/** @var ?array<string, int> Presumably a count of entries per entry type — TODO confirm */
public readonly ?array $entryTypes = null
) { }
}
/**
* Citation file metadata (RIS, PubMed, EndNote).
*
* The field descriptions below are not documented upstream and are
* inferred from the property names — treat them as assumptions.
*/
final class CitationMetadata
{
public function __construct(
/** @var int Presumably the number of citations in the file — TODO confirm */
public readonly int $citationCount,
/** @var array<string> Presumably author names collected across citations — TODO confirm */
public readonly array $authors,
/** @var array<string> Presumably DOIs found in the citations — TODO confirm */
public readonly array $dois,
/** @var array<string> Presumably keywords collected across citations — TODO confirm */
public readonly array $keywords,
/** @var ?string Presumably the citation file format (the class doc lists RIS, PubMed, EndNote) — TODO confirm exact values */
public readonly ?string $format = null,
/** @var ?YearRange Presumably the span of publication years — TODO confirm */
public readonly ?YearRange $yearRange = null
) { }
}
/**
* Year range for bibliographic metadata.
*
* Used by `BibtexMetadata::$yearRange` and `CitationMetadata::$yearRange`.
* `$years` is required; `$min` and `$max` are nullable and default to null.
*/
final class YearRange
{
public function __construct(
/** @var array<int> Presumably the individual years encountered — TODO confirm ordering and uniqueness */
public readonly array $years,
/** @var ?int Presumably the earliest year — TODO confirm */
public readonly ?int $min = null,
/** @var ?int Presumably the latest year — TODO confirm */
public readonly ?int $max = null
) { }
}
/**
* FictionBook (FB2) metadata.
*
* The field descriptions below are not documented upstream and are
* inferred from the property names — treat them as assumptions.
*/
final class FictionBookMetadata
{
public function __construct(
/** @var array<string> Presumably the book's genre identifiers — TODO confirm */
public readonly array $genres,
/** @var array<string> Presumably the series ("sequence") names the book belongs to — TODO confirm */
public readonly array $sequences,
/** @var ?string Presumably the book's annotation (summary) text — TODO confirm */
public readonly ?string $annotation = null
) { }
}
/**
* dBASE (DBF) file metadata.
*
* The count descriptions below are not documented upstream and are
* inferred from the property names — treat them as assumptions.
*/
final class DbfMetadata
{
public function __construct(
/** @var int Presumably the number of records in the file — TODO confirm */
public readonly int $recordCount,
/** @var int Presumably the number of fields per record — TODO confirm */
public readonly int $fieldCount,
/** @var array<DbfFieldInfo> Field descriptors, one `DbfFieldInfo` per entry */
public readonly array $fields
) { }
}
/**
* dBASE field information.
*
* Element type of `DbfMetadata::$fields`. Both properties are required
* strings.
*/
final class DbfFieldInfo
{
public function __construct(
/** @var string Presumably the field (column) name — TODO confirm */
public readonly string $name,
/** @var string Presumably the field's dBASE type — TODO confirm the representation (type letter vs. name) */
public readonly string $fieldType
) { }
}
/**
* JATS (Journal Article Tag Suite) metadata.
*
* The field descriptions below are not documented upstream and are
* inferred from names and types — treat them as assumptions.
*/
final class JatsMetadata
{
public function __construct(
/** @var array<string, string> Presumably history dates keyed by date type — TODO confirm key names and date format */
public readonly array $historyDates,
/** @var array<ContributorRole> Contributors, one `ContributorRole` per entry */
public readonly array $contributorRoles,
/** @var ?string Presumably the copyright statement — TODO confirm */
public readonly ?string $copyright = null,
/** @var ?string Presumably the license text or URL — TODO confirm */
public readonly ?string $license = null
) { }
}
/**
* JATS contributor with role.
*
* Element type of `JatsMetadata::$contributorRoles`.
*/
final class ContributorRole
{
public function __construct(
/** @var string Contributor name */
public readonly string $name,
/** @var ?string Contributor role; nullable, defaults to null */
public readonly ?string $role = null
) { }
}
/**
* EPUB metadata (Dublin Core extensions).
*
* All properties are nullable and default to null, so the object can be
* constructed with no arguments. The per-field descriptions are not
* documented upstream and are inferred from the property names — treat
* them as assumptions.
*/
final class EpubMetadata
{
public function __construct(
/** @var ?string Presumably the Dublin Core `coverage` element — TODO confirm */
public readonly ?string $coverage = null,
/** @var ?string Presumably the Dublin Core `format` element — TODO confirm */
public readonly ?string $dcFormat = null,
/** @var ?string Presumably the Dublin Core `relation` element — TODO confirm */
public readonly ?string $relation = null,
/** @var ?string Presumably the Dublin Core `source` element — TODO confirm */
public readonly ?string $source = null,
/** @var ?string Presumably the Dublin Core `type` element — TODO confirm */
public readonly ?string $dcType = null,
/** @var ?string Presumably a reference to the cover image — TODO confirm whether this is a path, an id, or data */
public readonly ?string $coverImage = null
) { }
}
/**
* Outlook PST archive metadata.
*
* Carries a single required integer field.
*/
final class PstMetadata
{
public function __construct(
/** @var int Presumably the number of messages in the archive — TODO confirm */
public readonly int $messageCount
) { }
}
/**
* Confidence scores for an OCR element.
*
* Separates detection confidence (how confident that text exists at this location)
* from recognition confidence (how confident about the actual text content).
*/
final class OcrConfidence
{
public function __construct(
/**
* Recognition confidence: how confident about the text content.
*
* Range: 0.0 to 1.0.
*
* @var float
*/
public readonly float $recognition,
/**
* Detection confidence: how confident the OCR engine is that text exists here.
*
* PaddleOCR provides this as `box_score`, Tesseract doesn't have a direct equivalent.
* Range: 0.0 to 1.0, or null if not available.
*
* @var ?float
*/
public readonly ?float $detection = null
) { }
}
/**
* Rotation information for an OCR element.
*
* Referenced from `OcrElement::$rotation`.
*/
final class OcrRotation
{
public function __construct(
/** @var float Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR). */
public readonly float $angleDegrees,
/** @var ?float Confidence score for the rotation detection; nullable, defaults to null. */
public readonly ?float $confidence = null
) { }
}
/**
* A unified OCR element representing detected text with full metadata.
*
* This is the primary type for structured OCR output, preserving all information
* from both Tesseract and PaddleOCR backends.
*
* Element type of `OcrExtractionResult::$ocrElements`.
*/
final class OcrElement
{
public function __construct(
/** @var string The recognized text content. */
public readonly string $text,
/** @var OcrBoundingGeometry Bounding geometry (rectangle or quadrilateral). */
public readonly OcrBoundingGeometry $geometry,
/** @var OcrConfidence Confidence scores for detection and recognition. */
public readonly OcrConfidence $confidence,
/** @var OcrElementLevel Hierarchical level (word, line, block, page). */
public readonly OcrElementLevel $level,
/** @var int Page number (1-indexed). */
public readonly int $pageNumber,
/** @var array<string, string> Backend-specific metadata that doesn't fit the unified schema. */
public readonly array $backendMetadata,
/** @var ?OcrRotation Rotation information (if detected); null otherwise. */
public readonly ?OcrRotation $rotation = null,
/**
* Parent element ID for hierarchical relationships.
*
* Only used for Tesseract output which has word -> line -> block hierarchy.
* Null by default.
*
* @var ?string
*/
public readonly ?string $parentId = null
) { }
}
/**
* Configuration for OCR element extraction.
*
* Controls how OCR elements are extracted and filtered.
* All four constructor parameters are required.
*/
final class OcrElementConfig
{
public function __construct(
/**
* Whether to include OCR elements in the extraction result.
*
* When true, the `ocr_elements` field in `ExtractionResult` will be populated
* (upstream field name; presumably exposed in PHP as `ocrElements` — confirm).
*
* @var bool
*/
public readonly bool $includeElements,
/**
* Minimum hierarchical level to include.
*
* Elements below this level (e.g., words when `$minLevel` is the line level) will be excluded.
*
* @var OcrElementLevel
*/
public readonly OcrElementLevel $minLevel,
/**
* Minimum recognition confidence threshold (0.0-1.0).
*
* Elements with confidence below this threshold will be filtered out.
*
* @var float
*/
public readonly float $minConfidence,
/**
* Whether to build hierarchical relationships between elements.
*
* When true, `OcrElement::$parentId` will be populated based on spatial containment.
* Only meaningful for Tesseract output.
*
* @var bool
*/
public readonly bool $buildHierarchy
) { }
}
/**
 * Page structure shared by every paginated document type.
 *
 * Covers PDF pages, PPTX slides and Excel sheets alike, and carries the
 * offset boundaries needed to map chunks back to pages.
 */
final class PageStructure
{
    public function __construct(
        /** @var int How many pages, slides or sheets the document has. */
        public readonly int $totalCount,
        /** @var PageUnitType Kind of paginated unit being counted. */
        public readonly PageUnitType $unitType,
        /**
         * Per-page offset boundaries.
         *
         * Associates ranges of the extracted content with page numbers; used
         * when computing the page range of a chunk.
         * NOTE(review): the upstream text calls these character offsets, while
         * `PageBoundary` documents byte offsets; confirm which unit applies.
         *
         * @var ?array<PageBoundary>
         */
        public readonly ?array $boundaries = null,
        /** @var ?array<PageInfo> Detailed per-page metadata; optional, present only when needed. */
        public readonly ?array $pages = null,
    ) {
    }
}
/**
 * Byte-offset range occupied by one page.
 *
 * Records where a page's content begins and ends inside the main content
 * string, so byte positions can be mapped to page numbers. The offsets are
 * guaranteed to fall on valid UTF-8 character boundaries when standard String
 * methods (push_str, push, etc.) are used.
 */
final class PageBoundary
{
    public function __construct(
        /** @var int Inclusive byte offset at which the page starts in the content string (valid UTF-8 boundary). */
        public readonly int $byteStart,
        /** @var int Exclusive byte offset at which the page ends in the content string (valid UTF-8 boundary). */
        public readonly int $byteEnd,
        /** @var int 1-indexed page number. */
        public readonly int $pageNumber,
    ) {
    }
}
/**
 * Metadata describing one page, slide or sheet.
 *
 * Holds per-page details such as dimensions, content counts and, for
 * presentations, the visibility state.
 */
final class PageInfo
{
    public function __construct(
        /** @var int 1-indexed page number. */
        public readonly int $number,
        /**
         * True when the page holds non-trivial vector graphics (paths, shapes, curves).
         *
         * Signals vector-drawn content such as charts, diagrams or geometric
         * shapes (e.g. produced by Adobe InDesign or LaTeX TikZ). Such content
         * does not show up in `ExtractionResult.images` because it is not
         * embedded as raster XObjects. The flag is set to `true` once the path
         * count exceeds a heuristic threshold, hinting that downstream consumers
         * may want to rasterize the page to capture it.
         *
         * The upstream description says this is only populated for PDFs and
         * absent for other document types.
         * NOTE(review): this stub declares a non-nullable bool, so confirm what
         * value non-PDF pages actually report.
         *
         * @var bool
         */
        public readonly bool $hasVectorGraphics,
        /** @var ?string Title of the page (usually set for presentations). */
        public readonly ?string $title = null,
        /** @var ?array<float> (width, height), in points for PDFs or pixels for images. */
        public readonly ?array $dimensions = null,
        /** @var ?int How many images the page contains. */
        public readonly ?int $imageCount = null,
        /** @var ?int How many tables the page contains. */
        public readonly ?int $tableCount = null,
        /** @var ?bool True when the page is hidden (e.g. in presentations). */
        public readonly ?bool $hidden = null,
        /**
         * True when the page is blank: no meaningful text, no images, no tables.
         *
         * Blank means fewer than 3 non-whitespace characters and neither tables
         * nor images. Handy for dropping empty pages from scanned documents or
         * PDFs that contain blank separator pages.
         *
         * @var ?bool
         */
        public readonly ?bool $isBlank = null,
    ) {
    }
}
/**
 * Extracted content of a single page or slide.
 *
 * With page extraction enabled, a document is split into per-page content,
 * and the tables and images belonging to each page are mapped onto it.
 *
 * # Performance
 *
 * The native side wraps tables and images in Arc for memory efficiency:
 * - `Vec<Arc<Table>>` allows table data to be shared without copying
 * - `Vec<Arc<ExtractedImage>>` allows image data to be shared without copying
 * - a custom Serialize/Deserialize keeps the JSON form exactly compatible
 *
 * For documents with shared tables/images this lowers memory overhead by
 * avoiding redundant copies during serialization.
 */
final class PageContent
{
    public function __construct(
        /** @var int 1-indexed page number. */
        public readonly int $pageNumber,
        /** @var string Text of this page. */
        public readonly string $content,
        /**
         * Tables located on this page (Arc-backed natively for memory efficiency).
         *
         * Serialized as Vec<Table> for JSON compatibility, while Arc semantics
         * are kept in memory for zero-copy sharing.
         *
         * @var array<Table>
         */
        public readonly array $tables,
        /**
         * Indices into `ExtractionResult.images` for the images on this page.
         *
         * Every value is a zero-based index into the top-level `images`
         * collection. Only populated when the extraction config has
         * `extract_images = true`.
         *
         * @var array<int>
         */
        public readonly array $imageIndices,
        /**
         * Hierarchy data for the page, available when hierarchy extraction is enabled.
         *
         * Holds the text hierarchy levels (H1-H6) extracted from the page content.
         *
         * @var ?PageHierarchy
         */
        public readonly ?PageHierarchy $hierarchy = null,
        /**
         * True when the page is blank (no meaningful text content).
         *
         * Decided during extraction by analysing the text content: a page is
         * blank when it has fewer than 3 non-whitespace characters and no
         * tables or images.
         *
         * @var ?bool
         */
        public readonly ?bool $isBlank = null,
        /**
         * Layout regions detected on this page, when layout detection is enabled.
         *
         * Each region carries a class, confidence, bounding box and area
         * fraction. Only populated when layout detection is configured.
         *
         * @var ?array<LayoutRegion>
         */
        public readonly ?array $layoutRegions = null,
        /**
         * Speaker notes of this slide (PPTX only).
         *
         * The text of the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
         * Only populated for PPTX sources that actually contain notes.
         *
         * @var ?string
         */
        public readonly ?string $speakerNotes = null,
        /**
         * Name of the section that contains this slide (PPTX only).
         *
         * PowerPoint sections group slides into logical chapters
         * (`<p:sectionLst>` in `ppt/presentation.xml`). Only populated for PPTX
         * sources when the slide belongs to a named section.
         *
         * @var ?string
         */
        public readonly ?string $sectionName = null,
        /**
         * Name of the sheet behind this page (XLSX/ODS only).
         *
         * Every spreadsheet sheet becomes one `PageContent` entry, and this field
         * holds the sheet's display name as shown in the workbook. It is null
         * for all non-spreadsheet formats and for sheets with an empty name.
         *
         * @var ?string
         */
        public readonly ?string $sheetName = null,
    ) {
    }
}
/**
 * One layout region detected on a page.
 *
 * With layout detection enabled, a page may carry layout regions that mark
 * different kinds of content (text, pictures, tables, etc.), each with a
 * confidence score and a spatial position.
 */
final class LayoutRegion
{
    public function __construct(
        /** @var string Name of the layout class, e.g. "picture", "table", "text", "section_header". */
        public readonly string $className,
        /** @var float Confidence reported by the layout detection model, from 0.0 to 1.0. */
        public readonly float $confidence,
        /** @var BoundingBox Bounding box, expressed in document coordinate space. */
        public readonly BoundingBox $boundingBox,
        /** @var float Share of the page area this region covers, from 0.0 to 1.0. */
        public readonly float $areaFraction,
    ) {
    }
}
/**
 * Heading levels and block information that make up a page's hierarchy.
 *
 * Produced when PDF text hierarchy extraction is enabled; the hierarchical
 * blocks carry heading levels (H1-H6) describing the semantic structure.
 */
final class PageHierarchy
{
    public function __construct(
        /** @var int How many hierarchy blocks the page has. */
        public readonly int $blockCount,
        /** @var array<HierarchicalBlock> The hierarchical blocks, each with a heading level. */
        public readonly array $blocks,
    ) {
    }
}
/**
 * Block of text that has been assigned a hierarchy level.
 *
 * The semantic heading information comes from font size clustering and
 * hierarchical analysis.
 */
final class HierarchicalBlock
{
    public function __construct(
        /** @var string Text contained in the block. */
        public readonly string $text,
        /** @var float Font size of the block's text. */
        public readonly float $fontSize,
        /**
         * Hierarchy level of the block: H1-H6 or Body.
         *
         * The levels mirror HTML heading tags:
         * - "h1": top-level heading
         * - "h2": secondary heading
         * - "h3": tertiary heading
         * - "h4": quaternary heading
         * - "h5": quinary heading
         * - "h6": senary heading
         * - "body": body text (no heading level)
         *
         * @var string
         */
        public readonly string $level,
        /**
         * Bounding box of the block.
         *
         * Coordinates are given as (left, top, right, bottom) in PDF units.
         *
         * @var ?array<float>
         */
        public readonly ?array $bbox = null,
    ) {
    }
}
/**
 * One changed cell inside a table.
 *
 * Upstream defines this type outside `crate.diff` so that `RevisionDelta` can
 * always reference it, even without the `diff` Cargo feature; `crate.diff`
 * re-exports it verbatim.
 */
final class CellChange
{
    public function __construct(
        /** @var int Row index, zero-based. */
        public readonly int $row,
        /** @var int Column index, zero-based. */
        public readonly int $col,
        /** @var string Cell value prior to the change. */
        public readonly string $from,
        /** @var string Cell value once the change is applied. */
        public readonly string $to,
    ) {
    }
}
/**
 * One tracked change stored inside a document.
 *
 * Filled in by per-format extractors that understand change-tracking metadata
 * (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, ...). Until a
 * format-specific implementation exists, every extractor leaves
 * `ExtractionResult.revisions` unset.
 */
final class DocumentRevision
{
    public function __construct(
        /**
         * Revision identifier, in a format-specific form.
         *
         * DOCX uses the `w:id` attribute value of the change element
         * (e.g. `"42"`). If that attribute is missing, a synthetic fallback is
         * generated (`"docx-ins-0"`, `"docx-del-3"`, ...).
         *
         * @var string
         */
        public readonly string $revisionId,
        /** @var RevisionKind Semantic kind of the revision. */
        public readonly RevisionKind $kind,
        /** @var RevisionDelta Content changes this revision consists of. */
        public readonly RevisionDelta $delta,
        /** @var ?string Display name of the change's author, if available. */
        public readonly ?string $author = null,
        /**
         * ISO-8601 timestamp of the change, if available.
         *
         * Kept as a plain string so the type stays FFI-friendly and available
         * without the optional `chrono` dependency. DOCX fills it from the
         * `w:date` attribute (e.g. `"2024-03-15T10:30:00Z"`).
         *
         * @var ?string
         */
        public readonly ?string $timestamp = null,
        /**
         * Best-effort location of the revision within the document.
         *
         * How it is resolved depends on the format. It may be null when the
         * location cannot be determined (e.g. changes inside table cells
         * before table-cell anchor support is added).
         *
         * @var ?RevisionAnchor
         */
        public readonly ?RevisionAnchor $anchor = null,
    ) {
    }
}
/**
 * Content changes belonging to a single revision.
 *
 * Insertions and deletions put the added/removed lines into `content` as
 * `DiffLine.Added` / `DiffLine.Removed` entries. Format changes leave
 * `content` empty; upstream marks the property diff as a TODO for a later
 * enrichment pass.
 */
final class RevisionDelta
{
    public function __construct(
        /** @var array<DiffLine> Changes at line level for this revision. */
        public readonly array $content,
        /** @var array<CellChange> Table changes at cell level for this revision. */
        public readonly array $tableChanges,
    ) {
    }
}
/**
 * Structure of an extracted table.
 *
 * Describes a table that was detected in and extracted from a document
 * (PDF, image, etc.). Each table is available both as structured cell data
 * and as Markdown.
 */
final class Table
{
    public function __construct(
        /** @var array<array<string>> Cells laid out as a 2D array (rows × columns). */
        public readonly array $cells,
        /** @var string The table rendered as Markdown. */
        public readonly string $markdown,
        /** @var int 1-indexed number of the page the table was found on. */
        public readonly int $pageNumber,
        /**
         * Table bounding box on the page, in PDF coordinates
         * (x0=left, y0=bottom, x1=right, y1=top).
         * Only set for PDF-extracted tables when position data is available.
         *
         * @var ?BoundingBox
         */
        public readonly ?BoundingBox $boundingBox = null,
    ) {
    }
}
/**
 * Single table cell: its content plus optional styling.
 *
 * Intended as a future extension point for rich tables with cell-level metadata.
 */
final class TableCell
{
    public function __construct(
        /** @var string Text content of the cell. */
        public readonly string $content,
        /** @var int Row span, i.e. how many rows the cell covers. */
        public readonly int $rowSpan,
        /** @var int Column span, i.e. how many columns the cell covers. */
        public readonly int $colSpan,
        /** @var bool True when the cell is a header cell. */
        public readonly bool $isHeader,
    ) {
    }
}
/**
 * URI pulled out of a document.
 *
 * Stands for any link, reference or resource pointer encountered during
 * extraction. `kind` gives the semantic classification and `label` holds
 * optional human-readable display text.
 */
final class ExtractedUri
{
    public function __construct(
        /** @var string URL or path string. */
        public readonly string $url,
        /** @var UriKind Semantic classification of this URI. */
        public readonly UriKind $kind,
        /** @var ?string Display text / label of the link, if any. */
        public readonly ?string $label = null,
        /** @var ?int 1-indexed page number where the URI was found, if known. */
        public readonly ?int $page = null,
    ) {
    }
}
/**
 * Response returned by MIME type detection.
 */
final class DetectResponse
{
    public function __construct(
        /** @var string MIME type that was detected. */
        public readonly string $mimeType,
        /** @var ?string Original filename, when one was provided. */
        public readonly ?string $filename = null,
    ) {
    }
}
/**
 * Options that govern the comparison of two `ExtractionResult` values.
 *
 * The defaults quoted on the fields below come from the upstream description.
 * This constructor declares no default for the two booleans, so they must be
 * passed explicitly here.
 */
final class DiffOptions
{
    public function __construct(
        /** @var bool Whether metadata changes are part of the diff. Default: `true`. */
        public readonly bool $includeMetadata,
        /** @var bool Whether embedded-children changes are part of the diff. Default: `true`. */
        public readonly bool $includeEmbedded,
        /**
         * Cut the content down to this many characters before diffing.
         *
         * Helps with very large documents where only the first N characters
         * matter. null means the content is not truncated.
         *
         * @var ?int
         */
        public readonly ?int $maxContentChars = null,
    ) {
    }

    /**
     * Stub only: in this file the method always throws; the real
     * implementation is supplied by the native extension.
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): DiffOptions
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Full diff between two `ExtractionResult` values.
 */
final class ExtractionDiff
{
    public function __construct(
        /**
         * Unified-diff hunks covering the `content` field.
         *
         * The list is empty when both contents are identical.
         *
         * @var array<DiffHunk>
         */
        public readonly array $contentDiff,
        /** @var array<Table> Tables found in `b` but missing from `a` (by index position, excess right-side tables). */
        public readonly array $tablesAdded,
        /** @var array<Table> Tables found in `a` but missing from `b` (by index position, excess left-side tables). */
        public readonly array $tablesRemoved,
        /** @var array<TableDiff> Cell-level changes for pairs of tables sharing the same index and dimensions. */
        public readonly array $tablesChanged,
        /**
         * Metadata difference, encoded as a JSON object with three top-level keys:
         * `added` (keys in `b` but not `a`), `removed` (keys in `a` but not
         * `b`) and `changed` (keys whose values differ; each entry is
         * `{ "from": <value-in-a>, "to": <value-in-b> }`).
         *
         * This is NOT RFC 6902 JSON Patch. Upstream deliberately picked a
         * flatter shape to avoid depending on a json-patch crate. For RFC 6902
         * semantics (with JSON Pointer paths), pass `a.metadata` and
         * `b.metadata` to a json-patch implementation of your choice.
         *
         * @var string
         */
        public readonly string $metadataChanged,
        /** @var EmbeddedChanges Changes affecting embedded archive children. */
        public readonly EmbeddedChanges $embeddedChanges,
    ) {
    }
}
/**
 * One contiguous hunk of a unified diff.
 */
final class DiffHunk
{
    public function __construct(
        /** @var int 0-indexed line number where the hunk starts in the old content. */
        public readonly int $fromLine,
        /** @var int How many lines of the old content the hunk covers. */
        public readonly int $fromCount,
        /** @var int 0-indexed line number where the hunk starts in the new content. */
        public readonly int $toLine,
        /** @var int How many lines of the new content the hunk covers. */
        public readonly int $toCount,
        /** @var array<DiffLine> The lines making up the hunk. */
        public readonly array $lines,
    ) {
    }
}
/**
 * Cell-level changes between two tables that share the same index.
 */
final class TableDiff
{
    public function __construct(
        /** @var int Zero-based table index in both `a.tables` and `b.tables`. */
        public readonly int $fromIndex,
        /** @var int Zero-based index in `b.tables`; equals `from_index` for same-dimension tables. */
        public readonly int $toIndex,
        /** @var array<CellChange> Changes to individual cells of the table. */
        public readonly array $cellChanges,
    ) {
    }
}
/**
 * Differences in embedded archive children between two results.
 */
final class EmbeddedChanges
{
    public function __construct(
        /** @var array<ArchiveEntry> Children that exist in `b` but not in `a` (matched by `path`). */
        public readonly array $added,
        /** @var array<ArchiveEntry> Children that exist in `a` but not in `b` (matched by `path`). */
        public readonly array $removed,
        /**
         * Children that exist on both sides but differ in content (matched by `path`).
         *
         * Every entry carries the diff of the nested `ExtractionResult`.
         *
         * @var array<EmbeddedDiff>
         */
        public readonly array $changed,
    ) {
    }
}
/**
 * Diff of one embedded archive entry that is present in both results.
 */
final class EmbeddedDiff
{
    public function __construct(
        /** @var string Archive-relative path that identifies the entry. */
        public readonly string $path,
        /** @var ExtractionDiff Recursive diff of the entry's extraction result. */
        public readonly ExtractionDiff $diff,
    ) {
    }
}
/**
 * Ready-made configuration for a common RAG use case.
 *
 * A preset bundles chunk size, overlap and embedding model into a
 * configuration tuned for a particular scenario.
 *
 * Upstream keeps every string field as an owned `String` for FFI
 * compatibility, so instances can safely be cloned and handed across
 * language boundaries.
 */
final class EmbeddingPreset
{
    public function __construct(
        /** @var string Presumably the preset's name; not described upstream. */
        public readonly string $name,
        /** @var int Chunk size of the preset (unit not stated in this stub). */
        public readonly int $chunkSize,
        /** @var int Overlap of the preset (unit not stated in this stub). */
        public readonly int $overlap,
        /** @var string Name of the model's HuggingFace repository. */
        public readonly string $modelRepo,
        /** @var string Pooling strategy, either "cls" or "mean". */
        public readonly string $pooling,
        /** @var string Location of the ONNX model file inside the repo. */
        public readonly string $modelFile,
        /** @var int Presumably the embedding dimensionality; TODO confirm. */
        public readonly int $dimensions,
        /** @var string Presumably a free-text description of the preset; TODO confirm. */
        public readonly string $description,
    ) {
    }
}
/**
 * Parameters that apply only to YAKE.
 */
final class YakeParams
{
    public function __construct(
        /**
         * Size of the co-occurrence analysis window (default: 2).
         *
         * Sets the context window used when computing co-occurrence statistics.
         *
         * @var int
         */
        public readonly int $windowSize,
    ) {
    }

    /**
     * Stub only: in this file the method always throws; the real
     * implementation is supplied by the native extension.
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): YakeParams
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Parameters that apply only to RAKE.
 */
final class RakeParams
{
    public function __construct(
        /** @var int Shortest word length that is considered (default: 1). */
        public readonly int $minWordLength,
        /** @var int Upper limit of words per keyword phrase (default: 3). */
        public readonly int $maxWordsPerPhrase,
    ) {
    }

    /**
     * Stub only: in this file the method always throws; the real
     * implementation is supplied by the native extension.
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): RakeParams
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Settings for keyword extraction.
 */
final class KeywordConfig
{
    public function __construct(
        /** @var KeywordAlgorithm Extraction algorithm to run. */
        public readonly KeywordAlgorithm $algorithm,
        /** @var int Upper limit on the number of extracted keywords (default: 10). */
        public readonly int $maxKeywords,
        /**
         * Lowest accepted score (0.0-1.0, default: 0.0).
         *
         * Keywords scoring below it are filtered out.
         * Note: score ranges are not the same across algorithms.
         *
         * @var float
         */
        public readonly float $minScore,
        /**
         * N-gram range used for extraction, as (min, max).
         *
         * (1, 1) = unigrams only
         * (1, 2) = unigrams and bigrams
         * (1, 3) = unigrams, bigrams, and trigrams (default)
         *
         * @var array<int>
         */
        public readonly array $ngramRange,
        /**
         * Language code used for stopword filtering (e.g. "en", "de", "fr").
         *
         * When null, stopwords are not filtered.
         *
         * @var ?string
         */
        public readonly ?string $language = null,
        /** @var ?YakeParams Tuning parameters specific to YAKE. */
        public readonly ?YakeParams $yakeParams = null,
        /** @var ?RakeParams Tuning parameters specific to RAKE. */
        public readonly ?RakeParams $rakeParams = null,
    ) {
    }

    /**
     * Stub only: in this file the method always throws; the real
     * implementation is supplied by the native extension.
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): KeywordConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * A keyword that was extracted, plus its metadata.
 */
final class Keyword
{
    public function __construct(
        /** @var string Text of the keyword. */
        public readonly string $text,
        /** @var float Relevance score; higher is better, and the range depends on the algorithm. */
        public readonly float $score,
        /** @var KeywordAlgorithm Algorithm responsible for extracting the keyword. */
        public readonly KeywordAlgorithm $algorithm,
        /** @var ?array<int> Character offsets at which the keyword occurs in the text, if provided. */
        public readonly ?array $positions = null,
    ) {
    }
}
/**
 * Settings for the PaddleOCR backend.
 *
 * Covers PaddleOCR text detection and recognition with multi-language
 * support. The `with*` methods offer a builder-style way to configure it.
 *
 * Every method body in this stub throws; the real implementations are
 * supplied by the native extension.
 */
final class PaddleOcrConfig
{
    public function __construct(
        /** @var string Code of the language, e.g. "en", "ch", "jpn", "kor", "deu", "fra". */
        public readonly string $language,
        /**
         * Turn on angle classification for rotated text (default: false).
         * It can misfire on short text regions and rotate crops incorrectly
         * before recognition.
         *
         * @var bool
         */
        public readonly bool $useAngleCls,
        /** @var bool Turn on table structure detection (default: false). */
        public readonly bool $enableTableDetection,
        /**
         * Database threshold applied during text detection (default: 0.3).
         * Range 0.0-1.0; higher values demand more confident detections.
         *
         * @var float
         */
        public readonly float $detDbThresh,
        /**
         * Box threshold used to refine text bounding boxes (default: 0.5).
         * Range 0.0-1.0.
         *
         * @var float
         */
        public readonly float $detDbBoxThresh,
        /**
         * Unclip ratio by which text bounding boxes are expanded (default: 1.6).
         * Governs how far detected text regions grow.
         *
         * @var float
         */
        public readonly float $detDbUnclipRatio,
        /**
         * Longest allowed side of the detection image (default: 960).
         * Larger images may be resized to this limit for faster inference.
         *
         * @var int
         */
        public readonly int $detLimitSideLen,
        /**
         * Batch size used for recognition inference (default: 6).
         * The number of text regions processed simultaneously.
         *
         * @var int
         */
        public readonly int $recBatchNum,
        /**
         * Pixels of padding added around the image before detection (default: 10).
         * Large values can pull in surrounding content such as table gridlines.
         *
         * @var int
         */
        public readonly int $padding,
        /**
         * Lowest accepted recognition confidence for text lines (default: 0.5).
         * Text regions whose recognition confidence is below it are discarded.
         * Corresponds to the `drop_score` parameter of PaddleOCR Python.
         * Range 0.0-1.0.
         *
         * @var float
         */
        public readonly float $dropScore,
        /**
         * Model tier; trades detection/recognition model size against accuracy.
         * - `"mobile"` (default): lightweight models (~4.5MB detection, ~16.5MB recognition), fast download and inference
         * - `"server"`: large, high-accuracy models (~88MB detection, ~84MB recognition), best for GPU or complex documents
         *
         * @var string
         */
        public readonly string $modelTier,
        /** @var ?string Custom cache directory for model files, if any. */
        public readonly ?string $cacheDir = null,
    ) {
    }

    public function withCacheDir(string $path): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    public function withTableDetection(bool $enable): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    public function withAngleCls(bool $enable): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    public function withDetDbThresh(float $threshold): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    public function withDetDbBoxThresh(float $threshold): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    public function withDetDbUnclipRatio(float $ratio): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    public function withDetLimitSideLen(int $length): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    // The snake_case parameter name is part of the public signature
    // (named arguments), so it is kept exactly as generated.
    public function withRecBatchNum(int $batch_size): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    public function withDropScore(float $score): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    public function withPadding(int $padding): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    public function withModelTier(string $tier): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    public static function default(): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
/**
 * Paths to every model OCR needs, bundled together (kept for backward compatibility).
 */
final class ModelPaths
{
    public function __construct(
        /** @var string Directory holding the detection model. */
        public readonly string $detModel,
        /** @var string Directory holding the classification model. */
        public readonly string $clsModel,
        /** @var string Directory holding the recognition model. */
        public readonly string $recModel,
        /** @var string Location of the character dictionary file. */
        public readonly string $dictFile,
    ) {
    }
}
/**
 * Outcome of document orientation detection.
 */
final class OrientationResult
{
    public function __construct(
        /** @var int Orientation that was detected, in degrees: 0, 90, 180 or 270. */
        public readonly int $degrees,
        /** @var float Confidence of the detection, 0.0-1.0. */
        public readonly float $confidence,
    ) {
    }
}
/**
 * Bounding box in original image coordinates: (x1, y1) is the top-left
 * corner and (x2, y2) the bottom-right corner.
 */
final class BBox
{
    public function __construct(
        /** @var float X coordinate of the top-left corner. */
        public readonly float $x1,
        /** @var float Y coordinate of the top-left corner. */
        public readonly float $y1,
        /** @var float X coordinate of the bottom-right corner. */
        public readonly float $x2,
        /** @var float Y coordinate of the bottom-right corner. */
        public readonly float $y2,
    ) {
    }
}
/**
 * One result produced by layout detection.
 */
final class LayoutDetection
{
    public function __construct(
        /** @var LayoutClass Layout class assigned to this detection. */
        public readonly LayoutClass $className,
        /** @var float Confidence of this detection (range not stated in this stub). */
        public readonly float $confidence,
        /** @var BBox Bounding box of this detection. */
        public readonly BBox $bbox,
    ) {
    }
}
/**
 * Markdown computed ahead of time for one table detection region.
 *
 * Generated by the TATR-based table structure recognizer and exposed through
 * layout-aware OCR results. Upstream places the struct under `layout-types`
 * (pure Rust) so consumers that do not enable `layout-detection` (ORT) can
 * still refer to the type in their own code.
 */
final class RecognizedTable
{
    public function __construct(
        /** @var BBox Bounding box of the detection this table belongs to (used for matching). */
        public readonly BBox $detectionBbox,
        /** @var array<array<string>> Cells laid out as a 2D array (rows × columns). */
        public readonly array $cells,
        /** @var string The table rendered as markdown. */
        public readonly string $markdown,
    ) {
    }
}
/**
 * Detection result for a whole page: every detection plus page metadata.
 */
final class DetectionResult
{
    public function __construct(
        /** @var int Width of the page (unit not stated in this stub). */
        public readonly int $pageWidth,
        /** @var int Height of the page (unit not stated in this stub). */
        public readonly int $pageHeight,
        /** @var array<LayoutDetection> All detections found on the page. */
        public readonly array $detections,
    ) {
    }
}
/**
 * Descriptor of an embedded file taken from the PDF name tree.
 */
final class EmbeddedFile
{
    public function __construct(
        /** @var string Filename exactly as the PDF name tree stores it. */
        public readonly string $name,
        /** @var string Raw bytes of the embedded stream, already decompressed by lopdf. */
        public readonly string $data,
        /**
         * Size in bytes of the original stream while still compressed.
         *
         * Lets callers compute the decompression ratio and spot zip-bomb-style
         * attacks, where a tiny compressed stream expands to gigabytes of data.
         *
         * @var int
         */
        public readonly int $compressedSize,
        /** @var ?string MIME type when the filespec specifies one, otherwise null. */
        public readonly ?string $mimeType = null,
    ) {
    }
}
/**
 * Metadata that only PDF documents have.
 *
 * Holds the PDF-specific fields absent from the common `Metadata` structure;
 * shared fields such as title, authors, keywords and dates live on
 * `Metadata` itself.
 */
final class PdfMetadata
{
    public function __construct(
        /** @var ?string Version of the PDF format, e.g. "1.7" or "2.0". */
        public readonly ?string $pdfVersion = null,
        /** @var ?string Producer, i.e. the application that created the PDF. */
        public readonly ?string $producer = null,
        /** @var ?bool True when the PDF is encrypted/password-protected. */
        public readonly ?bool $isEncrypted = null,
        /** @var ?int Width of the first page, in points (1/72 inch). */
        public readonly ?int $width = null,
        /** @var ?int Height of the first page, in points (1/72 inch). */
        public readonly ?int $height = null,
        /** @var ?int Number of pages in the whole PDF document. */
        public readonly ?int $pageCount = null,
    ) {
    }
}
/**
 * String-backed enum of execution providers.
 *
 * Each backing value is identical to its case name.
 */
enum ExecutionProviderType: string
{
    case Auto = 'Auto';
    case Cpu = 'Cpu';
    case CoreMl = 'CoreMl';
    case Cuda = 'Cuda';
    case TensorRt = 'TensorRt';
}
/**
 * String-backed enum of output formats.
 *
 * Each backing value is identical to its case name.
 */
enum OutputFormat: string
{
    case Plain = 'Plain';
    case Markdown = 'Markdown';
    case Djot = 'Djot';
    case Html = 'Html';
    case Json = 'Json';
    case Structured = 'Structured';
    case Custom = 'Custom';
}
/**
 * String-backed enum of HTML themes.
 *
 * Each backing value is identical to its case name.
 */
enum HtmlTheme: string
{
    case Default = 'Default';
    case GitHub = 'GitHub';
    case Dark = 'Dark';
    case Light = 'Light';
    case Unstyled = 'Unstyled';
}
/**
 * String-backed enum of table models.
 *
 * Backing values are lower snake_case identifiers, unlike the PascalCase
 * case names.
 */
enum TableModel: string
{
    case Tatr = 'tatr';
    case SlanetWired = 'slanet_wired';
    case SlanetWireless = 'slanet_wireless';
    case SlanetPlus = 'slanet_plus';
    case SlanetAuto = 'slanet_auto';
    case Disabled = 'disabled';
}
/**
 * String-backed enum of chunker types.
 *
 * Each backing value is identical to its case name.
 */
enum ChunkerType: string
{
    case Text = 'Text';
    case Markdown = 'Markdown';
    case Yaml = 'Yaml';
    case Semantic = 'Semantic';
}
/**
 * Unit in which chunk size is measured.
 *
 * The default is `Characters` (Unicode character count). With token-based
 * sizing, chunks are sized by token count according to the specified tokenizer.
 *
 * Token-based sizing relies on HuggingFace tokenizers loaded at runtime. Any
 * tokenizer published on HuggingFace Hub works, including OpenAI-compatible
 * ones (e.g. `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 *
 * This stub declares no members for the class.
 */
final class ChunkSizing
{
}
/**
 * Types of embedding model that Kreuzberg supports.
 *
 * This stub declares no members for the class.
 */
final class EmbeddingModelType
{
}
/**
 * Code content mode identifiers.
 *
 * String-backed; backing values are the lower-case form of the case names.
 */
enum CodeContentMode: string
{
    case Chunks = 'chunks';
    case Raw = 'raw';
    case Structure = 'structure';
}
/**
 * List type identifiers.
 *
 * String-backed; every backing value is identical to its case name.
 */
enum ListType: string
{
    case Bullet = 'Bullet';
    case Numbered = 'Numbered';
    case Lettered = 'Lettered';
    case Indented = 'Indented';
}
/**
 * OCR backend type identifiers.
 *
 * String-backed; every backing value is identical to its case name
 * (including the mixed-case `EasyOCR` and `PaddleOCR`).
 */
enum OcrBackendType: string
{
    case Tesseract = 'Tesseract';
    case EasyOCR = 'EasyOCR';
    case PaddleOCR = 'PaddleOCR';
    case Custom = 'Custom';
}
/**
 * Processing stage identifiers.
 *
 * String-backed; every backing value is identical to its case name.
 */
enum ProcessingStage: string
{
    case Early = 'Early';
    case Middle = 'Middle';
    case Late = 'Late';
}
/**
 * Reduction level identifiers.
 *
 * String-backed; every backing value is identical to its case name.
 */
enum ReductionLevel: string
{
    case Off = 'Off';
    case Light = 'Light';
    case Moderate = 'Moderate';
    case Aggressive = 'Aggressive';
    case Maximum = 'Maximum';
}
/**
 * PDF annotation type identifiers.
 *
 * String-backed; backing values are the lower snake_case form of the case names
 * (e.g. `StrikeOut` is `strike_out`).
 */
enum PdfAnnotationType: string
{
    case Text = 'text';
    case Highlight = 'highlight';
    case Link = 'link';
    case Stamp = 'stamp';
    case Underline = 'underline';
    case StrikeOut = 'strike_out';
    case Other = 'other';
}
/**
 * Block type identifiers.
 *
 * String-backed; backing values are the lower snake_case form of the case names.
 */
enum BlockType: string
{
    case Paragraph = 'paragraph';
    case Heading = 'heading';
    case Blockquote = 'blockquote';
    case CodeBlock = 'code_block';
    case ListItem = 'list_item';
    case OrderedList = 'ordered_list';
    case BulletList = 'bullet_list';
    case TaskList = 'task_list';
    case DefinitionList = 'definition_list';
    case DefinitionTerm = 'definition_term';
    case DefinitionDescription = 'definition_description';
    case Div = 'div';
    case Section = 'section';
    case ThematicBreak = 'thematic_break';
    case RawBlock = 'raw_block';
    case MathDisplay = 'math_display';
}
/**
 * Inline type identifiers.
 *
 * String-backed; backing values are the lower snake_case form of the case names.
 */
enum InlineType: string
{
    case Text = 'text';
    case Strong = 'strong';
    case Emphasis = 'emphasis';
    case Highlight = 'highlight';
    case Subscript = 'subscript';
    case Superscript = 'superscript';
    case Insert = 'insert';
    case Delete = 'delete';
    case Code = 'code';
    case Link = 'link';
    case Image = 'image';
    case Span = 'span';
    case Math = 'math';
    case RawInline = 'raw_inline';
    case FootnoteRef = 'footnote_ref';
    case Symbol = 'symbol';
}
/**
 * Relationship kind identifiers.
 *
 * String-backed; backing values are the lower snake_case form of the case names.
 */
enum RelationshipKind: string
{
    case FootnoteReference = 'footnote_reference';
    case CitationReference = 'citation_reference';
    case InternalLink = 'internal_link';
    case Caption = 'caption';
    case Label = 'label';
    case TocEntry = 'toc_entry';
    case CrossReference = 'cross_reference';
}
/**
 * Content layer identifiers.
 *
 * String-backed; backing values are the lower-case form of the case names.
 */
enum ContentLayer: string
{
    case Body = 'body';
    case Header = 'header';
    case Footer = 'footer';
    case Footnote = 'footnote';
}
/**
 * Tagged enum for node content. Each variant carries only type-specific data.
 *
 * Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
 * Go/Java/TypeScript bindings.
 *
 * Stub note: this declaration has no members, so the variants and their data
 * are not visible to static analysis through this stub.
 * NOTE(review): presumably the compiled extension supplies them at runtime — confirm.
 */
final class NodeContent
{
}
/**
 * Types of inline text annotations.
 *
 * Stub note: this declaration has no members; the individual annotation kinds
 * are not enumerated in this stub.
 * NOTE(review): presumably the compiled extension supplies them at runtime — confirm.
 */
final class AnnotationKind
{
}
/**
 * Extraction method identifiers.
 *
 * String-backed; backing values are the lower-case form of the case names.
 */
enum ExtractionMethod: string
{
    case Native = 'native';
    case Ocr = 'ocr';
    case Mixed = 'mixed';
}
/**
 * Chunk type identifiers, including an `Unknown` case.
 *
 * String-backed; backing values are the lower snake_case form of the case names.
 */
enum ChunkType: string
{
    case Heading = 'heading';
    case PartyList = 'party_list';
    case Definitions = 'definitions';
    case OperativeClause = 'operative_clause';
    case SignatureBlock = 'signature_block';
    case Schedule = 'schedule';
    case TableLike = 'table_like';
    case Formula = 'formula';
    case CodeBlock = 'code_block';
    case Image = 'image';
    case OrgChart = 'org_chart';
    case Diagram = 'diagram';
    case Unknown = 'unknown';
}
/**
 * Image kind identifiers, including an `Unknown` case.
 *
 * String-backed; backing values are the lower snake_case form of the case names.
 */
enum ImageKind: string
{
    case Photograph = 'photograph';
    case Diagram = 'diagram';
    case Chart = 'chart';
    case Drawing = 'drawing';
    case TextBlock = 'text_block';
    case Decoration = 'decoration';
    case Logo = 'logo';
    case Icon = 'icon';
    case TileFragment = 'tile_fragment';
    case Mask = 'mask';
    case PageRaster = 'page_raster';
    case Unknown = 'unknown';
}
/**
 * Result format identifiers.
 *
 * String-backed; backing values are the lower snake_case form of the case names.
 */
enum ResultFormat: string
{
    case Unified = 'unified';
    case ElementBased = 'element_based';
}
/**
 * Element type identifiers.
 *
 * String-backed; backing values are the lower snake_case form of the case names
 * (e.g. `BlockQuote` is `block_quote`).
 */
enum ElementType: string
{
    case Title = 'title';
    case NarrativeText = 'narrative_text';
    case Heading = 'heading';
    case ListItem = 'list_item';
    case Table = 'table';
    case Image = 'image';
    case PageBreak = 'page_break';
    case CodeBlock = 'code_block';
    case BlockQuote = 'block_quote';
    case Footer = 'footer';
    case Header = 'header';
}
/**
 * Format-specific metadata (discriminated union).
 *
 * Only one format type can exist per extraction result. This provides
 * type-safe, clean metadata without nested optionals.
 *
 * Stub note: this declaration has no members, so the union's variants are not
 * visible to static analysis through this stub.
 * NOTE(review): presumably the compiled extension supplies them at runtime — confirm.
 */
final class FormatMetadata
{
}
/**
 * Text direction identifiers.
 *
 * String-backed; the backing values are the abbreviations `ltr`, `rtl` and `auto`,
 * not the case names.
 */
enum TextDirection: string
{
    case LeftToRight = 'ltr';
    case RightToLeft = 'rtl';
    case Auto = 'auto';
}
/**
 * Link type identifiers.
 *
 * String-backed; every backing value is identical to its case name.
 */
enum LinkType: string
{
    case Anchor = 'Anchor';
    case Internal = 'Internal';
    case External = 'External';
    case Email = 'Email';
    case Phone = 'Phone';
    case Other = 'Other';
}
/**
 * Image type identifiers.
 *
 * String-backed. The backing values are not uniformly cased: `DataUri` and
 * `InlineSvg` use kebab-case (`data-uri`, `inline-svg`), while `External` and
 * `Relative` use the case name itself. Pass these exact strings to
 * `from()` / `tryFrom()`.
 */
enum ImageType: string
{
    case DataUri = 'data-uri';
    case InlineSvg = 'inline-svg';
    case External = 'External';
    case Relative = 'Relative';
}
/**
 * Structured data type identifiers.
 *
 * String-backed. The backing values are not uniformly cased: `json-ld` and
 * `rdfa` are lower-case, while `Microdata` uses the case name itself. Pass
 * these exact strings to `from()` / `tryFrom()`.
 */
enum StructuredDataType: string
{
    case JsonLd = 'json-ld';
    case Microdata = 'Microdata';
    case RDFa = 'rdfa';
}
/**
 * Bounding geometry for an OCR element.
 *
 * Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
 * (from PaddleOCR and rotated text detection).
 *
 * Stub note: this declaration has no members, so neither geometry form is
 * visible to static analysis through this stub.
 * NOTE(review): presumably the compiled extension supplies them at runtime — confirm.
 */
final class OcrBoundingGeometry
{
}
/**
 * OCR element level identifiers.
 *
 * String-backed; backing values are the lower-case form of the case names.
 */
enum OcrElementLevel: string
{
    case Word = 'word';
    case Line = 'line';
    case Block = 'block';
    case Page = 'page';
}
/**
 * Page unit type identifiers.
 *
 * String-backed; backing values are the lower-case form of the case names.
 */
enum PageUnitType: string
{
    case Page = 'page';
    case Slide = 'slide';
    case Sheet = 'sheet';
}
/**
 * A single line in a unified-diff hunk.
 *
 * Defined here (rather than only in `crate.diff`) so `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate.diff` re-exports this type verbatim.
 *
 * Stub note: this declaration has no members, so the line's content is not
 * visible to static analysis through this stub.
 * NOTE(review): presumably the compiled extension supplies it at runtime — confirm.
 */
final class DiffLine
{
}
/**
 * Revision kind identifiers.
 *
 * String-backed; backing values are the lower snake_case form of the case names.
 */
enum RevisionKind: string
{
    case Insertion = 'insertion';
    case Deletion = 'deletion';
    case FormatChange = 'format_change';
    case Comment = 'comment';
}
/**
 * Best-effort document location for a revision.
 *
 * Stub note: this declaration has no members, so the location data is not
 * visible to static analysis through this stub.
 * NOTE(review): presumably the compiled extension supplies it at runtime — confirm.
 */
final class RevisionAnchor
{
}
/**
 * URI kind identifiers.
 *
 * String-backed; backing values are the lower-case form of the case names.
 */
enum UriKind: string
{
    case Hyperlink = 'hyperlink';
    case Image = 'image';
    case Anchor = 'anchor';
    case Citation = 'citation';
    case Reference = 'reference';
    case Email = 'email';
}
/**
 * Keyword algorithm identifiers.
 *
 * String-backed; every backing value is identical to its case name.
 */
enum KeywordAlgorithm: string
{
    case Yake = 'Yake';
    case Rake = 'Rake';
}
/**
 * PSM mode identifiers.
 *
 * String-backed; every backing value is identical to its case name.
 * NOTE(review): "PSM" presumably refers to Tesseract's page segmentation
 * modes — confirm against the extension's documentation.
 */
enum PSMMode: string
{
    case OsdOnly = 'OsdOnly';
    case AutoOsd = 'AutoOsd';
    case AutoOnly = 'AutoOnly';
    case Auto = 'Auto';
    case SingleColumn = 'SingleColumn';
    case SingleBlockVertical = 'SingleBlockVertical';
    case SingleBlock = 'SingleBlock';
    case SingleLine = 'SingleLine';
    case SingleWord = 'SingleWord';
    case CircleWord = 'CircleWord';
    case SingleChar = 'SingleChar';
}
/**
 * Paddle language identifiers.
 *
 * String-backed; every backing value is identical to its case name.
 */
enum PaddleLanguage: string
{
    case English = 'English';
    case Chinese = 'Chinese';
    case Japanese = 'Japanese';
    case Korean = 'Korean';
    case German = 'German';
    case French = 'French';
    case Latin = 'Latin';
    case Cyrillic = 'Cyrillic';
    case TraditionalChinese = 'TraditionalChinese';
    case Thai = 'Thai';
    case Greek = 'Greek';
    case EastSlavic = 'EastSlavic';
    case Arabic = 'Arabic';
    case Devanagari = 'Devanagari';
    case Tamil = 'Tamil';
    case Telugu = 'Telugu';
}
/**
 * Layout class identifiers.
 *
 * String-backed; backing values are the lower snake_case form of the case names.
 */
enum LayoutClass: string
{
    case Caption = 'caption';
    case Footnote = 'footnote';
    case Formula = 'formula';
    case ListItem = 'list_item';
    case PageFooter = 'page_footer';
    case PageHeader = 'page_header';
    case Picture = 'picture';
    case SectionHeader = 'section_header';
    case Table = 'table';
    case Text = 'text';
    case Title = 'title';
    case DocumentIndex = 'document_index';
    case Code = 'code';
    case CheckboxSelected = 'checkbox_selected';
    case CheckboxUnselected = 'checkbox_unselected';
    case Form = 'form';
    case KeyValueRegion = 'key_value_region';
}
/**
 * Static entry points of the Kreuzberg native extension, declared for static analysis.
 *
 * The bodies are placeholders: every method with a return value throws
 * \RuntimeException('Not implemented.') if this stub is ever executed, and every
 * `void` method has an empty body.
 *
 * Each method name is declared exactly once. PHP refuses to compile a class that
 * declares the same method twice ("Cannot redeclare ..."), so each `clear*()`
 * method appears only once, directly after its `list*()` counterpart.
 */
class KreuzbergApi
{
    // ---- Extraction -------------------------------------------------------

    public static function extractBytes(string $content, string $mime_type, \Kreuzberg\ExtractionConfig $config): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }

    public static function extractFile(string $path, ?string $mime_type = null, ?\Kreuzberg\ExtractionConfig $config = null): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }

    public static function extractFileSync(string $path, ?string $mime_type = null, ?\Kreuzberg\ExtractionConfig $config = null): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }

    public static function extractBytesSync(string $content, string $mime_type, \Kreuzberg\ExtractionConfig $config): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<\Kreuzberg\BatchFileItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractFilesSync(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<\Kreuzberg\BatchBytesItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractBytesSync(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<\Kreuzberg\BatchFileItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractFiles(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<\Kreuzberg\BatchBytesItem> $items
     * @param \Kreuzberg\ExtractionConfig $config
     * @return array<\Kreuzberg\ExtractionResult>
     */
    public static function batchExtractBytes(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    // ---- MIME helpers -----------------------------------------------------

    public static function detectMimeTypeFromBytes(string $content): string { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param string $mime_type
     * @return array<string>
     */
    public static function getExtensionsForMime(string $mime_type): array { throw new \RuntimeException('Not implemented.'); }

    // ---- Registry listing / clearing --------------------------------------

    public static function clearEmbeddingBackends(): void { }

    /**
     * @return array<string>
     */
    public static function listEmbeddingBackends(): array { throw new \RuntimeException('Not implemented.'); }

    /**
     * @return array<string>
     */
    public static function listDocumentExtractors(): array { throw new \RuntimeException('Not implemented.'); }

    public static function clearDocumentExtractors(): void { }

    /**
     * @return array<string>
     */
    public static function listOcrBackends(): array { throw new \RuntimeException('Not implemented.'); }

    public static function clearOcrBackends(): void { }

    /**
     * @return array<string>
     */
    public static function listPostProcessors(): array { throw new \RuntimeException('Not implemented.'); }

    public static function clearPostProcessors(): void { }

    /**
     * @return array<string>
     */
    public static function listRenderers(): array { throw new \RuntimeException('Not implemented.'); }

    public static function clearRenderers(): void { }

    /**
     * @return array<string>
     */
    public static function listValidators(): array { throw new \RuntimeException('Not implemented.'); }

    public static function clearValidators(): void { }

    // ---- Comparison, embeddings, rendering --------------------------------

    public static function compare(\Kreuzberg\ExtractionResult $a, \Kreuzberg\ExtractionResult $b, \Kreuzberg\DiffOptions $opts): \Kreuzberg\ExtractionDiff { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<string> $texts
     * @param \Kreuzberg\EmbeddingConfig $config
     * @return array<array<float>>
     */
    public static function embedTextsAsync(array $texts, \Kreuzberg\EmbeddingConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    public static function renderPdfPageToPng(string $pdf_bytes, int $page_index, ?int $dpi = null, ?string $password = null): string { throw new \RuntimeException('Not implemented.'); }

    public static function detectMimeType(string $path, bool $check_exists): string { throw new \RuntimeException('Not implemented.'); }

    /**
     * @param array<string> $texts
     * @param \Kreuzberg\EmbeddingConfig $config
     * @return array<array<float>>
     */
    public static function embedTexts(array $texts, \Kreuzberg\EmbeddingConfig $config): array { throw new \RuntimeException('Not implemented.'); }

    public static function getEmbeddingPreset(string $name): ?\Kreuzberg\EmbeddingPreset { throw new \RuntimeException('Not implemented.'); }

    /**
     * @return array<string>
     */
    public static function listEmbeddingPresets(): array { throw new \RuntimeException('Not implemented.'); }

    // ---- Plugin registration ----------------------------------------------
    // The matching clear*() methods are declared once, in the registry section above.

    public static function registerOcrBackend(\Kreuzberg\OcrBackend $backend): void { }

    public static function unregisterOcrBackend(string $name): void { }

    public static function registerPostProcessor(\Kreuzberg\PostProcessor $backend): void { }

    public static function unregisterPostProcessor(string $name): void { }

    public static function registerValidator(\Kreuzberg\Validator $backend): void { }

    public static function unregisterValidator(string $name): void { }

    public static function registerEmbeddingBackend(\Kreuzberg\EmbeddingBackend $backend): void { }

    public static function unregisterEmbeddingBackend(string $name): void { }

    public static function registerDocumentExtractor(\Kreuzberg\DocumentExtractor $backend): void { }

    public static function unregisterDocumentExtractor(string $name): void { }

    public static function registerRenderer(\Kreuzberg\Renderer $backend): void { }

    public static function unregisterRenderer(string $name): void { }
}
} // end namespace