5397 lines
199 KiB
PHP
5397 lines
199 KiB
PHP
|
|
<?php
|
|||
|
|
|
|||
|
|
// This file is auto-generated by alef — DO NOT EDIT.
|
|||
|
|
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
|||
|
|
// To regenerate: alef generate
|
|||
|
|
// To verify freshness: alef verify --exit-code
|
|||
|
|
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
|||
|
|
// Type stubs for the native PHP extension — declares classes
|
|||
|
|
// provided at runtime by the compiled Rust extension (.so/.dll).
|
|||
|
|
// Include this in phpstan.neon scanFiles for static analysis.
|
|||
|
|
|
|||
|
|
declare(strict_types=1);
|
|||
|
|
|
|||
|
|
namespace Kreuzberg {
|
|||
|
|
/**
 * Stub declaration of the exception type supplied by the native extension.
 *
 * Extends \RuntimeException, so the standard message/code/previous
 * constructor arguments are inherited unchanged.
 */
class KreuzbergException extends \RuntimeException
{
    /**
     * Stub for the error-code accessor.
     *
     * @return int Declared return type; this stub body never returns.
     *
     * @throws \RuntimeException Always, when the stub itself is executed.
     */
    public function getErrorCode(): int
    {
        throw new \RuntimeException('Not implemented.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Immutable snapshot of cache statistics.
 *
 * All five values are required constructor arguments and are exposed as
 * public readonly properties.
 */
final class CacheStats
{
    public function __construct(
        /** @var int */
        public readonly int $totalFiles,

        /** @var float */
        public readonly float $totalSizeMb,

        /** @var float */
        public readonly float $availableSpaceMb,

        /** @var float */
        public readonly float $oldestFileAgeDays,

        /** @var float */
        public readonly float $newestFileAgeDays,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Hardware acceleration settings for ONNX Runtime models.
 *
 * Selects the execution provider (CPU, CoreML, CUDA, TensorRT) used for
 * inference in layout detection and embedding generation.
 */
final class AccelerationConfig
{
    public function __construct(
        /**
         * Execution provider to use for ONNX inference.
         *
         * @var ExecutionProviderType
         */
        public readonly ExecutionProviderType $provider,

        /**
         * GPU device ID, used for CUDA/TensorRT. Ignored for CPU/CoreML/Auto.
         *
         * @var int
         */
        public readonly int $deviceId,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Content filtering settings shared by all extractors.
 *
 * Decides whether document "furniture" (headers, footers, page numbers,
 * watermarks, repeating text) is kept in or removed from extraction
 * results. It applies to every extractor (PDF, DOCX, RTF, ODT, HTML, etc.),
 * each with a format-specific implementation.
 *
 * When `ExtractionConfig::$contentFilter` is `null`, each extractor keeps
 * its current default behavior unchanged.
 *
 * The "documented default" values below come from the upstream field
 * documentation; this constructor itself requires every argument.
 */
final class ContentFilterConfig
{
    public function __construct(
        /**
         * Keep running headers in the extraction output.
         *
         * - PDF: disables top-margin furniture stripping and stops the layout
         *   model from treating `PageHeader`-classified regions as furniture.
         * - DOCX: includes document headers in the text output.
         * - RTF/ODT: headers are already included; `true` is a no-op.
         * - HTML/EPUB: keeps `<header>` element content.
         *
         * Documented default: `false` (headers are stripped or excluded).
         *
         * @var bool
         */
        public readonly bool $includeHeaders,

        /**
         * Keep running footers in the extraction output.
         *
         * - PDF: disables bottom-margin furniture stripping and stops the
         *   layout model from treating `PageFooter`-classified regions as
         *   furniture.
         * - DOCX: includes document footers in the text output.
         * - RTF/ODT: footers are already included; `true` is a no-op.
         * - HTML/EPUB: keeps `<footer>` element content.
         *
         * Documented default: `false` (footers are stripped or excluded).
         *
         * @var bool
         */
        public readonly bool $includeFooters,

        /**
         * Turn on the heuristic detector for text repeated across pages.
         *
         * When `true`, text that repeats verbatim across a supermajority of
         * pages is classified as furniture and stripped. Turn it off if brand
         * names or repeated headings are being removed by mistake.
         *
         * Note: with a layout-detection model active, the model may still
         * classify page-header / page-footer regions as furniture on a
         * per-page basis. To keep those regions, also set `$includeHeaders`,
         * `$includeFooters`, or both to `true` in addition to disabling this
         * flag.
         *
         * Primarily affects PDF extraction.
         *
         * Documented default: `true`.
         *
         * @var bool
         */
        public readonly bool $stripRepeatingText,

        /**
         * Keep watermark text in the extraction output.
         *
         * - PDF: keeps watermark artifacts and arXiv identifiers.
         * - Other formats: currently no effect.
         *
         * Documented default: `false` (watermarks are stripped).
         *
         * @var bool
         */
        public readonly bool $includeWatermarks,
    ) {
    }

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always, when the stub itself is executed;
     *                           the real implementation comes from the
     *                           native extension.
     */
    public static function default(): ContentFilterConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Settings for email extraction.
 */
final class EmailConfig
{
    public function __construct(
        /**
         * Windows codepage number applied when an MSG file carries no
         * codepage property. When `null`, windows-1252 is used.
         *
         * An unrecognized or invalid codepage number (including 0) silently
         * falls back to windows-1252, exactly as when the MSG file itself
         * names an unrecognized codepage. No error or warning is emitted, so
         * verify the output when supplying unusual values.
         *
         * Common values:
         * - 1250: Central European (Polish, Czech, Hungarian, etc.)
         * - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
         * - 1252: Western European (default)
         * - 1253: Greek
         * - 1254: Turkish
         * - 1255: Hebrew
         * - 1256: Arabic
         * - 932: Japanese (Shift-JIS)
         * - 936: Simplified Chinese (GBK)
         *
         * @var ?int
         */
        public readonly ?int $msgFallbackCodepage = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Top-level extraction settings.
 *
 * Bundles every option that drives the extraction process. Per the upstream
 * documentation, a configuration can be loaded from TOML, YAML, or JSON
 * files, or built programmatically.
 *
 * In this stub the nine leading constructor arguments are required and all
 * remaining ones default to `null`. Both methods throw here; their real
 * implementations are provided by the native extension.
 */
final class ExtractionConfig
{
    public function __construct(
        /**
         * Enable caching of extraction results.
         *
         * @var bool
         */
        public readonly bool $useCache,

        /**
         * Enable quality post-processing.
         *
         * @var bool
         */
        public readonly bool $enableQualityProcessing,

        /**
         * Force OCR even for searchable PDFs.
         *
         * @var bool
         */
        public readonly bool $forceOcr,

        /**
         * Turn OCR off entirely, even for images.
         *
         * When `true`, OCR is skipped for all document types. Images return
         * metadata only (dimensions, format, EXIF) with no text extraction,
         * and PDFs use native text extraction with no OCR fallback.
         *
         * Cannot be `true` at the same time as `$forceOcr`.
         *
         * *Added in v4.7.0.*
         *
         * @var bool
         */
        public readonly bool $disableOcr,

        /**
         * Result structure format.
         *
         * Chooses between the unified format (default), where all content is
         * in the `content` field, and the element-based format with semantic
         * elements (for Unstructured-compatible output).
         *
         * @var ResultFormat
         */
        public readonly ResultFormat $resultFormat,

        /**
         * Content text format (documented default: Plain).
         *
         * - `Plain`: raw extracted text (default)
         * - `Markdown`: Markdown formatted output
         * - `Djot`: Djot markup format (requires the djot feature)
         * - `Html`: HTML formatted output
         *
         * With a structured format selected, extraction results include
         * formatted output; the `formatted_content` field may be populated
         * when format conversion is applied.
         *
         * @var OutputFormat
         */
        public readonly OutputFormat $outputFormat,

        /**
         * Run layout detection on the non-OCR PDF markdown path.
         *
         * When `true` and `$layout` is set, layout regions inform heading,
         * table, list, and figure detection in the structure pipeline, which
         * would otherwise rely on font-clustering heuristics alone. This
         * significantly improves SF1 (structural F1) at the cost of inference
         * latency (~150-300ms/page CPU, ~20-50ms/page GPU).
         *
         * Documented default: `false`. Requires the `layout-detection`
         * feature.
         *
         * @var bool
         */
        public readonly bool $useLayoutForMarkdown,

        /**
         * Enable structured document tree output.
         *
         * When `true`, the `document` field on `ExtractionResult` is filled
         * with a hierarchical `DocumentStructure` containing heading-driven
         * section nesting, table grids, content layer classification, and
         * inline annotations.
         *
         * Independent of `$resultFormat`: it can be combined with Unified or
         * ElementBased.
         *
         * @var bool
         */
        public readonly bool $includeDocumentStructure,

        /**
         * Maximum recursion depth for archive extraction (documented
         * default: 3). Set to 0 to disable recursive extraction (legacy
         * behavior).
         *
         * @var int
         */
        public readonly int $maxArchiveDepth,

        /**
         * OCR configuration (`null` = OCR disabled).
         *
         * @var ?OcrConfig
         */
        public readonly ?OcrConfig $ocr = null,

        /**
         * Force OCR on specific pages only (1-indexed page numbers, each
         * must be >= 1).
         *
         * When set, only the listed pages are OCR'd regardless of text layer
         * quality; unlisted pages use native text extraction. Ignored when
         * `$forceOcr` is `true`. Applies to PDF documents only. Duplicate
         * page numbers are deduplicated automatically. An `$ocr` config is
         * recommended for backend/language selection; defaults are used if
         * it is absent.
         *
         * @var ?array<int>
         */
        public readonly ?array $forceOcrPages = null,

        /**
         * Text chunking configuration (`null` = chunking disabled).
         *
         * @var ?ChunkingConfig
         */
        public readonly ?ChunkingConfig $chunking = null,

        /**
         * Content filtering configuration (`null` = use extractor defaults).
         *
         * Decides whether document "furniture" (headers, footers,
         * watermarks, repeating text) is kept in or stripped from extraction
         * results. See `ContentFilterConfig` for per-field documentation.
         *
         * @var ?ContentFilterConfig
         */
        public readonly ?ContentFilterConfig $contentFilter = null,

        /**
         * Image extraction configuration (`null` = no image extraction).
         *
         * @var ?ImageExtractionConfig
         */
        public readonly ?ImageExtractionConfig $images = null,

        /**
         * PDF-specific options (`null` = use defaults).
         *
         * @var ?PdfConfig
         */
        public readonly ?PdfConfig $pdfOptions = null,

        /**
         * Token reduction configuration (`null` = no token reduction).
         *
         * @var ?TokenReductionOptions
         */
        public readonly ?TokenReductionOptions $tokenReduction = null,

        /**
         * Language detection configuration (`null` = no language detection).
         *
         * @var ?LanguageDetectionConfig
         */
        public readonly ?LanguageDetectionConfig $languageDetection = null,

        /**
         * Page extraction configuration (`null` = no page tracking).
         *
         * @var ?PageConfig
         */
        public readonly ?PageConfig $pages = null,

        /**
         * Keyword extraction configuration (`null` = no keyword extraction).
         *
         * @var ?KeywordConfig
         */
        public readonly ?KeywordConfig $keywords = null,

        /**
         * Post-processor configuration (`null` = use defaults).
         *
         * @var ?PostProcessorConfig
         */
        public readonly ?PostProcessorConfig $postprocessor = null,

        /**
         * HTML to Markdown conversion options (`null` = use defaults).
         *
         * Configures how HTML documents are converted to Markdown, including
         * heading styles, list formatting, code block styles, and
         * preprocessing options.
         *
         * NOTE(review): typed as a plain string in this stub; the expected
         * string encoding is not visible here — confirm against the native
         * extension.
         *
         * @var ?string
         */
        public readonly ?string $htmlOptions = null,

        /**
         * Styled HTML output configuration.
         *
         * When set together with an HTML `$outputFormat`, the extraction
         * pipeline uses `StyledHtmlRenderer`, which emits stable `kb-*` CSS
         * class hooks on every structural element and optionally embeds
         * theme CSS or user-supplied CSS in a `<style>` block.
         *
         * When `null`, the existing plain comrak-based HTML renderer is used.
         *
         * @var ?HtmlOutputConfig
         */
        public readonly ?HtmlOutputConfig $htmlOutput = null,

        /**
         * Default per-file timeout in seconds for batch extraction.
         *
         * When set, each file in a batch is canceled after this duration
         * unless overridden by `FileExtractionConfig::$timeoutSecs`.
         *
         * The upstream documentation gives a default of 60 seconds, meant to
         * stop pathological files (e.g. deeply nested archives, documents
         * with millions of cells) from running indefinitely and exhausting
         * caller resources, and says the unset value disables the timeout
         * for trusted input or long-running workloads.
         *
         * NOTE(review): this stub declares `null` as the PHP-side default,
         * which does not match the documented default of 60 — confirm what
         * the native constructor does when the argument is omitted.
         *
         * @var ?int
         */
        public readonly ?int $extractionTimeoutSecs = null,

        /**
         * Maximum concurrent extractions in batch operations
         * (`null` = (num_cpus × 1.5).ceil()).
         *
         * Limits parallelism to prevent resource exhaustion when processing
         * large batches.
         *
         * @var ?int
         */
        public readonly ?int $maxConcurrentExtractions = null,

        /**
         * Security limits for archive extraction.
         *
         * Controls maximum archive size, compression ratio, file count, and
         * other security thresholds to prevent decompression bomb attacks.
         * It also caps nesting depth, iteration count, entity / token
         * length, total content size, and table cell count for every
         * extraction path that ingests user-controlled bytes.
         *
         * When `null`, default limits are used.
         *
         * @var ?SecurityLimits
         */
        public readonly ?SecurityLimits $securityLimits = null,

        /**
         * Maximum uncompressed size in bytes of a single embedded file
         * before recursive extraction is attempted (documented default:
         * 50 MiB).
         *
         * Applies to embedded objects inside OOXML containers (DOCX, PPTX)
         * and to email attachments processed via recursive extraction. Files
         * over the limit are skipped with a `ProcessingWarning` instead of
         * being passed to the extraction pipeline, so one oversized embedded
         * object cannot consume unbounded memory or time.
         *
         * Per the upstream documentation, leaving this unset disables the
         * per-embedded-file cap and leaves
         * `security_limits.max_archive_size` as the only guard.
         *
         * @var ?int
         */
        public readonly ?int $maxEmbeddedFileBytes = null,

        /**
         * Layout detection configuration (`null` = layout detection
         * disabled).
         *
         * When set, PDF pages and images are analyzed for document structure
         * (headings, code, formulas, tables, figures, etc.) using RT-DETR
         * models via ONNX Runtime. For PDFs, layout hints override paragraph
         * classification in the markdown pipeline. For images, per-region
         * OCR is performed with markdown formatting based on the detected
         * layout classes.
         *
         * Running inference requires the `layout-detection` feature; the
         * field is present whenever the `layout-types` feature is active
         * (which includes `layout-detection` as well as the no-ORT target
         * groups).
         *
         * @var ?LayoutDetectionConfig
         */
        public readonly ?LayoutDetectionConfig $layout = null,

        /**
         * Hardware acceleration configuration for ONNX Runtime models.
         *
         * Controls execution provider selection for layout detection and
         * embedding models. When `null`, platform defaults are used (CoreML
         * on macOS, CUDA on Linux, CPU on Windows).
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,

        /**
         * Cache namespace for tenant isolation.
         *
         * When set, cache entries are stored under
         * `{cache_dir}/{namespace}/`. Only alphanumerics, hyphens, and
         * underscores are allowed (max 64 chars). Different namespaces have
         * isolated cache spaces on the same filesystem.
         *
         * @var ?string
         */
        public readonly ?string $cacheNamespace = null,

        /**
         * Per-request cache TTL in seconds.
         *
         * Overrides the global `max_age_days` for this specific extraction.
         * `0` skips caching completely (no read or write); `null` leaves the
         * global TTL in effect.
         *
         * @var ?int
         */
        public readonly ?int $cacheTtlSecs = null,

        /**
         * Email extraction configuration (`null` = use defaults).
         *
         * Currently supports configuring the fallback codepage for MSG files
         * that do not specify one. See `EmailConfig` for details.
         *
         * @var ?EmailConfig
         */
        public readonly ?EmailConfig $email = null,

        /**
         * Concurrency limits for constrained environments
         * (`null` = use defaults).
         *
         * Controls Rayon thread pool size, ONNX Runtime intra-op threads,
         * and (when `$maxConcurrentExtractions` is unset) the batch
         * concurrency semaphore. See `ConcurrencyConfig` upstream for
         * details.
         *
         * NOTE(review): typed as a plain string in this stub; the expected
         * string encoding is not visible here — confirm against the native
         * extension.
         *
         * @var ?string
         */
        public readonly ?string $concurrency = null,

        /**
         * Tree-sitter language pack configuration (`null` = tree-sitter
         * disabled).
         *
         * When set, enables code file extraction using tree-sitter parsers
         * and controls grammar download behavior and code analysis options.
         *
         * @var ?TreeSitterConfig
         */
        public readonly ?TreeSitterConfig $treeSitter = null,

        /**
         * Structured extraction via LLM (`null` = disabled).
         *
         * When set, the extracted document content is sent to an LLM with
         * the provided JSON schema. The structured response is stored in
         * `ExtractionResult::structured_output`.
         *
         * @var ?StructuredExtractionConfig
         */
        public readonly ?StructuredExtractionConfig $structuredExtraction = null,

        /**
         * Cancellation token for this extraction (`null` = no external
         * cancellation).
         *
         * Upstream, a `CancellationToken` clone is passed here and
         * `CancellationToken::cancel` is called from another thread / task
         * to abort the extraction in progress. The extractor checks the
         * token at safe checkpoints (before lock acquisition, between pages,
         * between batch items) and returns `KreuzbergError::Cancelled` once
         * it is set.
         *
         * The field is excluded from serialization because the token is a
         * runtime handle, not a configuration value.
         *
         * NOTE(review): typed as a plain string in this stub; how a token is
         * represented on the PHP side is not visible here — confirm against
         * the native extension.
         *
         * @var ?string
         */
        public readonly ?string $cancelToken = null,
    ) {
    }

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always, when the stub itself is executed;
     *                           the real implementation comes from the
     *                           native extension.
     */
    public static function default(): ExtractionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub for the image-processing query.
     *
     * @return bool Declared return type; this stub body never returns.
     *
     * @throws \RuntimeException Always, when the stub itself is executed;
     *                           the real implementation comes from the
     *                           native extension.
     */
    public function needsImageProcessing(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Per-file overrides of the extraction configuration for batch processing.
 *
 * Every property is nullable; `null` means "use the batch-level default."
 * The type is used with `batch_extract_files` and `batch_extract_bytes` so
 * that one batch can mix different extraction settings.
 *
 * # Excluded Fields
 *
 * These `ExtractionConfig` fields are batch-level only and cannot be
 * overridden per file:
 * - `maxConcurrentExtractions` — controls batch parallelism
 * - `useCache` — global caching policy
 * - `acceleration` — shared ONNX execution provider
 * - `securityLimits` — global archive security policy
 */
final class FileExtractionConfig
{
    public function __construct(
        /**
         * Override quality post-processing for this file.
         *
         * @var ?bool
         */
        public readonly ?bool $enableQualityProcessing = null,

        /**
         * Override the OCR configuration for this file
         * (`null` = use the batch default).
         *
         * @var ?OcrConfig
         */
        public readonly ?OcrConfig $ocr = null,

        /**
         * Override force OCR for this file.
         *
         * @var ?bool
         */
        public readonly ?bool $forceOcr = null,

        /**
         * Override the force-OCR page list for this file (1-indexed page
         * numbers).
         *
         * @var ?array<int>
         */
        public readonly ?array $forceOcrPages = null,

        /**
         * Override disable OCR for this file.
         *
         * @var ?bool
         */
        public readonly ?bool $disableOcr = null,

        /**
         * Override the chunking configuration for this file.
         *
         * @var ?ChunkingConfig
         */
        public readonly ?ChunkingConfig $chunking = null,

        /**
         * Override the content filtering configuration for this file.
         *
         * @var ?ContentFilterConfig
         */
        public readonly ?ContentFilterConfig $contentFilter = null,

        /**
         * Override the image extraction configuration for this file.
         *
         * @var ?ImageExtractionConfig
         */
        public readonly ?ImageExtractionConfig $images = null,

        /**
         * Override PDF options for this file.
         *
         * @var ?PdfConfig
         */
        public readonly ?PdfConfig $pdfOptions = null,

        /**
         * Override token reduction for this file.
         *
         * @var ?TokenReductionOptions
         */
        public readonly ?TokenReductionOptions $tokenReduction = null,

        /**
         * Override language detection for this file.
         *
         * @var ?LanguageDetectionConfig
         */
        public readonly ?LanguageDetectionConfig $languageDetection = null,

        /**
         * Override page extraction for this file.
         *
         * @var ?PageConfig
         */
        public readonly ?PageConfig $pages = null,

        /**
         * Override keyword extraction for this file.
         *
         * @var ?KeywordConfig
         */
        public readonly ?KeywordConfig $keywords = null,

        /**
         * Override the post-processor for this file.
         *
         * @var ?PostProcessorConfig
         */
        public readonly ?PostProcessorConfig $postprocessor = null,

        /**
         * Override HTML conversion options for this file.
         *
         * @var ?string
         */
        public readonly ?string $htmlOptions = null,

        /**
         * Override the result format for this file.
         *
         * @var ?ResultFormat
         */
        public readonly ?ResultFormat $resultFormat = null,

        /**
         * Override the output content format for this file.
         *
         * @var ?OutputFormat
         */
        public readonly ?OutputFormat $outputFormat = null,

        /**
         * Override document structure output for this file.
         *
         * @var ?bool
         */
        public readonly ?bool $includeDocumentStructure = null,

        /**
         * Override layout detection for this file.
         *
         * @var ?LayoutDetectionConfig
         */
        public readonly ?LayoutDetectionConfig $layout = null,

        /**
         * Override the per-file extraction timeout, in seconds.
         *
         * When set, extraction of this file is canceled after the given
         * duration. A timed-out file yields an error result without
         * affecting the other files in the batch.
         *
         * @var ?int
         */
        public readonly ?int $timeoutSecs = null,

        /**
         * Override the tree-sitter configuration for this file.
         *
         * @var ?TreeSitterConfig
         */
        public readonly ?TreeSitterConfig $treeSitter = null,

        /**
         * Override the structured extraction configuration for this file.
         *
         * When set, enables LLM-based structured extraction with a JSON
         * schema for this specific file. The extracted content is sent to a
         * VLM/LLM and the response is parsed according to the provided
         * schema.
         *
         * @var ?StructuredExtractionConfig
         */
        public readonly ?StructuredExtractionConfig $structuredExtraction = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * One entry of a byte-array batch extraction job.
 *
 * Used with `batch_extract_bytes` and `batch_extract_bytes_sync`.
 */
final class BatchBytesItem
{
    public function __construct(
        /**
         * The content bytes to extract from.
         *
         * @var string
         */
        public readonly string $content,

        /**
         * MIME type of the content (e.g., "application/pdf", "text/html").
         *
         * @var string
         */
        public readonly string $mimeType,

        /**
         * Per-item configuration overrides (`null` uses the batch-level
         * defaults).
         *
         * @var ?FileExtractionConfig
         */
        public readonly ?FileExtractionConfig $config = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * One entry of a file batch extraction job.
 *
 * Used with `batch_extract_files` and `batch_extract_files_sync`.
 */
final class BatchFileItem
{
    public function __construct(
        /**
         * Path to the file to extract from.
         *
         * @var string
         */
        public readonly string $path,

        /**
         * Per-file configuration overrides (`null` uses the batch-level
         * defaults).
         *
         * @var ?FileExtractionConfig
         */
        public readonly ?FileExtractionConfig $config = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Settings for image extraction.
 *
 * The "documented default" values below come from the upstream field
 * documentation; this constructor requires every argument except
 * `$maxImagesPerPage`.
 */
final class ImageExtractionConfig
{
    public function __construct(
        /**
         * Extract images from documents.
         *
         * @var bool
         */
        public readonly bool $extractImages,

        /**
         * Target DPI for image normalization.
         *
         * @var int
         */
        public readonly int $targetDpi,

        /**
         * Maximum dimension for images (width or height).
         *
         * @var int
         */
        public readonly int $maxImageDimension,

        /**
         * Whether image reference placeholders are injected into the
         * markdown output.
         *
         * When `true` (documented default), image references are appended to
         * the markdown. Set to `false` to extract images as data without
         * polluting the markdown output.
         *
         * @var bool
         */
        public readonly bool $injectPlaceholders,

        /**
         * Automatically adjust DPI based on image content.
         *
         * @var bool
         */
        public readonly bool $autoAdjustDpi,

        /**
         * Minimum DPI threshold.
         *
         * @var int
         */
        public readonly int $minDpi,

        /**
         * Maximum DPI threshold.
         *
         * @var int
         */
        public readonly int $maxDpi,

        /**
         * When `true` (documented default), extracted images are classified
         * by kind and grouped into clusters where they appear to belong to
         * one figure.
         *
         * @var bool
         */
        public readonly bool $classify,

        /**
         * When `true`, full-page renders produced during OCR preprocessing
         * are captured and returned as `ImageKind::PageRaster` entries in
         * `ExtractionResult.images`.
         *
         * **PDF + OCR only.** No rasters are captured for non-PDF inputs or
         * when the document-level OCR bypass is active (whole-document
         * backend). If OCR is enabled and this flag is set but the active
         * backend skips per-page rendering, a `ProcessingWarning` is emitted
         * in `ExtractionResult.processing_warnings`.
         *
         * Documented default: `false`. Enable it when downstream consumers
         * need page thumbnails (e.g. citation previews, visual grounding).
         *
         * @var bool
         */
        public readonly bool $includePageRasters,

        /**
         * Run OCR on extracted images and include the recognized text in the
         * document content.
         *
         * When `true` (documented default) and `ExtractionConfig::$ocr` is
         * configured, extracted images are processed with the configured OCR
         * backend. Set to `false` to extract images without OCR processing
         * even when OCR is enabled.
         *
         * @var bool
         */
        public readonly bool $runOcrOnImages,

        /**
         * When `true`, image OCR results are rendered as plain text without
         * the markdown image placeholder. Only takes effect when
         * `$runOcrOnImages` is also `true`.
         *
         * @var bool
         */
        public readonly bool $ocrTextOnly,

        /**
         * When `true` and `$ocrTextOnly` is `false`, the OCR text is
         * appended after the image placeholder in the rendered output.
         *
         * @var bool
         */
        public readonly bool $appendOcrText,

        /**
         * Maximum number of image objects to extract per PDF page.
         *
         * Some PDFs (e.g. technical diagrams stored as thousands of raster
         * fragments) can cause extremely long or indefinite extraction times
         * when every image object on a dense page is decoded individually by
         * the PDF extractor. With a limit set, kreuzberg stops collecting
         * individual images once the per-page count reaches the cap and
         * emits a warning instead.
         *
         * `null` (default) means no limit — all images are extracted.
         *
         * @var ?int
         */
        public readonly ?int $maxImagesPerPage = null,
    ) {
    }

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always, when the stub itself is executed;
     *                           the real implementation comes from the
     *                           native extension.
     */
    public static function default(): ImageExtractionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Settings for token reduction.
 */
final class TokenReductionOptions
{
    public function __construct(
        /**
         * Reduction mode: "off", "light", "moderate", "aggressive",
         * "maximum".
         *
         * @var string
         */
        public readonly string $mode,

        /**
         * Preserve important words (capitalized, technical terms).
         *
         * @var bool
         */
        public readonly bool $preserveImportantWords,
    ) {
    }

    /**
     * Stub for the default-configuration factory.
     *
     * @throws \RuntimeException Always, when the stub itself is executed;
     *                           the real implementation comes from the
     *                           native extension.
     */
    public static function default(): TokenReductionOptions
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Language detection configuration.
 *
 * Immutable value object: all settings are promoted, read-only constructor
 * properties.
 */
final class LanguageDetectionConfig
{
    public function __construct(
        /** @var bool Enable language detection */
        public readonly bool $enabled,
        /** @var float Minimum confidence threshold (0.0-1.0) */
        public readonly float $minConfidence,
        /** @var bool Detect multiple languages in the document */
        public readonly bool $detectMultiple,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): LanguageDetectionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Configuration for styled HTML output.
 *
 * When set on `ExtractionConfig.html_output` alongside
 * `output_format = OutputFormat.Html`, the pipeline builds a
 * `StyledHtmlRenderer` instead of the plain comrak-based renderer.
 */
final class HtmlOutputConfig
{
    public function __construct(
        /** @var HtmlTheme Built-in colour/typography theme. Default: `HtmlTheme::Unstyled`. */
        public readonly HtmlTheme $theme,
        /**
         * CSS class prefix applied to every emitted class name.
         *
         * Default: `"kb-"`. Change this if your host application already uses
         * classes that start with `kb-`.
         *
         * @var string
         */
        public readonly string $classPrefix,
        /**
         * When `true` (default), write the resolved CSS into a `<style>` block
         * immediately after the opening `<div class="{prefix}doc">`.
         *
         * Set to `false` to emit only the structural markup and wire up your
         * own stylesheet targeting the `kb-*` class names.
         *
         * @var bool
         */
        public readonly bool $embedCss,
        /**
         * Inline CSS string injected into the output after the theme stylesheet.
         * Concatenated after `css_file` content when both are set.
         *
         * @var ?string
         */
        public readonly ?string $css = null,
        /**
         * Path to a CSS file loaded once at renderer construction time.
         * Concatenated before `css` when both are set.
         *
         * @var ?string
         */
        public readonly ?string $cssFile = null,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): HtmlOutputConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Layout detection configuration.
 *
 * Controls layout detection behavior in the extraction pipeline.
 * When set on `ExtractionConfig`, layout detection is enabled for PDF
 * extraction.
 */
final class LayoutDetectionConfig
{
    public function __construct(
        /** @var bool Whether to apply postprocessing heuristics (default: true). */
        public readonly bool $applyHeuristics,
        /**
         * Table structure recognition model.
         *
         * Controls which model is used for table cell detection within layout-detected
         * table regions. Defaults to `TableModel::Tatr`.
         *
         * @var TableModel
         */
        public readonly TableModel $tableModel,
        /** @var ?float Confidence threshold override (null = use model default). */
        public readonly ?float $confidenceThreshold = null,
        /**
         * Hardware acceleration for ONNX models (layout detection + table structure).
         *
         * When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
         * is used for inference. Defaults to `null` (auto-select per platform).
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): LayoutDetectionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Configuration for an LLM provider/model via liter-llm.
 *
 * Each feature (VLM OCR, VLM embeddings, structured extraction) carries
 * its own `LlmConfig`, allowing different providers per feature.
 *
 * # Example
 *
 * ```toml
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * api_key = "sk-..."  # or use KREUZBERG_LLM_API_KEY env var
 * ```
 */
final class LlmConfig
{
    public function __construct(
        /**
         * Provider/model string using liter-llm routing format.
         *
         * Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
         * `"groq/llama-3.1-70b-versatile"`.
         *
         * @var string
         */
        public readonly string $model,
        /**
         * API key for the provider. When `null`, liter-llm falls back to
         * the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
         *
         * @var ?string
         */
        public readonly ?string $apiKey = null,
        /** @var ?string Custom base URL override for the provider endpoint. */
        public readonly ?string $baseUrl = null,
        /** @var ?int Request timeout in seconds (default: 60). */
        public readonly ?int $timeoutSecs = null,
        /** @var ?int Maximum retry attempts (default: 3). */
        public readonly ?int $maxRetries = null,
        /** @var ?float Sampling temperature for generation tasks. */
        public readonly ?float $temperature = null,
        /** @var ?int Maximum tokens to generate. */
        public readonly ?int $maxTokens = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Configuration for LLM-based structured data extraction.
 *
 * Sends extracted document content to a VLM with a JSON schema,
 * returning structured data that conforms to the schema.
 *
 * # Example
 *
 * ```toml
 * [structured_extraction]
 * schema_name = "invoice_data"
 * strict = true
 *
 * [structured_extraction.schema]
 * type = "object"
 * properties.vendor = { type = "string" }
 * properties.total = { type = "number" }
 * required = ["vendor", "total"]
 *
 * [structured_extraction.llm]
 * model = "openai/gpt-4o"
 * ```
 */
final class StructuredExtractionConfig
{
    public function __construct(
        /** @var string JSON Schema defining the desired output structure. */
        public readonly string $schema,
        /** @var string Schema name passed to the LLM's structured output mode. */
        public readonly string $schemaName,
        /** @var bool Enable strict mode — output must exactly match the schema. */
        public readonly bool $strict,
        /** @var LlmConfig LLM configuration for the extraction. */
        public readonly LlmConfig $llm,
        /** @var ?string Optional schema description for the LLM. */
        public readonly ?string $schemaDescription = null,
        /**
         * Custom Jinja2 extraction prompt template. When `null`, a default template is used.
         *
         * Available template variables:
         * - `{{ content }}` — The extracted document text.
         * - `{{ schema }}` — The JSON schema as a formatted string.
         * - `{{ schema_name }}` — The schema name.
         * - `{{ schema_description }}` — The schema description (may be empty).
         *
         * @var ?string
         */
        public readonly ?string $prompt = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Quality thresholds for OCR fallback decisions and pipeline quality gating.
 *
 * All fields default to the values that match the previous hardcoded behavior,
 * so `OcrQualityThresholds::default()` preserves existing semantics exactly.
 */
final class OcrQualityThresholds
{
    public function __construct(
        /** @var int Minimum total non-whitespace characters to consider text substantive. */
        public readonly int $minTotalNonWhitespace,
        /** @var float Minimum non-whitespace characters per page on average. */
        public readonly float $minNonWhitespacePerPage,
        /** @var int Minimum character count for a word to be "meaningful". */
        public readonly int $minMeaningfulWordLen,
        /** @var int Minimum count of meaningful words before text is accepted. */
        public readonly int $minMeaningfulWords,
        /** @var float Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric). */
        public readonly float $minAlnumRatio,
        /** @var int Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback. */
        public readonly int $minGarbageChars,
        /** @var float Maximum fraction of short (1-2 char) words before text is considered fragmented. */
        public readonly float $maxFragmentedWordRatio,
        /**
         * Critical fragmentation threshold — triggers OCR regardless of meaningful words.
         * Normal English text has ~20-30% short words. 80%+ is definitive garbage.
         *
         * @var float
         */
        public readonly float $criticalFragmentedWordRatio,
        /** @var float Minimum average word length. Below this with enough words indicates garbled extraction. */
        public readonly float $minAvgWordLength,
        /** @var int Minimum word count before average word length check applies. */
        public readonly int $minWordsForAvgLengthCheck,
        /** @var float Minimum consecutive word repetition ratio to detect column scrambling. */
        public readonly float $minConsecutiveRepeatRatio,
        /** @var int Minimum word count before consecutive repetition check is applied. */
        public readonly int $minWordsForRepeatCheck,
        /** @var int Minimum character count for "substantive markdown" OCR skip gate. */
        public readonly int $substantiveMinChars,
        /** @var int Minimum character count for "non-text content" OCR skip gate. */
        public readonly int $nonTextMinChars,
        /** @var float Alphanumeric+whitespace ratio threshold for skip decisions. */
        public readonly float $alnumWsRatioThreshold,
        /**
         * Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
         * If the result from a backend scores below this, try the next backend.
         *
         * @var float
         */
        public readonly float $pipelineMinQuality,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): OcrQualityThresholds
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * A single backend stage in the OCR pipeline.
 */
final class OcrPipelineStage
{
    public function __construct(
        /** @var string Backend name: "tesseract", "paddleocr", "easyocr", or a custom registered name. */
        public readonly string $backend,
        /** @var int Priority weight (higher = tried first). Stages are sorted by priority descending. */
        public readonly int $priority,
        /** @var ?string Language override for this stage (null = use parent OcrConfig.language). */
        public readonly ?string $language = null,
        /** @var ?TesseractConfig Tesseract-specific config override for this stage. */
        public readonly ?TesseractConfig $tesseractConfig = null,
        /** @var ?string PaddleOCR-specific config for this stage. */
        public readonly ?string $paddleOcrConfig = null,
        /** @var ?LlmConfig VLM config override for this pipeline stage. */
        public readonly ?LlmConfig $vlmConfig = null,
        /**
         * Arbitrary per-call options passed through to the backend unchanged.
         *
         * Backends that support runtime tuning (mode switching, preprocessing
         * flags, inference parameters, etc.) read this value and deserialize
         * the keys they care about. Keys unknown to the backend are silently
         * ignored, so options from different backends can coexist in the same
         * config without conflict.
         *
         * Example (custom backend):
         * ```json
         * { "mode": "fast", "enable_layout": true }
         * ```
         *
         * @var ?string
         */
        public readonly ?string $backendOptions = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Multi-backend OCR pipeline with quality-based fallback.
 *
 * Backends are tried in priority order (highest first). After each backend
 * produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
 * the result is accepted. Otherwise the next backend is tried.
 */
final class OcrPipelineConfig
{
    public function __construct(
        /** @var array<OcrPipelineStage> Ordered list of backends to try. Sorted by priority (descending) at runtime. */
        public readonly array $stages,
        /** @var OcrQualityThresholds Quality thresholds for deciding whether to accept a result or try the next backend. */
        public readonly OcrQualityThresholds $qualityThresholds,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * OCR configuration.
 */
final class OcrConfig
{
    public function __construct(
        /**
         * Whether OCR is enabled.
         *
         * Setting `enabled: false` is a shorthand for `disable_ocr: true` on the parent
         * `ExtractionConfig`. Images return metadata only; PDFs use native text
         * extraction without OCR fallback.
         *
         * Defaults to `true`. When `false`, all other OCR settings are ignored.
         *
         * @var bool
         */
        public readonly bool $enabled,
        /** @var string OCR backend: tesseract, easyocr, paddleocr */
        public readonly string $backend,
        /** @var string Language code (e.g., "eng", "deu") */
        public readonly string $language,
        /**
         * Enable automatic page rotation based on orientation detection.
         *
         * When enabled, uses Tesseract's `DetectOrientationScript()` to detect
         * page orientation (0/90/180/270 degrees) before OCR. If the page is
         * rotated with high confidence, the image is corrected before recognition.
         * This is critical for handling rotated scanned documents.
         *
         * @var bool
         */
        public readonly bool $autoRotate,
        /** @var ?TesseractConfig Tesseract-specific configuration (optional) */
        public readonly ?TesseractConfig $tesseractConfig = null,
        /** @var ?OutputFormat Output format for OCR results (optional, for format conversion) */
        public readonly ?OutputFormat $outputFormat = null,
        /** @var ?string PaddleOCR-specific configuration (optional, JSON passthrough) */
        public readonly ?string $paddleOcrConfig = null,
        /**
         * Arbitrary per-call options passed through to the backend unchanged.
         *
         * Custom OCR backends and built-in backends that support runtime tuning
         * can read this value and deserialize the keys they care about. Keys
         * unknown to the backend are silently ignored.
         *
         * This is the recommended extension point for per-call parameters that
         * are not covered by the typed fields above (e.g. mode switching,
         * preprocessing flags, inference batch size).
         *
         * **Scope:** when `pipeline` is `null`, this value is propagated to the
         * primary stage of the auto-constructed pipeline. When `pipeline` is
         * explicitly set, this field has **no effect** — the caller must set
         * `OcrPipelineStage.backend_options` directly on the relevant stage(s)
         * instead.
         *
         * Example:
         * ```json
         * { "mode": "fast", "enable_layout": true, "timeout_ms": 5000 }
         * ```
         *
         * @var ?string
         */
        public readonly ?string $backendOptions = null,
        /** @var ?OcrElementConfig OCR element extraction configuration */
        public readonly ?OcrElementConfig $elementConfig = null,
        /**
         * Quality thresholds for the native-text-to-OCR fallback decision.
         * When `null`, uses compiled defaults (matching previous hardcoded behavior).
         *
         * @var ?OcrQualityThresholds
         */
        public readonly ?OcrQualityThresholds $qualityThresholds = null,
        /**
         * Multi-backend OCR pipeline configuration. When set, enables weighted
         * fallback across multiple OCR backends based on output quality.
         * When `null`, uses the single `backend` field (same as today).
         *
         * @var ?OcrPipelineConfig
         */
        public readonly ?OcrPipelineConfig $pipeline = null,
        /**
         * VLM (Vision Language Model) OCR configuration.
         *
         * Required when `backend` is `"vlm"`. Uses liter-llm to send page
         * images to a vision model for text extraction.
         *
         * @var ?LlmConfig
         */
        public readonly ?LlmConfig $vlmConfig = null,
        /**
         * Custom Jinja2 prompt template for VLM OCR.
         *
         * When `null`, uses the default template. Available variables:
         * - `{{ language }}` — The document language code (e.g., "eng", "deu").
         *
         * @var ?string
         */
        public readonly ?string $vlmPrompt = null,
        /**
         * Hardware acceleration for ONNX Runtime models (e.g. PaddleOCR, layout detection).
         *
         * Not user-configurable via config files — injected at runtime from
         * `ExtractionConfig::acceleration` before each `process_image` call.
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Caller-supplied Tesseract `traineddata` bytes per language code.
         *
         * Primary use case is the WASM build, which has no filesystem and cannot
         * download tessdata at runtime. Native builds typically rely on
         * `TessdataManager` and ignore this field. When present, the WASM
         * Tesseract backend prefers these bytes over its compile-time-bundled
         * English data.
         *
         * Skipped by serde to keep config files small — supply via the typed API
         * at runtime.
         *
         * @var ?array<string, string>
         */
        public readonly ?array $tessdataBytes = null,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): OcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Page extraction and tracking configuration.
 *
 * Controls how pages are extracted, tracked, and represented in the extraction results.
 * When no page configuration is supplied, page tracking is disabled.
 *
 * Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
 * when page boundaries are available and chunking is configured.
 */
final class PageConfig
{
    public function __construct(
        /** @var bool Extract pages as separate array (ExtractionResult.pages) */
        public readonly bool $extractPages,
        /** @var bool Insert page markers in main content string */
        public readonly bool $insertPageMarkers,
        /**
         * Page marker format (use {page_num} placeholder)
         * Default: "\n\n<!-- PAGE {page_num} -->\n\n"
         *
         * @var string
         */
        public readonly string $markerFormat,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): PageConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * PDF-specific configuration.
 */
final class PdfConfig
{
    public function __construct(
        /** @var bool Extract images from PDF */
        public readonly bool $extractImages,
        /**
         * Extract tables from PDF.
         *
         * When `true` (default), runs pdf_oxide's native grid detector and, if it
         * finds nothing, falls back to the heuristic text-layer reconstruction in
         * `pdf::oxide::table::extract_tables_heuristic`. Set to `false` to skip
         * both passes — `tables` will then be empty in the result.
         *
         * @var bool
         */
        public readonly bool $extractTables,
        /** @var bool Extract PDF metadata */
        public readonly bool $extractMetadata,
        /**
         * Extract PDF annotations (text notes, highlights, links, stamps).
         * Default: false
         *
         * @var bool
         */
        public readonly bool $extractAnnotations,
        /**
         * Allow single-column pseudo tables in extraction results.
         *
         * By default, tables with fewer than 2 columns (layout-guided) or 3 columns
         * (heuristic) are rejected. When `true`, the minimum column count is relaxed
         * to 1, allowing single-column structured data (glossaries, itemized lists)
         * to be emitted as tables. Other quality filters (density, sparsity, prose
         * detection) still apply.
         *
         * @var bool
         */
        public readonly bool $allowSingleColumnTables,
        /**
         * Perform OCR on inline images extracted from PDF pages and attach the
         * recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
         * to be available; if `ExtractionConfig.ocr` is `null` the extractor
         * falls back to `TesseractConfig::default()`. Per-image failures degrade
         * gracefully (the image is returned without OCR text rather than failing
         * the whole extraction). Default: `false`.
         *
         * @var bool
         */
        public readonly bool $ocrInlineImages,
        /** @var ?array<string> List of passwords to try when opening encrypted PDFs */
        public readonly ?array $passwords = null,
        /** @var ?HierarchyConfig Hierarchy extraction configuration (null = hierarchy extraction disabled) */
        public readonly ?HierarchyConfig $hierarchy = null,
        /**
         * Top margin fraction (0.0–1.0) of page height to exclude headers/running heads.
         * Default: 0.06 (6%)
         *
         * @var ?float
         */
        public readonly ?float $topMarginFraction = null,
        /**
         * Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
         * Default: 0.05 (5%)
         *
         * @var ?float
         */
        public readonly ?float $bottomMarginFraction = null,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): PdfConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Hierarchy extraction configuration for PDF text structure analysis.
 *
 * Enables extraction of document hierarchy levels (H1-H6) based on font size
 * clustering and semantic analysis. When enabled, hierarchical blocks are
 * included in page content.
 */
final class HierarchyConfig
{
    public function __construct(
        /** @var bool Enable hierarchy extraction */
        public readonly bool $enabled,
        /**
         * Number of font size clusters to use for hierarchy levels (1-7)
         *
         * Default: 6, which provides H1-H6 heading levels with body text.
         * Larger values create more fine-grained hierarchy levels.
         *
         * @var int
         */
        public readonly int $kClusters,
        /** @var bool Include bounding box information in hierarchy blocks */
        public readonly bool $includeBbox,
        /**
         * OCR coverage threshold for smart OCR triggering (0.0-1.0)
         *
         * Determines when OCR should be triggered based on text block coverage.
         * OCR is triggered when text blocks cover less than this fraction of the page.
         * Default: 0.5 (trigger OCR if less than 50% of page has text)
         *
         * @var ?float
         */
        public readonly ?float $ocrCoverageThreshold = null,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): HierarchyConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Post-processor configuration.
 */
final class PostProcessorConfig
{
    public function __construct(
        /** @var bool Enable post-processors */
        public readonly bool $enabled,
        /** @var ?array<string> Whitelist of processor names to run (null = all enabled) */
        public readonly ?array $enabledProcessors = null,
        /** @var ?array<string> Blacklist of processor names to skip (null = none disabled) */
        public readonly ?array $disabledProcessors = null,
        /** @var ?array<string> Pre-computed AHashSet for O(1) enabled processor lookup */
        public readonly ?array $enabledSet = null,
        /** @var ?array<string> Pre-computed AHashSet for O(1) disabled processor lookup */
        public readonly ?array $disabledSet = null,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): PostProcessorConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Chunking configuration.
 *
 * Configures text chunking for document content, including chunk size,
 * overlap, trimming behavior, and optional embeddings.
 *
 * Note: the generated text here recommended `..Default.default()` when
 * constructing, to allow for future field additions. That is not PHP syntax
 * and the accompanying example was not carried into this stub.
 */
final class ChunkingConfig
{
    public function __construct(
        /**
         * Maximum size per chunk (in units determined by `sizing`).
         *
         * When `sizing` is `Characters` (default), this is the max character count.
         * When using token-based sizing, this is the max token count.
         *
         * Default: 1000
         *
         * @var int
         */
        public readonly int $maxCharacters,
        /**
         * Overlap between chunks (in units determined by `sizing`).
         *
         * Default: 200
         *
         * @var int
         */
        public readonly int $overlap,
        /**
         * Whether to trim whitespace from chunk boundaries.
         *
         * Default: true
         *
         * @var bool
         */
        public readonly bool $trim,
        /**
         * Type of chunker to use (Text or Markdown).
         *
         * Default: Text
         *
         * @var ChunkerType
         */
        public readonly ChunkerType $chunkerType,
        /**
         * How to measure chunk size.
         *
         * Default: `Characters` (Unicode character count).
         * Enable `chunking-tiktoken` or `chunking-tokenizers` features for token-based sizing.
         *
         * @var ChunkSizing
         */
        public readonly ChunkSizing $sizing,
        /**
         * When `true` and `chunker_type` is `Markdown`, prepend the heading hierarchy
         * path (e.g. `"# Title > ## Section\n\n"`) to each chunk's content string.
         *
         * This is useful for RAG pipelines where each chunk needs self-contained
         * context about its position in the document structure.
         *
         * Default: `false`
         *
         * @var bool
         */
        public readonly bool $prependHeadingContext,
        /** @var ?EmbeddingConfig Optional embedding configuration for chunk embeddings. */
        public readonly ?EmbeddingConfig $embedding = null,
        /** @var ?string Use a preset configuration (overrides individual settings if provided). */
        public readonly ?string $preset = null,
        /**
         * Optional cosine similarity threshold for semantic topic boundary detection.
         *
         * Only used when `chunker_type` is `Semantic` and an `EmbeddingConfig` is
         * provided. You almost never need to set this. When omitted, defaults to
         * `0.75` which works well for most documents. Lower values detect more
         * topic boundaries (more, smaller chunks); higher values detect fewer.
         * Range: `0.0..=1.0`.
         *
         * @var ?float
         */
        public readonly ?float $topicThreshold = null,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): ChunkingConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Embedding configuration for text chunks.
 *
 * Configures embedding generation using ONNX models via the vendored embedding engine.
 * Requires the `embeddings` feature to be enabled.
 */
final class EmbeddingConfig
{
    public function __construct(
        /** @var EmbeddingModelType The embedding model to use (defaults to "balanced" preset if not specified) */
        public readonly EmbeddingModelType $model,
        /** @var bool Whether to normalize embedding vectors (recommended for cosine similarity) */
        public readonly bool $normalize,
        /** @var int Batch size for embedding generation */
        public readonly int $batchSize,
        /** @var bool Show model download progress */
        public readonly bool $showDownloadProgress,
        /**
         * Custom cache directory for model files
         *
         * Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
         * Allows full customization of model download location.
         *
         * @var ?string
         */
        public readonly ?string $cacheDir = null,
        /**
         * Hardware acceleration for the embedding ONNX model.
         *
         * When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
         * is used for inference. Defaults to `null` (auto-select per platform).
         *
         * @var ?AccelerationConfig
         */
        public readonly ?AccelerationConfig $acceleration = null,
        /**
         * Maximum wall-clock duration (in seconds) for a single `embed()` call when
         * using `EmbeddingModelType::Plugin`.
         *
         * Applies only to the in-process plugin path — protects against hung
         * host-language backends (e.g. a Python callback deadlocked on the GIL,
         * a model stuck on CUDA OOM retries, etc.). On timeout, the dispatcher
         * returns `Plugin` (presumably a plugin error — the qualifying type name
         * is missing from the generated text; confirm) instead of blocking forever.
         *
         * `null` disables the timeout. The default (60 seconds) is conservative
         * for common in-process inference; increase for large batches on slow
         * hardware.
         *
         * @var ?int
         */
        public readonly ?int $maxEmbedDurationSecs = null,
    ) {
    }

    /**
     * Stub for the native factory that is expected to return a
     * default-initialised instance. This PHP placeholder only throws; the
     * real implementation is provided by the native extension.
     */
    public static function default(): EmbeddingConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Configuration for tree-sitter language pack integration.
 *
 * Controls grammar download behavior and code analysis options.
 *
 * # Example (TOML)
 *
 * ```toml
 * [tree_sitter]
 * languages = ["python", "rust"]
 * groups = ["web"]
 *
 * [tree_sitter.process]
 * structure = true
 * comments = true
 * docstrings = true
 * ```
 *
 * This is a type stub; the real class is provided at runtime by the native extension.
 */
final class TreeSitterConfig
{
    public function __construct(
        /**
         * Enable code intelligence processing (default: true).
         *
         * When `false`, tree-sitter analysis is completely skipped even if
         * the config section is present.
         *
         * @var bool
         */
        public readonly bool $enabled,
        /** @var TreeSitterProcessConfig Processing options for code analysis. */
        public readonly TreeSitterProcessConfig $process,
        /**
         * Custom cache directory for downloaded grammars.
         *
         * When `null`, uses the default: `~/.cache/tree-sitter-language-pack/v{version}/libs/`.
         *
         * @var ?string
         */
        public readonly ?string $cacheDir = null,
        /** @var ?array<string> Languages to pre-download on init (e.g., `["python", "rust"]`). */
        public readonly ?array $languages = null,
        /** @var ?array<string> Language groups to pre-download (e.g., `["web", "systems", "scripting"]`). */
        public readonly ?array $groups = null,
    ) {
    }

    /**
     * Factory for a default configuration, implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public static function default(): TreeSitterConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Processing options for tree-sitter code analysis.
 *
 * Controls which analysis features are enabled when extracting code files.
 * The "Default" values quoted on each property come from the upstream
 * description; this constructor itself requires every flag explicitly.
 *
 * This is a type stub; the real class is provided at runtime by the native extension.
 */
final class TreeSitterProcessConfig
{
    public function __construct(
        /** @var bool Extract structural items (functions, classes, structs, etc.). Default: true. */
        public readonly bool $structure,
        /** @var bool Extract import statements. Default: true. */
        public readonly bool $imports,
        /** @var bool Extract export statements. Default: true. */
        public readonly bool $exports,
        /** @var bool Extract comments. Default: false. */
        public readonly bool $comments,
        /** @var bool Extract docstrings. Default: false. */
        public readonly bool $docstrings,
        /** @var bool Extract symbol definitions. Default: false. */
        public readonly bool $symbols,
        /** @var bool Include parse diagnostics. Default: false. */
        public readonly bool $diagnostics,
        /** @var CodeContentMode Content rendering mode for code extraction. */
        public readonly CodeContentMode $contentMode,
        /** @var ?int Maximum chunk size in bytes. `null` disables chunking. */
        public readonly ?int $chunkMaxSize = null,
    ) {
    }

    /**
     * Factory for a default configuration, implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public static function default(): TreeSitterProcessConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * A supported document format entry.
 *
 * Pairs a file extension with the MIME type that Kreuzberg can process for it.
 */
final class SupportedFormat
{
    public function __construct(
        /** @var string File extension (without leading dot), e.g., "pdf", "docx". */
        public readonly string $extension,
        /** @var string MIME type string, e.g., "application/pdf". */
        public readonly string $mimeType,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * API server configuration.
 *
 * Holds all configuration options for the Kreuzberg API server,
 * including host/port settings, CORS configuration, and upload limits.
 *
 * # Defaults
 *
 * - `$host`: "127.0.0.1" (localhost only)
 * - `$port`: 8000
 * - `$corsOrigins`: empty array (allows all origins)
 * - `$maxRequestBodyBytes`: 104_857_600 (100 MB)
 * - `$maxMultipartFieldBytes`: 104_857_600 (100 MB)
 *
 * The constructor requires every value explicitly; the defaults above are
 * the upstream-documented ones (presumably what `default()` yields — confirm).
 *
 * This is a type stub; the real class is provided at runtime by the native extension.
 */
final class ServerConfig
{
    public function __construct(
        /** @var string Server host address (e.g., "127.0.0.1", "0.0.0.0"). */
        public readonly string $host,
        /** @var int Server port number. */
        public readonly int $port,
        /**
         * CORS allowed origins. An empty array means allow all origins.
         *
         * If this is empty, the server will accept requests from any origin.
         * If populated with specific origins (e.g., `"https://example.com"`), only
         * those origins will be allowed.
         *
         * @var array<string>
         */
        public readonly array $corsOrigins,
        /** @var int Maximum size of request body in bytes (default: 100 MB). */
        public readonly int $maxRequestBodyBytes,
        /** @var int Maximum size of multipart fields in bytes (default: 100 MB). */
        public readonly int $maxMultipartFieldBytes,
    ) {
    }

    /**
     * Factory for a default configuration, implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public static function default(): ServerConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Listen address as a string.
     *
     * NOTE(review): presumably combines `$host` and `$port` — confirm against the native implementation.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public function listenAddr(): string
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Whether CORS is open to every origin.
     *
     * NOTE(review): presumably true when `$corsOrigins` is empty — confirm.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public function corsAllowsAll(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Whether the given origin is allowed by the CORS settings.
     *
     * @param string $origin Origin to check; matching rules are defined by the native extension.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public function isOriginAllowed(string $origin): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Request body limit expressed in megabytes.
     *
     * NOTE(review): presumably derived from `$maxRequestBodyBytes`; rounding is not visible here.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public function maxRequestBodyMb(): int
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Multipart field limit expressed in megabytes.
     *
     * NOTE(review): presumably derived from `$maxMultipartFieldBytes`; rounding is not visible here.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public function maxMultipartFieldMb(): int
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Immutable value object with four public fields: `$content`, `$format`,
 * `$metadata` and `$textFields`.
 *
 * NOTE(review): the generator emitted no description for this type; judging
 * by the name it carries the output of structured-data extraction — confirm
 * field semantics against the native extension.
 */
final class StructuredDataResult
{
    public function __construct(
        /** @var string */
        public readonly string $content,
        /** @var string */
        public readonly string $format,
        /** @var array<string, string> */
        public readonly array $metadata,
        /** @var array<string> */
        public readonly array $textFields,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Application properties from docProps/app.xml for DOCX.
 *
 * Contains Word-specific document statistics and metadata. Every field is
 * optional and is `null` when omitted.
 */
final class DocxAppProperties
{
    public function __construct(
        /** @var ?string Application name (e.g., "Microsoft Office Word"). */
        public readonly ?string $application = null,
        /** @var ?string Application version. */
        public readonly ?string $appVersion = null,
        /** @var ?string Template filename. */
        public readonly ?string $template = null,
        /** @var ?int Total editing time in minutes. */
        public readonly ?int $totalTime = null,
        /** @var ?int Number of pages. */
        public readonly ?int $pages = null,
        /** @var ?int Number of words. */
        public readonly ?int $words = null,
        /** @var ?int Number of characters (excluding spaces). */
        public readonly ?int $characters = null,
        /** @var ?int Number of characters (including spaces). */
        public readonly ?int $charactersWithSpaces = null,
        /** @var ?int Number of lines. */
        public readonly ?int $lines = null,
        /** @var ?int Number of paragraphs. */
        public readonly ?int $paragraphs = null,
        /** @var ?string Company name. */
        public readonly ?string $company = null,
        /** @var ?int Document security level. */
        public readonly ?int $docSecurity = null,
        /** @var ?bool Scale crop flag. */
        public readonly ?bool $scaleCrop = null,
        /** @var ?bool Links up to date flag. */
        public readonly ?bool $linksUpToDate = null,
        /** @var ?bool Shared document flag. */
        public readonly ?bool $sharedDoc = null,
        /** @var ?bool Hyperlinks changed flag. */
        public readonly ?bool $hyperlinksChanged = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Application properties from docProps/app.xml for XLSX.
 *
 * Contains Excel-specific document metadata. Only `$worksheetNames` is
 * required; the remaining fields are `null` when omitted.
 */
final class XlsxAppProperties
{
    public function __construct(
        /** @var array<string> Worksheet names. */
        public readonly array $worksheetNames,
        /** @var ?string Application name (e.g., "Microsoft Excel"). */
        public readonly ?string $application = null,
        /** @var ?string Application version. */
        public readonly ?string $appVersion = null,
        /** @var ?int Document security level. */
        public readonly ?int $docSecurity = null,
        /** @var ?bool Scale crop flag. */
        public readonly ?bool $scaleCrop = null,
        /** @var ?bool Links up to date flag. */
        public readonly ?bool $linksUpToDate = null,
        /** @var ?bool Shared document flag. */
        public readonly ?bool $sharedDoc = null,
        /** @var ?bool Hyperlinks changed flag. */
        public readonly ?bool $hyperlinksChanged = null,
        /** @var ?string Company name. */
        public readonly ?string $company = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Application properties from docProps/app.xml for PPTX.
 *
 * Contains PowerPoint-specific document metadata. Only `$slideTitles` is
 * required; the remaining fields are `null` when omitted.
 */
final class PptxAppProperties
{
    public function __construct(
        /** @var array<string> Slide titles. */
        public readonly array $slideTitles,
        /** @var ?string Application name (e.g., "Microsoft Office PowerPoint"). */
        public readonly ?string $application = null,
        /** @var ?string Application version. */
        public readonly ?string $appVersion = null,
        /** @var ?int Total editing time in minutes. */
        public readonly ?int $totalTime = null,
        /** @var ?string Company name. */
        public readonly ?string $company = null,
        /** @var ?int Document security level. */
        public readonly ?int $docSecurity = null,
        /** @var ?bool Scale crop flag. */
        public readonly ?bool $scaleCrop = null,
        /** @var ?bool Links up to date flag. */
        public readonly ?bool $linksUpToDate = null,
        /** @var ?bool Shared document flag. */
        public readonly ?bool $sharedDoc = null,
        /** @var ?bool Hyperlinks changed flag. */
        public readonly ?bool $hyperlinksChanged = null,
        /** @var ?int Number of slides. */
        public readonly ?int $slides = null,
        /** @var ?int Number of notes. */
        public readonly ?int $notes = null,
        /** @var ?int Number of hidden slides. */
        public readonly ?int $hiddenSlides = null,
        /** @var ?int Number of multimedia clips. */
        public readonly ?int $multimediaClips = null,
        /** @var ?string Presentation format (e.g., "Widescreen", "Standard"). */
        public readonly ?string $presentationFormat = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Dublin Core metadata from docProps/core.xml.
 *
 * Contains standard metadata fields defined by the Dublin Core standard
 * and Office-specific extensions. Every field is optional and is `null`
 * when omitted.
 */
final class CoreProperties
{
    public function __construct(
        /** @var ?string Document title. */
        public readonly ?string $title = null,
        /** @var ?string Document subject/topic. */
        public readonly ?string $subject = null,
        /** @var ?string Document creator/author. */
        public readonly ?string $creator = null,
        /** @var ?string Keywords or tags. */
        public readonly ?string $keywords = null,
        /** @var ?string Document description/abstract. */
        public readonly ?string $description = null,
        /** @var ?string User who last modified the document. */
        public readonly ?string $lastModifiedBy = null,
        /** @var ?string Revision number. */
        public readonly ?string $revision = null,
        /** @var ?string Creation timestamp (ISO 8601). */
        public readonly ?string $created = null,
        /** @var ?string Last modification timestamp (ISO 8601). */
        public readonly ?string $modified = null,
        /** @var ?string Document category. */
        public readonly ?string $category = null,
        /** @var ?string Content status (Draft, Final, etc.). */
        public readonly ?string $contentStatus = null,
        /** @var ?string Document language. */
        public readonly ?string $language = null,
        /** @var ?string Unique identifier. */
        public readonly ?string $identifier = null,
        /** @var ?string Document version. */
        public readonly ?string $version = null,
        /** @var ?string Last print timestamp (ISO 8601). */
        public readonly ?string $lastPrinted = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Configuration for security limits across extractors.
 *
 * All limits are intentionally conservative to prevent DoS attacks
 * while still supporting legitimate documents. The figures quoted in
 * parentheses on each property come from the upstream description
 * (presumably the values `default()` yields — confirm); the constructor
 * itself requires every limit explicitly.
 *
 * This is a type stub; the real class is provided at runtime by the native extension.
 */
final class SecurityLimits
{
    public function __construct(
        /** @var int Maximum uncompressed size for archives (500 MB). */
        public readonly int $maxArchiveSize,
        /** @var int Maximum compression ratio before flagging as potential bomb (100:1). */
        public readonly int $maxCompressionRatio,
        /** @var int Maximum number of files in archive (10,000). */
        public readonly int $maxFilesInArchive,
        /** @var int Maximum nesting depth for structures (100). */
        public readonly int $maxNestingDepth,
        /**
         * Maximum length of any single XML entity / attribute / token (1 MiB).
         * This is a per-token cap, NOT a total cap — billion-laughs class
         * attacks where a single entity expands to hundreds of MB are caught
         * here, while normal long text content (a paragraph, a CDATA block) is
         * caught by `$maxContentSize` instead.
         *
         * @var int
         */
        public readonly int $maxEntityLength,
        /** @var int Maximum string growth per document (100 MB). */
        public readonly int $maxContentSize,
        /** @var int Maximum iterations per operation. */
        public readonly int $maxIterations,
        /** @var int Maximum XML depth (100 levels). */
        public readonly int $maxXmlDepth,
        /** @var int Maximum cells per table (100,000). */
        public readonly int $maxTableCells,
    ) {
    }

    /**
     * Factory for the default limits, implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public static function default(): SecurityLimits
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Immutable settings object for token reduction.
 *
 * NOTE(review): the generator emitted no descriptions for this type or its
 * fields; their meaning below is only what the names and types state —
 * confirm semantics and valid ranges against the native extension.
 *
 * This is a type stub; the real class is provided at runtime by the native extension.
 */
final class TokenReductionConfig
{
    public function __construct(
        /** @var ReductionLevel */
        public readonly ReductionLevel $level,
        /** @var bool */
        public readonly bool $preserveMarkdown,
        /** @var bool */
        public readonly bool $preserveCode,
        /** @var float */
        public readonly float $semanticThreshold,
        /** @var bool */
        public readonly bool $enableParallel,
        /** @var bool */
        public readonly bool $useSimd,
        /** @var array<string> */
        public readonly array $preservePatterns,
        /** @var bool */
        public readonly bool $enableSemanticClustering,
        /** @var ?string */
        public readonly ?string $languageHint = null,
        /** @var ?array<string, array<string>> */
        public readonly ?array $customStopwords = null,
        /** @var ?float */
        public readonly ?float $targetReduction = null,
    ) {
    }

    /**
     * Factory for a default configuration, implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public static function default(): TokenReductionConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * A PDF annotation extracted from a document page.
 */
final class PdfAnnotation
{
    public function __construct(
        /** @var PdfAnnotationType The type of annotation. */
        public readonly PdfAnnotationType $annotationType,
        /** @var int Page number where the annotation appears (1-indexed). */
        public readonly int $pageNumber,
        /** @var ?string Text content of the annotation (e.g., comment text, link URL). */
        public readonly ?string $content = null,
        /** @var ?BoundingBox Bounding box of the annotation on the page. */
        public readonly ?BoundingBox $boundingBox = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Comprehensive Djot document structure with semantic preservation.
 *
 * This type captures the full richness of Djot markup, including:
 * - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
 * - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
 * - Attributes (classes, IDs, key-value pairs)
 * - Links, images, footnotes
 * - Math expressions (inline and display)
 * - Tables with full structure
 *
 * Available when the `djot` feature is enabled.
 */
final class DjotContent
{
    public function __construct(
        /** @var string Plain text representation for backwards compatibility. */
        public readonly string $plainText,
        /** @var array<FormattedBlock> Structured block-level content. */
        public readonly array $blocks,
        /** @var Metadata Metadata from YAML frontmatter. */
        public readonly Metadata $metadata,
        /** @var array<Table> Extracted tables as structured data. */
        public readonly array $tables,
        /** @var array<DjotImage> Extracted images with metadata. */
        public readonly array $images,
        /** @var array<DjotLink> Extracted links with URLs. */
        public readonly array $links,
        /** @var array<Footnote> Footnote definitions. */
        public readonly array $footnotes,
        /**
         * Attributes mapped by element identifier (if present).
         *
         * NOTE(review): the declared element type is a plain string list even
         * though the description speaks of a mapping — confirm the actual
         * shape produced by the native extension.
         *
         * @var array<string>
         */
        public readonly array $attributes,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Block-level element in a Djot document.
 *
 * Represents structural elements like headings, paragraphs, lists, code blocks, etc.
 */
final class FormattedBlock
{
    public function __construct(
        /** @var BlockType Type of block element. */
        public readonly BlockType $blockType,
        /** @var array<InlineElement> Inline content within the block. */
        public readonly array $inlineContent,
        /** @var array<FormattedBlock> Nested blocks for containers (blockquotes, list items, divs). */
        public readonly array $children,
        /** @var ?int Heading level (1-6) for headings, or nesting level for lists. */
        public readonly ?int $level = null,
        /** @var ?string Element attributes (classes, IDs, key-value pairs). */
        public readonly ?string $attributes = null,
        /** @var ?string Language identifier for code blocks. */
        public readonly ?string $language = null,
        /** @var ?string Raw code content for code blocks. */
        public readonly ?string $code = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Inline element within a block.
 *
 * Represents text with formatting, links, images, etc.
 */
final class InlineElement
{
    public function __construct(
        /** @var InlineType Type of inline element. */
        public readonly InlineType $elementType,
        /** @var string Text content. */
        public readonly string $content,
        /** @var ?string Element attributes. */
        public readonly ?string $attributes = null,
        /** @var ?array<string, string> Additional metadata (e.g., href for links, src/alt for images). */
        public readonly ?array $metadata = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Image element in Djot.
 */
final class DjotImage
{
    public function __construct(
        /** @var string Image source URL or path. */
        public readonly string $src,
        /** @var string Alternative text. */
        public readonly string $alt,
        /** @var ?string Optional title. */
        public readonly ?string $title = null,
        /** @var ?string Element attributes. */
        public readonly ?string $attributes = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Link element in Djot.
 */
final class DjotLink
{
    public function __construct(
        /** @var string Link URL. */
        public readonly string $url,
        /** @var string Link text content. */
        public readonly string $text,
        /** @var ?string Optional title. */
        public readonly ?string $title = null,
        /** @var ?string Element attributes. */
        public readonly ?string $attributes = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Footnote in Djot.
 */
final class Footnote
{
    public function __construct(
        /** @var string Footnote label. */
        public readonly string $label,
        /** @var array<FormattedBlock> Footnote content blocks. */
        public readonly array $content,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Top-level structured document representation.
 *
 * A flat array of nodes with index-based parent/child references forming a tree.
 * Root-level nodes have a `null` parent. The upstream description also refers to
 * `body_roots()` / `furniture_roots()` (iterate top-level content by layer) and
 * `validate()` (verify that node indices are in bounds and that parent-child
 * relationships are bidirectionally consistent); none of those methods is
 * declared in this stub.
 *
 * This is a type stub; the real class is provided at runtime by the native extension.
 */
final class DocumentStructure
{
    public function __construct(
        /** @var array<DocumentNode> All nodes in document/reading order. */
        public readonly array $nodes,
        /**
         * Resolved relationships between nodes (footnote refs, citations, anchor links, etc.).
         *
         * Populated during derivation from the internal document representation.
         * Empty when no relationships are detected.
         *
         * @var array<DocumentRelationship>
         */
        public readonly array $relationships,
        /**
         * Sorted, deduplicated list of node type names present in this document.
         *
         * Each value is the snake_case `node_type` tag of the corresponding
         * `NodeContent` variant (e.g. `"paragraph"`, `"heading"`, `"table"`, …).
         *
         * Computed from `$nodes` via `finalizeNodeTypes()`.
         * Empty until that method is called (internal construction paths call it
         * at the end of derivation).
         *
         * @var array<string>
         */
        public readonly array $nodeTypes,
        /**
         * Origin format identifier (e.g. "docx", "pptx", "html", "pdf").
         *
         * Allows renderers to apply format-aware heuristics when converting
         * the document tree to output formats.
         *
         * @var ?string
         */
        public readonly ?string $sourceFormat = null,
    ) {
    }

    /**
     * Recompute `$nodeTypes` from `$nodes` (per the property description above).
     *
     * The stub previously had an empty body, silently doing nothing; it now
     * throws like every other native-backed method so that accidental use of
     * the stub is noticed.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public function finalizeNodeTypes(): void
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Whether the structure is empty.
     *
     * NOTE(review): presumably true when `$nodes` is empty — confirm.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public function isEmpty(): bool
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Factory for a default instance, implemented by the native extension.
     *
     * @throws \RuntimeException Always in this stub, which has no implementation.
     */
    public static function default(): DocumentStructure
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * A resolved relationship between two nodes in the document tree.
 */
final class DocumentRelationship
{
    public function __construct(
        /** @var int Source node index (the referencing node). */
        public readonly int $source,
        /** @var int Target node index (the referenced node). */
        public readonly int $target,
        /** @var RelationshipKind Semantic kind of the relationship. */
        public readonly RelationshipKind $kind,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * A single node in the document tree.
 *
 * Each node has deterministic `$id`, typed `$content`, optional `$parent`/`$children`
 * for tree structure, and metadata like page number, bounding box, and content layer.
 */
final class DocumentNode
{
    public function __construct(
        /** @var string Deterministic identifier (hash of content + position). */
        public readonly string $id,
        /** @var NodeContent Node content — tagged enum, type-specific data only. */
        public readonly NodeContent $content,
        /** @var array<int> Child node indices in reading order. */
        public readonly array $children,
        /** @var ContentLayer Content layer classification. */
        public readonly ContentLayer $contentLayer,
        /**
         * Inline annotations (formatting, links) on this node's text content.
         *
         * Only meaningful for text-carrying nodes; empty for containers.
         *
         * @var array<TextAnnotation>
         */
        public readonly array $annotations,
        /** @var ?int Parent node index (`null` = root-level node). */
        public readonly ?int $parent = null,
        /** @var ?int Page number where this node starts (1-indexed). */
        public readonly ?int $page = null,
        /** @var ?int Page number where this node ends (for multi-page tables/sections). */
        public readonly ?int $pageEnd = null,
        /** @var ?BoundingBox Bounding box in document coordinates. */
        public readonly ?BoundingBox $bbox = null,
        /**
         * Format-specific key-value attributes.
         *
         * Extensible bag for miscellaneous data without a dedicated typed field: CSS classes,
         * LaTeX environment names, Excel cell formulas, slide layout names, etc.
         *
         * @var ?array<string, string>
         */
        public readonly ?array $attributes = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Structured table grid with cell-level metadata.
 *
 * Stores row/column dimensions and a flat list of cells with position info.
 */
final class TableGrid
{
    public function __construct(
        /** @var int Number of rows in the table. */
        public readonly int $rows,
        /** @var int Number of columns in the table. */
        public readonly int $cols,
        /** @var array<GridCell> All cells in row-major order. */
        public readonly array $cells,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Individual grid cell with position and span metadata.
 */
final class GridCell
{
    public function __construct(
        /** @var string Cell text content. */
        public readonly string $content,
        /** @var int Zero-indexed row position. */
        public readonly int $row,
        /** @var int Zero-indexed column position. */
        public readonly int $col,
        /** @var int Number of rows this cell spans. */
        public readonly int $rowSpan,
        /** @var int Number of columns this cell spans. */
        public readonly int $colSpan,
        /** @var bool Whether this is a header cell. */
        public readonly bool $isHeader,
        /** @var ?BoundingBox Bounding box for this cell (if available). */
        public readonly ?BoundingBox $bbox = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Inline text annotation — byte-range based formatting and links.
 *
 * Annotations reference byte offsets into the node's text content,
 * enabling precise identification of formatted regions.
 */
final class TextAnnotation
{
    public function __construct(
        /** @var int Start byte offset in the node's text content (inclusive). */
        public readonly int $start,
        /** @var int End byte offset in the node's text content (exclusive). */
        public readonly int $end,
        /** @var AnnotationKind Annotation type. */
        public readonly AnnotationKind $kind,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * General extraction result used by the core extraction API.
 *
 * This is the main result type returned by all extraction functions.
 * The first five constructor arguments are required; every other field is
 * optional and defaults to `null` when the corresponding feature did not run.
 */
final class ExtractionResult
{
    public function __construct(
        /** @var string */
        public readonly string $content,
        /** @var string */
        public readonly string $mimeType,
        /** @var Metadata */
        public readonly Metadata $metadata,
        /** @var array<Table> */
        public readonly array $tables,
        /**
         * Non-fatal warnings collected during processing pipeline stages.
         *
         * Captures errors from optional pipeline features (embedding, chunking,
         * language detection, output formatting) that don't prevent extraction
         * but may indicate degraded results.
         * Previously stored as individual keys in `metadata.additional`.
         *
         * @var array<ProcessingWarning>
         */
        public readonly array $processingWarnings,
        /**
         * Extraction strategy used to produce the returned text.
         *
         * Populated when the extractor can reliably distinguish native text extraction,
         * OCR-only extraction, or mixed native/OCR output.
         *
         * @var ?ExtractionMethod
         */
        public readonly ?ExtractionMethod $extractionMethod = null,
        /** @var ?array<string> */
        public readonly ?array $detectedLanguages = null,
        /**
         * Text chunks when chunking is enabled.
         *
         * When chunking configuration is provided, the content is split into
         * overlapping chunks for efficient processing. Each chunk contains the text,
         * optional embeddings (if enabled), and metadata about its position.
         *
         * @var ?array<Chunk>
         */
        public readonly ?array $chunks = null,
        /**
         * Extracted images from the document.
         *
         * When image extraction is enabled via `ImageExtractionConfig`, this field
         * contains all images found in the document with their raw data and metadata.
         * Each image may optionally carry a nested OCR result (`ExtractedImage::$ocrResult`)
         * if OCR was performed.
         *
         * @var ?array<ExtractedImage>
         */
        public readonly ?array $images = null,
        /**
         * Per-page content when page extraction is enabled.
         *
         * When page extraction is configured, the document is split into per-page content
         * with tables and images mapped to their respective pages.
         *
         * @var ?array<PageContent>
         */
        public readonly ?array $pages = null,
        /**
         * Semantic elements when element-based result format is enabled.
         *
         * When result_format is set to ElementBased, this field contains semantic
         * elements with type classification, unique identifiers, and metadata for
         * Unstructured-compatible element-based processing.
         *
         * @var ?array<Element>
         */
        public readonly ?array $elements = null,
        /**
         * Rich Djot content structure (when extracting Djot documents).
         *
         * When extracting Djot documents with structured extraction enabled,
         * this field contains the full semantic structure including:
         * - Block-level elements with nesting
         * - Inline formatting with attributes
         * - Links, images, footnotes
         * - Math expressions
         * - Complete attribute information
         *
         * The `content` field still contains plain text for backward compatibility.
         *
         * Always `null` for non-Djot documents.
         *
         * @var ?DjotContent
         */
        public readonly ?DjotContent $djotContent = null,
        /**
         * OCR elements with full spatial and confidence metadata.
         *
         * When OCR is performed with element extraction enabled, this field contains
         * the structured representation of detected text including:
         * - Bounding geometry (rectangles or quadrilaterals)
         * - Confidence scores (detection and recognition)
         * - Rotation information
         * - Hierarchical relationships (Tesseract only)
         *
         * This field preserves all metadata that would otherwise be lost when
         * converting to plain text or markdown output formats.
         *
         * Only populated when `OcrElementConfig.include_elements` is true.
         *
         * @var ?array<OcrElement>
         */
        public readonly ?array $ocrElements = null,
        /**
         * Structured document tree (when document structure extraction is enabled).
         *
         * When `include_document_structure` is true in `ExtractionConfig`, this field
         * contains the full hierarchical representation of the document including:
         * - Heading-driven section nesting
         * - Table grids with cell-level metadata
         * - Content layer classification (body, header, footer, footnote)
         * - Inline text annotations (formatting, links)
         * - Bounding boxes and page numbers
         *
         * Independent of `result_format` — can be combined with Unified or ElementBased.
         *
         * @var ?DocumentStructure
         */
        public readonly ?DocumentStructure $document = null,
        /**
         * Extracted keywords when keyword extraction is enabled.
         *
         * When keyword extraction (RAKE or YAKE) is configured, this field contains
         * the extracted keywords with scores, algorithm info, and position data.
         * Previously stored in `metadata.additional["keywords"]`.
         *
         * @var ?array<Keyword>
         */
        public readonly ?array $extractedKeywords = null,
        /**
         * Document quality score from quality analysis.
         *
         * A value between 0.0 and 1.0 indicating the overall text quality.
         * Previously stored in `metadata.additional["quality_score"]`.
         *
         * @var ?float
         */
        public readonly ?float $qualityScore = null,
        /**
         * PDF annotations extracted from the document.
         *
         * When annotation extraction is enabled via `PdfConfig::extract_annotations`,
         * this field contains text notes, highlights, links, stamps, and other
         * annotations found in PDF documents.
         *
         * @var ?array<PdfAnnotation>
         */
        public readonly ?array $annotations = null,
        /**
         * Nested extraction results from archive contents.
         *
         * When extracting archives, each processable file inside produces its own
         * full extraction result. `null` for non-archive formats.
         * Use `max_archive_depth` in config to control recursion depth.
         *
         * @var ?array<ArchiveEntry>
         */
        public readonly ?array $children = null,
        /**
         * URIs/links discovered during document extraction.
         *
         * Contains hyperlinks, image references, citations, email addresses, and
         * other URI-like references found in the document. Always extracted when
         * present in the source document.
         *
         * @var ?array<ExtractedUri>
         */
        public readonly ?array $uris = null,
        /**
         * Tracked changes embedded in the source document.
         *
         * Populated by per-format extractors that understand change-tracking
         * metadata (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`,
         * …). Every extractor defaults to `null` until its format-specific
         * implementation is added. Extractors that do populate this field follow
         * the "accepted-changes" convention: inserted text is present in
         * `content`, deleted text is absent — the revision list is the separate
         * audit trail.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
        /**
         * Structured extraction output from LLM-based JSON schema extraction.
         *
         * When `structured_extraction` is configured in `ExtractionConfig`, the
         * extracted document content is sent to a VLM with the provided JSON schema.
         * The response is parsed and stored here as a JSON value matching the schema.
         *
         * NOTE(review): exposed to PHP as a string — presumably JSON-encoded text
         * that callers must decode themselves; confirm against the extension.
         *
         * @var ?string
         */
        public readonly ?string $structuredOutput = null,
        /**
         * Code intelligence results from tree-sitter analysis.
         *
         * Populated when extracting source code files with the `tree-sitter` feature.
         * Contains metrics, structural analysis, imports/exports, comments,
         * docstrings, symbols, diagnostics, and optionally chunked code segments.
         *
         * Stored as an opaque JSON value so that all language bindings (Go, Java,
         * C#, …) can deserialize it as a raw JSON object rather than a typed struct.
         * The underlying type is `tree_sitter_language_pack::ProcessResult`.
         *
         * @var ?string
         */
        public readonly ?string $codeIntelligence = null,
        /**
         * LLM token usage and cost data for all LLM calls made during this extraction.
         *
         * Contains one entry per LLM call. Multiple entries are produced when
         * VLM OCR, structured extraction, or LLM embeddings run during
         * the same extraction.
         *
         * `null` when no LLM was used.
         *
         * @var ?array<LlmUsage>
         */
        public readonly ?array $llmUsage = null,
        /**
         * Pre-rendered content in the requested output format.
         *
         * Populated during `derive_extraction_result` before tree derivation consumes
         * element data. `apply_output_format` swaps this into `content` at the end
         * of the pipeline, after post-processors have operated on plain text.
         *
         * @var ?string
         */
        public readonly ?string $formattedContent = null,
        /**
         * Structured hOCR document for the OCR+layout pipeline.
         *
         * When tesseract produces hOCR output, the parsed `InternalDocument` carries
         * paragraph structure with bounding boxes and confidence scores. The layout
         * classification step enriches these elements before final rendering.
         *
         * @var ?string
         */
        public readonly ?string $ocrInternalDocument = null,
    ) {}

    /**
     * Build an extraction result from an OCR extraction result.
     *
     * This is a stub: the body below always throws. The working implementation
     * is provided at runtime by the native extension.
     *
     * @param OcrExtractionResult $ocr OCR result to convert.
     *
     * @return ExtractionResult
     *
     * @throws \RuntimeException Always, when this stub (not the native class) is executed.
     */
    public static function fromOcr(OcrExtractionResult $ocr): ExtractionResult
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * A single file extracted from an archive.
 *
 * When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
 * enabled, each processable file produces its own full `ExtractionResult`.
 */
final class ArchiveEntry
{
    public function __construct(
        /** @var string Archive-relative file path (e.g. "folder/document.pdf"). */
        public readonly string $path,
        /** @var string Detected MIME type of the file. */
        public readonly string $mimeType,
        /** @var ExtractionResult Full extraction result for this file. */
        public readonly ExtractionResult $result,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * A non-fatal warning from a processing pipeline stage.
 *
 * Captures errors from optional features that don't prevent extraction
 * but may indicate degraded results.
 */
final class ProcessingWarning
{
    public function __construct(
        /**
         * The pipeline stage or feature that produced this warning
         * (e.g., "embedding", "chunking", "language_detection", "output_format").
         *
         * @var string
         */
        public readonly string $source,
        /** @var string Human-readable description of what went wrong. */
        public readonly string $message,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Token usage and cost data for a single LLM call made during extraction.
 *
 * Populated when VLM OCR, structured extraction, or LLM-based embeddings
 * are used. Multiple entries may be present when multiple LLM calls occur
 * within one extraction (e.g. VLM OCR + structured extraction).
 */
final class LlmUsage
{
    public function __construct(
        /** @var string The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514"). */
        public readonly string $model,
        /**
         * The pipeline stage that triggered this LLM call
         * (e.g. "vlm_ocr", "structured_extraction", "embeddings").
         *
         * @var string
         */
        public readonly string $source,
        /** @var ?int Number of input/prompt tokens consumed. */
        public readonly ?int $inputTokens = null,
        /** @var ?int Number of output/completion tokens generated. */
        public readonly ?int $outputTokens = null,
        /** @var ?int Total tokens (input + output). */
        public readonly ?int $totalTokens = null,
        /** @var ?float Estimated cost in USD based on the provider's published pricing. */
        public readonly ?float $estimatedCost = null,
        /** @var ?string Why the model stopped generating (e.g. "stop", "length", "content_filter"). */
        public readonly ?string $finishReason = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * A text chunk with optional embedding and metadata.
 *
 * Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
 * contains the text content, optional embedding vector (if embedding generation
 * is configured), and metadata about its position in the document.
 */
final class Chunk
{
    public function __construct(
        /** @var string The text content of this chunk. */
        public readonly string $content,
        /**
         * Semantic structural classification of this chunk.
         *
         * Assigned by the heuristic classifier based on content patterns and
         * heading context. Defaults to `ChunkType::Unknown` when no rule matches.
         *
         * @var ChunkType
         */
        public readonly ChunkType $chunkType,
        /** @var ChunkMetadata Metadata about this chunk's position and properties. */
        public readonly ChunkMetadata $metadata,
        /**
         * Optional embedding vector for this chunk.
         *
         * Only populated when `EmbeddingConfig` is provided in chunking configuration.
         * The dimensionality depends on the chosen embedding model.
         *
         * @var ?array<float>
         */
        public readonly ?array $embedding = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Heading context for a chunk within a Markdown document.
 *
 * Contains the heading hierarchy from document root to this chunk's section.
 */
final class HeadingContext
{
    public function __construct(
        /**
         * The heading hierarchy from document root to this chunk's section.
         * Index 0 is the outermost (h1), last element is the most specific.
         *
         * @var array<HeadingLevel>
         */
        public readonly array $headings,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * A single heading in the hierarchy.
 */
final class HeadingLevel
{
    public function __construct(
        /** @var int Heading depth (1 = h1, 2 = h2, etc.) */
        public readonly int $level,
        /** @var string The text content of the heading. */
        public readonly string $text,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Metadata about a chunk's position in the original document.
 */
final class ChunkMetadata
{
    public function __construct(
        /** @var int Byte offset where this chunk starts in the original text (UTF-8 valid boundary). */
        public readonly int $byteStart,
        /** @var int Byte offset where this chunk ends in the original text (UTF-8 valid boundary). */
        public readonly int $byteEnd,
        /** @var int Zero-based index of this chunk in the document. */
        public readonly int $chunkIndex,
        /** @var int Total number of chunks in the document. */
        public readonly int $totalChunks,
        /**
         * Indices into `ExtractionResult::$images` for images on pages covered by this chunk.
         *
         * Contains zero-based indices into the top-level `images` collection for every
         * image whose page number falls within `[$firstPage, $lastPage]`.
         * Empty when image extraction is disabled or the chunk spans no pages with images.
         *
         * @var array<int>
         */
        public readonly array $imageIndices,
        /**
         * Number of tokens in this chunk (if available).
         *
         * This is calculated by the embedding model's tokenizer if embeddings are enabled.
         *
         * @var ?int
         */
        public readonly ?int $tokenCount = null,
        /**
         * First page number this chunk spans (1-indexed).
         *
         * Only populated when page tracking is enabled in extraction configuration.
         *
         * @var ?int
         */
        public readonly ?int $firstPage = null,
        /**
         * Last page number this chunk spans (1-indexed, equal to `$firstPage` for single-page chunks).
         *
         * Only populated when page tracking is enabled in extraction configuration.
         *
         * @var ?int
         */
        public readonly ?int $lastPage = null,
        /**
         * Heading context when using Markdown chunker.
         *
         * Contains the heading hierarchy this chunk falls under.
         * Only populated when `ChunkerType::Markdown` is used.
         *
         * @var ?HeadingContext
         */
        public readonly ?HeadingContext $headingContext = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Extracted image from a document.
 *
 * Contains raw image data, metadata, and optional nested OCR results.
 * Raw bytes allow cross-language compatibility - users can convert to
 * PIL.Image (Python), Sharp (Node.js), or other formats as needed.
 */
final class ExtractedImage
{
    public function __construct(
        /**
         * Raw image data (PNG, JPEG, WebP, etc. bytes), exposed to PHP as a string.
         * The native side uses `bytes::Bytes` for cheap cloning of large buffers.
         *
         * @var string
         */
        public readonly string $data,
        /**
         * Image format (e.g., "jpeg", "png", "webp").
         * The native side uses `Cow<'static, str>` to avoid allocation for static literals.
         *
         * @var string
         */
        public readonly string $format,
        /** @var int Zero-indexed position of this image in the document/page */
        public readonly int $imageIndex,
        /** @var bool Whether this image is a mask image */
        public readonly bool $isMask,
        /** @var ?int Page/slide number where image was found (1-indexed) */
        public readonly ?int $pageNumber = null,
        /** @var ?int Image width in pixels */
        public readonly ?int $width = null,
        /** @var ?int Image height in pixels */
        public readonly ?int $height = null,
        /** @var ?string Colorspace information (e.g., "RGB", "CMYK", "Gray") */
        public readonly ?string $colorspace = null,
        /** @var ?int Bits per color component (e.g., 8, 16) */
        public readonly ?int $bitsPerComponent = null,
        /** @var ?string Optional description of the image */
        public readonly ?string $description = null,
        /**
         * Nested OCR extraction result (if image was OCRed)
         *
         * When OCR is performed on this image, the result is embedded here
         * rather than in a separate collection, making the relationship explicit.
         *
         * @var ?ExtractionResult
         */
        public readonly ?ExtractionResult $ocrResult = null,
        /**
         * Bounding box of the image on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
         * Only populated for PDF-extracted images when position data is available from the PDF extractor.
         *
         * @var ?BoundingBox
         */
        public readonly ?BoundingBox $boundingBox = null,
        /**
         * Original source path of the image within the document archive (e.g., "media/image1.png" in DOCX).
         * Used for rendering image references when the binary data is not extracted.
         *
         * @var ?string
         */
        public readonly ?string $sourcePath = null,
        /**
         * Heuristic classification of what this image likely depicts.
         * `null` if classification was disabled or inconclusive.
         *
         * @var ?ImageKind
         */
        public readonly ?ImageKind $imageKind = null,
        /** @var ?float Confidence score for `$imageKind`, in the range 0.0 to 1.0. */
        public readonly ?float $kindConfidence = null,
        /**
         * Identifier shared across images that form a single logical figure
         * (e.g. all raster tiles of one technical drawing). `null` for singletons.
         *
         * @var ?int
         */
        public readonly ?int $clusterId = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Bounding box coordinates for element positioning.
 *
 * Defined by two corners: (`$x0`, `$y0`) is the left/bottom corner and
 * (`$x1`, `$y1`) is the right/top corner.
 */
final class BoundingBox
{
    public function __construct(
        /** @var float Left x-coordinate */
        public readonly float $x0,
        /** @var float Bottom y-coordinate */
        public readonly float $y0,
        /** @var float Right x-coordinate */
        public readonly float $x1,
        /** @var float Top y-coordinate */
        public readonly float $y1,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Metadata for a semantic element.
 */
final class ElementMetadata
{
    public function __construct(
        /** @var array<string, string> Additional custom metadata */
        public readonly array $additional,
        /** @var ?int Page number (1-indexed) */
        public readonly ?int $pageNumber = null,
        /** @var ?string Source filename or document name */
        public readonly ?string $filename = null,
        /** @var ?BoundingBox Bounding box coordinates if available */
        public readonly ?BoundingBox $coordinates = null,
        /** @var ?int Position index in the element sequence */
        public readonly ?int $elementIndex = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Semantic element extracted from document.
 *
 * Represents a logical unit of content with semantic classification,
 * unique identifier, and metadata for tracking origin and position.
 */
final class Element
{
    public function __construct(
        /** @var string Unique element identifier */
        public readonly string $elementId,
        /** @var ElementType Semantic type of this element */
        public readonly ElementType $elementType,
        /** @var string Text content of the element */
        public readonly string $text,
        /** @var ElementMetadata Metadata about the element */
        public readonly ElementMetadata $metadata,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Excel workbook representation.
 *
 * Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
 * extracted content and metadata.
 */
final class ExcelWorkbook
{
    public function __construct(
        /** @var array<ExcelSheet> All sheets in the workbook */
        public readonly array $sheets,
        /** @var array<string, string> Workbook-level metadata (author, creation date, etc.) */
        public readonly array $metadata,
        /**
         * Collaborative-edit revision headers from `xl/revisions/revisionHeaders.xml`.
         *
         * Populated for legacy shared-workbook `.xlsx` files that contain the
         * `xl/revisions/` directory. Each `<header>` element maps to one
         * `DocumentRevision { kind: FormatChange }` carrying the header's `guid`
         * (→ `revision_id`), `userName` (→ `author`), and `dateTime` (→ `timestamp`).
         * `anchor` and `delta` are not populated in v1 (per-cell log parsing is a
         * follow-up). `null` when `xl/revisions/revisionHeaders.xml` is absent.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Single Excel worksheet.
 *
 * Represents one sheet from an Excel workbook with its content
 * converted to Markdown format and dimensional statistics.
 */
final class ExcelSheet
{
    public function __construct(
        /** @var string Sheet name as it appears in Excel */
        public readonly string $name,
        /** @var string Sheet content converted to Markdown tables */
        public readonly string $markdown,
        /** @var int Number of rows */
        public readonly int $rowCount,
        /** @var int Number of columns */
        public readonly int $colCount,
        /** @var int Total number of non-empty cells */
        public readonly int $cellCount,
        /**
         * Pre-extracted table cells (rows of cell values).
         * Populated during markdown generation to avoid re-parsing markdown.
         * `null` for empty sheets.
         *
         * @var ?array<array<string>>
         */
        public readonly ?array $tableCells = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * XML extraction result.
 *
 * Contains extracted text content from XML files along with
 * structural statistics about the XML document.
 */
final class XmlExtractionResult
{
    public function __construct(
        /** @var string Extracted text content (XML structure filtered out) */
        public readonly string $content,
        /** @var int Total number of XML elements processed */
        public readonly int $elementCount,
        /** @var array<string> List of unique element names found (sorted) */
        public readonly array $uniqueElements,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Plain text and Markdown extraction result.
 *
 * Contains the extracted text along with statistics and,
 * for Markdown files, structural elements like headers and links.
 * The Markdown-only fields default to `null`.
 */
final class TextExtractionResult
{
    public function __construct(
        /** @var string Extracted text content */
        public readonly string $content,
        /** @var int Number of lines */
        public readonly int $lineCount,
        /** @var int Number of words */
        public readonly int $wordCount,
        /** @var int Number of characters */
        public readonly int $characterCount,
        /** @var ?array<string> Markdown headers (text only, Markdown files only) */
        public readonly ?array $headers = null,
        /** @var ?array<array<string>> Markdown links, each entry a (text, URL) tuple (Markdown files only) */
        public readonly ?array $links = null,
        /** @var ?array<array<string>> Code blocks, each entry a (language, code) tuple (Markdown files only) */
        public readonly ?array $codeBlocks = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * PowerPoint (PPTX) extraction result.
 *
 * Contains extracted slide content, metadata, and embedded images/tables.
 */
final class PptxExtractionResult
{
    public function __construct(
        /** @var string Extracted text content from all slides */
        public readonly string $content,
        /** @var PptxMetadata Presentation metadata */
        public readonly PptxMetadata $metadata,
        /** @var int Total number of slides */
        public readonly int $slideCount,
        /** @var int Total number of embedded images */
        public readonly int $imageCount,
        /** @var int Total number of tables */
        public readonly int $tableCount,
        /** @var array<ExtractedImage> Extracted images from the presentation */
        public readonly array $images,
        /**
         * Hyperlinks discovered in slides as (url, optional_label) pairs.
         *
         * NOTE(review): the declared element type is `string`, which does not match
         * the "pairs" description — confirm the runtime shape against the extension.
         *
         * @var array<string>
         */
        public readonly array $hyperlinks,
        /**
         * Office metadata extracted from docProps/core.xml and docProps/app.xml.
         *
         * Contains keys like "title", "author", "created_by", "subject", "keywords",
         * "modified_by", "created_at", "modified_at", etc.
         *
         * @var array<string, string>
         */
        public readonly array $officeMetadata,
        /** @var ?PageStructure Slide structure with boundaries (when page tracking is enabled) */
        public readonly ?PageStructure $pageStructure = null,
        /** @var ?array<PageContent> Per-slide content (when page tracking is enabled) */
        public readonly ?array $pageContents = null,
        /** @var ?DocumentStructure Structured document representation */
        public readonly ?DocumentStructure $document = null,
        /**
         * Slide comments as revisions.
         *
         * Each `<p:cm>` element in `ppt/comments/comment{N}.xml` becomes a
         * `DocumentRevision { kind: Comment }` with author (resolved from
         * `ppt/commentAuthors.xml`), ISO-8601 timestamp, and
         * `RevisionAnchor::Slide { index }`. `null` when no comment XML parts exist.
         *
         * @var ?array<DocumentRevision>
         */
        public readonly ?array $revisions = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Email extraction result.
 *
 * Complete representation of an extracted email message (.eml or .msg)
 * including headers, body content, and attachments.
 */
final class EmailExtractionResult
{
    public function __construct(
        /** @var array<string> Primary recipient email addresses */
        public readonly array $toEmails,
        /** @var array<string> CC recipient email addresses */
        public readonly array $ccEmails,
        /** @var array<string> BCC recipient email addresses */
        public readonly array $bccEmails,
        /** @var string Cleaned/processed text content. Aliased as `cleaned_text` for back-compat. */
        public readonly string $content,
        /** @var array<EmailAttachment> List of email attachments */
        public readonly array $attachments,
        /** @var array<string, string> Additional email headers and metadata */
        public readonly array $metadata,
        /** @var ?string Email subject line */
        public readonly ?string $subject = null,
        /** @var ?string Sender email address */
        public readonly ?string $fromEmail = null,
        /** @var ?string Email date/timestamp */
        public readonly ?string $date = null,
        /** @var ?string Message-ID header value */
        public readonly ?string $messageId = null,
        /** @var ?string Plain text version of the email body */
        public readonly ?string $plainText = null,
        /** @var ?string HTML version of the email body */
        public readonly ?string $htmlContent = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Email attachment representation.
|
|||
|
|
*
|
|||
|
|
* Contains metadata and optionally the content of an email attachment.
|
|||
|
|
*/
|
|||
|
|
final class EmailAttachment
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var bool Whether this attachment is an image */
|
|||
|
|
public readonly bool $isImage,
|
|||
|
|
/** @var ?string Attachment name (from Content-Disposition header) */
|
|||
|
|
public readonly ?string $name = null,
|
|||
|
|
/** @var ?string Filename of the attachment */
|
|||
|
|
public readonly ?string $filename = null,
|
|||
|
|
/** @var ?string MIME type of the attachment */
|
|||
|
|
public readonly ?string $mimeType = null,
|
|||
|
|
/** @var ?int Size in bytes */
|
|||
|
|
public readonly ?int $size = null,
|
|||
|
|
/**
|
|||
|
|
* Attachment data (if extracted).
|
|||
|
|
* Uses `bytes::Bytes` for cheap cloning of large buffers.
|
|||
|
|
*
|
|||
|
|
* @var ?string
|
|||
|
|
*/
|
|||
|
|
public readonly ?string $data = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* OCR extraction result.
|
|||
|
|
*
|
|||
|
|
* Result of performing OCR on an image or scanned document,
|
|||
|
|
* including recognized text and detected tables.
|
|||
|
|
*/
|
|||
|
|
final class OcrExtractionResult
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var string Recognized text content */
|
|||
|
|
public readonly string $content,
|
|||
|
|
/** @var string Original MIME type of the processed image */
|
|||
|
|
public readonly string $mimeType,
|
|||
|
|
/** @var array<string, string> OCR processing metadata (confidence scores, language, etc.) */
|
|||
|
|
public readonly array $metadata,
|
|||
|
|
/** @var array<OcrTable> Tables detected and extracted via OCR */
|
|||
|
|
public readonly array $tables,
|
|||
|
|
/**
|
|||
|
|
* Structured OCR elements with bounding boxes and confidence scores.
|
|||
|
|
* Available when TSV output is requested or table detection is enabled.
|
|||
|
|
*
|
|||
|
|
* @var ?array<OcrElement>
|
|||
|
|
*/
|
|||
|
|
public readonly ?array $ocrElements = null,
|
|||
|
|
/**
|
|||
|
|
* Structured document produced from hOCR parsing.
|
|||
|
|
* Carries paragraph structure, bounding boxes, and confidence scores
|
|||
|
|
* that the flattened `content` string discards.
|
|||
|
|
*
|
|||
|
|
* @var ?string
|
|||
|
|
*/
|
|||
|
|
public readonly ?string $internalDocument = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Table detected via OCR.
|
|||
|
|
*
|
|||
|
|
* Represents a table structure recognized during OCR processing.
|
|||
|
|
*/
|
|||
|
|
final class OcrTable
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var array<array<string>> Table cells as a 2D vector (rows × columns) */
|
|||
|
|
public readonly array $cells,
|
|||
|
|
/** @var string Markdown representation of the table */
|
|||
|
|
public readonly string $markdown,
|
|||
|
|
/** @var int Page number where the table was found (1-indexed) */
|
|||
|
|
public readonly int $pageNumber,
|
|||
|
|
/** @var ?OcrTableBoundingBox Bounding box of the table in pixel coordinates (from OCR word positions). */
|
|||
|
|
public readonly ?OcrTableBoundingBox $boundingBox = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Bounding box for an OCR-detected table in pixel coordinates.
|
|||
|
|
*/
|
|||
|
|
final class OcrTableBoundingBox
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var int Left x-coordinate (pixels) */
|
|||
|
|
public readonly int $left,
|
|||
|
|
/** @var int Top y-coordinate (pixels) */
|
|||
|
|
public readonly int $top,
|
|||
|
|
/** @var int Right x-coordinate (pixels) */
|
|||
|
|
public readonly int $right,
|
|||
|
|
/** @var int Bottom y-coordinate (pixels) */
|
|||
|
|
public readonly int $bottom
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Image preprocessing configuration for OCR.
|
|||
|
|
*
|
|||
|
|
* These settings control how images are preprocessed before OCR to improve
|
|||
|
|
* text recognition quality. Different preprocessing strategies work better
|
|||
|
|
* for different document types.
|
|||
|
|
*/
|
|||
|
|
final class ImagePreprocessingConfig
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var int Target DPI for the image (300 is standard, 600 for small text). */
|
|||
|
|
public readonly int $targetDpi,
|
|||
|
|
/** @var bool Auto-detect and correct image rotation. */
|
|||
|
|
public readonly bool $autoRotate,
|
|||
|
|
/** @var bool Correct skew (tilted images). */
|
|||
|
|
public readonly bool $deskew,
|
|||
|
|
/** @var bool Remove noise from the image. */
|
|||
|
|
public readonly bool $denoise,
|
|||
|
|
/** @var bool Enhance contrast for better text visibility. */
|
|||
|
|
public readonly bool $contrastEnhance,
|
|||
|
|
/** @var string Binarization method: "otsu", "sauvola", "adaptive". */
|
|||
|
|
public readonly string $binarizationMethod,
|
|||
|
|
/** @var bool Invert colors (white text on black → black on white). */
|
|||
|
|
public readonly bool $invertColors
|
|||
|
|
) { }
|
|||
|
|
public static function default(): ImagePreprocessingConfig
|
|||
|
|
{ throw new \RuntimeException('Not implemented — provided by the native extension.'); }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Tesseract OCR configuration.
|
|||
|
|
*
|
|||
|
|
* Provides fine-grained control over Tesseract OCR engine parameters.
|
|||
|
|
* Most users can use the defaults, but these settings allow optimization
|
|||
|
|
* for specific document types (invoices, handwriting, etc.).
|
|||
|
|
*/
|
|||
|
|
final class TesseractConfig
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var string Language code (e.g., "eng", "deu", "fra") */
|
|||
|
|
public readonly string $language,
|
|||
|
|
/**
|
|||
|
|
* Page Segmentation Mode (0-13).
|
|||
|
|
*
|
|||
|
|
* Common values:
|
|||
|
|
* - 3: Fully automatic page segmentation (native default)
|
|||
|
|
* - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
|
|||
|
|
* - 11: Sparse text with no particular order
|
|||
|
|
*
|
|||
|
|
* @var int
|
|||
|
|
*/
|
|||
|
|
public readonly int $psm,
|
|||
|
|
/** @var string Output format ("text" or "markdown") */
|
|||
|
|
public readonly string $outputFormat,
|
|||
|
|
/**
|
|||
|
|
* OCR Engine Mode (0-3).
|
|||
|
|
*
|
|||
|
|
* - 0: Legacy engine only
|
|||
|
|
* - 1: Neural nets (LSTM) only (usually best)
|
|||
|
|
* - 2: Legacy + LSTM
|
|||
|
|
* - 3: Default (based on what's available)
|
|||
|
|
*
|
|||
|
|
* @var int
|
|||
|
|
*/
|
|||
|
|
public readonly int $oem,
|
|||
|
|
/**
|
|||
|
|
* Minimum confidence threshold (0.0-100.0).
|
|||
|
|
*
|
|||
|
|
* Words with confidence below this threshold may be rejected or flagged.
|
|||
|
|
*
|
|||
|
|
* @var float
|
|||
|
|
*/
|
|||
|
|
public readonly float $minConfidence,
|
|||
|
|
/** @var bool Enable automatic table detection and reconstruction */
|
|||
|
|
public readonly bool $enableTableDetection,
|
|||
|
|
/** @var float Minimum confidence threshold for table detection (0.0-1.0) */
|
|||
|
|
public readonly float $tableMinConfidence,
|
|||
|
|
/** @var int Column threshold for table detection (pixels) */
|
|||
|
|
public readonly int $tableColumnThreshold,
|
|||
|
|
/** @var float Row threshold ratio for table detection (0.0-1.0) */
|
|||
|
|
public readonly float $tableRowThresholdRatio,
|
|||
|
|
/** @var bool Enable OCR result caching */
|
|||
|
|
public readonly bool $useCache,
|
|||
|
|
/** @var bool Use pre-adapted templates for character classification */
|
|||
|
|
public readonly bool $classifyUsePreAdaptedTemplates,
|
|||
|
|
/** @var bool Enable N-gram language model */
|
|||
|
|
public readonly bool $languageModelNgramOn,
|
|||
|
|
/** @var bool Don't reject good words during block-level processing */
|
|||
|
|
public readonly bool $tesseditDontBlkrejGoodWds,
|
|||
|
|
/** @var bool Don't reject good words during row-level processing */
|
|||
|
|
public readonly bool $tesseditDontRowrejGoodWds,
|
|||
|
|
/** @var bool Enable dictionary correction */
|
|||
|
|
public readonly bool $tesseditEnableDictCorrection,
|
|||
|
|
/** @var string Whitelist of allowed characters (empty = all allowed) */
|
|||
|
|
public readonly string $tesseditCharWhitelist,
|
|||
|
|
/** @var string Blacklist of forbidden characters (empty = none forbidden) */
|
|||
|
|
public readonly string $tesseditCharBlacklist,
|
|||
|
|
/** @var bool Use primary language params model */
|
|||
|
|
public readonly bool $tesseditUsePrimaryParamsModel,
|
|||
|
|
/** @var bool Variable-width space detection */
|
|||
|
|
public readonly bool $textordSpaceSizeIsVariable,
|
|||
|
|
/** @var bool Use adaptive thresholding method */
|
|||
|
|
public readonly bool $thresholdingMethod,
|
|||
|
|
/**
|
|||
|
|
* Image preprocessing configuration.
|
|||
|
|
*
|
|||
|
|
* Controls how images are preprocessed before OCR. Can significantly
|
|||
|
|
* improve quality for scanned documents or low-quality images.
|
|||
|
|
*
|
|||
|
|
* @var ?ImagePreprocessingConfig
|
|||
|
|
*/
|
|||
|
|
public readonly ?ImagePreprocessingConfig $preprocessing = null
|
|||
|
|
) { }
|
|||
|
|
public static function default(): TesseractConfig
|
|||
|
|
{ throw new \RuntimeException('Not implemented — provided by the native extension.'); }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Image preprocessing metadata.
|
|||
|
|
*
|
|||
|
|
* Tracks the transformations applied to an image during OCR preprocessing,
|
|||
|
|
* including DPI normalization, resizing, and resampling.
|
|||
|
|
*/
|
|||
|
|
final class ImagePreprocessingMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var array<int> Original image dimensions (width, height) in pixels */
|
|||
|
|
public readonly array $originalDimensions,
|
|||
|
|
/** @var array<float> Original image DPI (horizontal, vertical) */
|
|||
|
|
public readonly array $originalDpi,
|
|||
|
|
/** @var int Target DPI from configuration */
|
|||
|
|
public readonly int $targetDpi,
|
|||
|
|
/** @var float Scaling factor applied to the image */
|
|||
|
|
public readonly float $scaleFactor,
|
|||
|
|
/** @var bool Whether DPI was auto-adjusted based on content */
|
|||
|
|
public readonly bool $autoAdjusted,
|
|||
|
|
/** @var int Final DPI after processing */
|
|||
|
|
public readonly int $finalDpi,
|
|||
|
|
/** @var string Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.) */
|
|||
|
|
public readonly string $resampleMethod,
|
|||
|
|
/** @var bool Whether dimensions were clamped to max_image_dimension */
|
|||
|
|
public readonly bool $dimensionClamped,
|
|||
|
|
/** @var bool Whether resize was skipped (dimensions already optimal) */
|
|||
|
|
public readonly bool $skippedResize,
|
|||
|
|
/** @var ?array<int> New dimensions after resizing (if resized) */
|
|||
|
|
public readonly ?array $newDimensions = null,
|
|||
|
|
/** @var ?int Calculated optimal DPI (if auto_adjust_dpi enabled) */
|
|||
|
|
public readonly ?int $calculatedDpi = null,
|
|||
|
|
/** @var ?string Error message if resize failed */
|
|||
|
|
public readonly ?string $resizeError = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Extraction result metadata.
|
|||
|
|
*
|
|||
|
|
* Contains common fields applicable to all formats, format-specific metadata
|
|||
|
|
* via a discriminated union, and additional custom fields from postprocessors.
|
|||
|
|
*/
|
|||
|
|
final class Metadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/**
|
|||
|
|
* Whether OCR was used during extraction.
|
|||
|
|
*
|
|||
|
|
* Set to `true` whenever the extraction pipeline ran an OCR backend
|
|||
|
|
* (Tesseract, PaddleOCR, VLM, etc.) and used that output as the primary
|
|||
|
|
* or fallback text. `false` means native text extraction was used exclusively.
|
|||
|
|
*
|
|||
|
|
* @var bool
|
|||
|
|
*/
|
|||
|
|
public readonly bool $ocrUsed,
|
|||
|
|
/**
|
|||
|
|
* Additional custom fields from postprocessors.
|
|||
|
|
*
|
|||
|
|
* Serialized as a nested `"additional"` object (not flattened at root level).
|
|||
|
|
* Uses `Cow<'static, str>` keys so static string keys avoid allocation.
|
|||
|
|
*
|
|||
|
|
* @var array<string, string>
|
|||
|
|
*/
|
|||
|
|
public readonly array $additional,
|
|||
|
|
/** @var ?string Document title */
|
|||
|
|
public readonly ?string $title = null,
|
|||
|
|
/** @var ?string Document subject or description */
|
|||
|
|
public readonly ?string $subject = null,
|
|||
|
|
/** @var ?array<string> Primary author(s) - always Vec for consistency */
|
|||
|
|
public readonly ?array $authors = null,
|
|||
|
|
/** @var ?array<string> Keywords/tags - always Vec for consistency */
|
|||
|
|
public readonly ?array $keywords = null,
|
|||
|
|
/** @var ?string Primary language (ISO 639 code) */
|
|||
|
|
public readonly ?string $language = null,
|
|||
|
|
/** @var ?string Creation timestamp (ISO 8601 format) */
|
|||
|
|
public readonly ?string $createdAt = null,
|
|||
|
|
/** @var ?string Last modification timestamp (ISO 8601 format) */
|
|||
|
|
public readonly ?string $modifiedAt = null,
|
|||
|
|
/** @var ?string User who created the document */
|
|||
|
|
public readonly ?string $createdBy = null,
|
|||
|
|
/** @var ?string User who last modified the document */
|
|||
|
|
public readonly ?string $modifiedBy = null,
|
|||
|
|
/** @var ?PageStructure Page/slide/sheet structure with boundaries */
|
|||
|
|
public readonly ?PageStructure $pages = null,
|
|||
|
|
/**
|
|||
|
|
* Format-specific metadata (discriminated union)
|
|||
|
|
*
|
|||
|
|
* Contains detailed metadata specific to the document format.
|
|||
|
|
* Serialized as a nested `"format"` object with a `format_type` discriminator field.
|
|||
|
|
*
|
|||
|
|
* @var ?FormatMetadata
|
|||
|
|
*/
|
|||
|
|
public readonly ?FormatMetadata $format = null,
|
|||
|
|
/** @var ?ImagePreprocessingMetadata Image preprocessing metadata (when OCR preprocessing was applied) */
|
|||
|
|
public readonly ?ImagePreprocessingMetadata $imagePreprocessing = null,
|
|||
|
|
/** @var ?string JSON schema (for structured data extraction) */
|
|||
|
|
public readonly ?string $jsonSchema = null,
|
|||
|
|
/** @var ?ErrorMetadata Error metadata (for batch operations) */
|
|||
|
|
public readonly ?ErrorMetadata $error = null,
|
|||
|
|
/**
|
|||
|
|
* Extraction duration in milliseconds (for benchmarking).
|
|||
|
|
*
|
|||
|
|
* This field is populated by batch extraction to provide per-file timing
|
|||
|
|
* information. It's `None` for single-file extraction (which uses external timing).
|
|||
|
|
*
|
|||
|
|
* @var ?int
|
|||
|
|
*/
|
|||
|
|
public readonly ?int $extractionDurationMs = null,
|
|||
|
|
/** @var ?string Document category (from frontmatter or classification). */
|
|||
|
|
public readonly ?string $category = null,
|
|||
|
|
/** @var ?array<string> Document tags (from frontmatter). */
|
|||
|
|
public readonly ?array $tags = null,
|
|||
|
|
/** @var ?string Document version string (from frontmatter). */
|
|||
|
|
public readonly ?string $documentVersion = null,
|
|||
|
|
/** @var ?string Abstract or summary text (from frontmatter). */
|
|||
|
|
public readonly ?string $abstractText = null,
|
|||
|
|
/**
|
|||
|
|
* Output format identifier (e.g., "markdown", "html", "text").
|
|||
|
|
*
|
|||
|
|
* Set by the output format pipeline stage when format conversion is applied.
|
|||
|
|
* Previously stored in `metadata.additional["output_format"]`.
|
|||
|
|
*
|
|||
|
|
* @var ?string
|
|||
|
|
*/
|
|||
|
|
public readonly ?string $outputFormat = null
|
|||
|
|
) { }
|
|||
|
|
public function isEmpty(): bool
|
|||
|
|
{ throw new \RuntimeException('Not implemented — provided by the native extension.'); }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Excel/spreadsheet format metadata.
|
|||
|
|
*
|
|||
|
|
* Identifies the document as a spreadsheet source via the `FormatMetadata.Excel`
|
|||
|
|
* discriminant. Sheet count and sheet names are stored inside this struct.
|
|||
|
|
*/
|
|||
|
|
final class ExcelMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var ?int Number of sheets in the workbook. */
|
|||
|
|
public readonly ?int $sheetCount = null,
|
|||
|
|
/** @var ?array<string> Names of all sheets in the workbook. */
|
|||
|
|
public readonly ?array $sheetNames = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Email metadata extracted from .eml and .msg files.
|
|||
|
|
*
|
|||
|
|
* Includes sender/recipient information, message ID, and attachment list.
|
|||
|
|
*/
|
|||
|
|
final class EmailMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var array<string> Primary recipients */
|
|||
|
|
public readonly array $toEmails,
|
|||
|
|
/** @var array<string> CC recipients */
|
|||
|
|
public readonly array $ccEmails,
|
|||
|
|
/** @var array<string> BCC recipients */
|
|||
|
|
public readonly array $bccEmails,
|
|||
|
|
/** @var array<string> List of attachment filenames */
|
|||
|
|
public readonly array $attachments,
|
|||
|
|
/** @var ?string Sender's email address */
|
|||
|
|
public readonly ?string $fromEmail = null,
|
|||
|
|
/** @var ?string Sender's display name */
|
|||
|
|
public readonly ?string $fromName = null,
|
|||
|
|
/** @var ?string Message-ID header value */
|
|||
|
|
public readonly ?string $messageId = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Archive (ZIP/TAR/7Z) metadata.
|
|||
|
|
*
|
|||
|
|
* Extracted from compressed archive files containing file lists and size information.
|
|||
|
|
*/
|
|||
|
|
final class ArchiveMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var string Archive format ("ZIP", "TAR", "7Z", etc.) */
|
|||
|
|
public readonly string $format,
|
|||
|
|
/** @var int Total number of files in the archive */
|
|||
|
|
public readonly int $fileCount,
|
|||
|
|
/** @var array<string> List of file paths within the archive */
|
|||
|
|
public readonly array $fileList,
|
|||
|
|
/** @var int Total uncompressed size in bytes */
|
|||
|
|
public readonly int $totalSize,
|
|||
|
|
/** @var ?int Compressed size in bytes (if available) */
|
|||
|
|
public readonly ?int $compressedSize = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Image metadata extracted from image files.
|
|||
|
|
*
|
|||
|
|
* Includes dimensions, format, and EXIF data.
|
|||
|
|
*/
|
|||
|
|
final class ImageMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var int Image width in pixels */
|
|||
|
|
public readonly int $width,
|
|||
|
|
/** @var int Image height in pixels */
|
|||
|
|
public readonly int $height,
|
|||
|
|
/** @var string Image format (e.g., "PNG", "JPEG", "TIFF") */
|
|||
|
|
public readonly string $format,
|
|||
|
|
/** @var array<string, string> EXIF metadata tags */
|
|||
|
|
public readonly array $exif
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* XML metadata extracted during XML parsing.
|
|||
|
|
*
|
|||
|
|
* Provides statistics about XML document structure.
|
|||
|
|
*/
|
|||
|
|
final class XmlMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var int Total number of XML elements processed */
|
|||
|
|
public readonly int $elementCount,
|
|||
|
|
/** @var array<string> List of unique element tag names (sorted) */
|
|||
|
|
public readonly array $uniqueElements
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Text/Markdown metadata.
|
|||
|
|
*
|
|||
|
|
* Extracted from plain text and Markdown files. Includes word counts and,
|
|||
|
|
* for Markdown, structural elements like headers and links.
|
|||
|
|
*/
|
|||
|
|
final class TextMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var int Number of lines in the document */
|
|||
|
|
public readonly int $lineCount,
|
|||
|
|
/** @var int Number of words */
|
|||
|
|
public readonly int $wordCount,
|
|||
|
|
/** @var int Number of characters */
|
|||
|
|
public readonly int $characterCount,
|
|||
|
|
/** @var ?array<string> Markdown headers (headings text only, for Markdown files) */
|
|||
|
|
public readonly ?array $headers = null,
|
|||
|
|
/** @var ?array<array<string>> Markdown links as (text, url) tuples (for Markdown files) */
|
|||
|
|
public readonly ?array $links = null,
|
|||
|
|
/** @var ?array<array<string>> Code blocks as (language, code) tuples (for Markdown files) */
|
|||
|
|
public readonly ?array $codeBlocks = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Header/heading element metadata.
|
|||
|
|
*/
|
|||
|
|
final class HeaderMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var int Header level: 1 (h1) through 6 (h6) */
|
|||
|
|
public readonly int $level,
|
|||
|
|
/** @var string Normalized text content of the header */
|
|||
|
|
public readonly string $text,
|
|||
|
|
/** @var int Document tree depth at the header element */
|
|||
|
|
public readonly int $depth,
|
|||
|
|
/** @var int Byte offset in original HTML document */
|
|||
|
|
public readonly int $htmlOffset,
|
|||
|
|
/** @var ?string HTML id attribute if present */
|
|||
|
|
public readonly ?string $id = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Link element metadata.
|
|||
|
|
*/
|
|||
|
|
final class LinkMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var string The href URL value */
|
|||
|
|
public readonly string $href,
|
|||
|
|
/** @var string Link text content (normalized) */
|
|||
|
|
public readonly string $text,
|
|||
|
|
/** @var LinkType Link type classification */
|
|||
|
|
public readonly LinkType $linkType,
|
|||
|
|
/** @var array<string> Rel attribute values */
|
|||
|
|
public readonly array $rel,
|
|||
|
|
/** @var array<array<string>> Additional attributes as key-value pairs */
|
|||
|
|
public readonly array $attributes,
|
|||
|
|
/** @var ?string Optional title attribute */
|
|||
|
|
public readonly ?string $title = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Image element metadata.
|
|||
|
|
*/
|
|||
|
|
final class ImageMetadataType
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var string Image source (URL, data URI, or SVG content) */
|
|||
|
|
public readonly string $src,
|
|||
|
|
/** @var ImageType Image type classification */
|
|||
|
|
public readonly ImageType $imageType,
|
|||
|
|
/** @var array<array<string>> Additional attributes as key-value pairs */
|
|||
|
|
public readonly array $attributes,
|
|||
|
|
/** @var ?string Alternative text from alt attribute */
|
|||
|
|
public readonly ?string $alt = null,
|
|||
|
|
/** @var ?string Title attribute */
|
|||
|
|
public readonly ?string $title = null,
|
|||
|
|
/** @var ?array<int> Image dimensions as (width, height) if available */
|
|||
|
|
public readonly ?array $dimensions = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Structured data (Schema.org, microdata, RDFa) block.
|
|||
|
|
*/
|
|||
|
|
final class StructuredData
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var StructuredDataType Type of structured data */
|
|||
|
|
public readonly StructuredDataType $dataType,
|
|||
|
|
/** @var string Raw JSON string representation */
|
|||
|
|
public readonly string $rawJson,
|
|||
|
|
/** @var ?string Schema type if detectable (e.g., "Article", "Event", "Product") */
|
|||
|
|
public readonly ?string $schemaType = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* HTML metadata extracted from HTML documents.
|
|||
|
|
*
|
|||
|
|
* Includes document-level metadata, Open Graph data, Twitter Card metadata,
|
|||
|
|
* and extracted structural elements (headers, links, images, structured data).
|
|||
|
|
*/
|
|||
|
|
final class HtmlMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var array<string> Document keywords from `<meta name="keywords">` tag, split on commas */
|
|||
|
|
public readonly array $keywords,
|
|||
|
|
/**
|
|||
|
|
* Open Graph metadata (og:* properties) for social media
|
|||
|
|
* Keys like "title", "description", "image", "url", etc.
|
|||
|
|
*
|
|||
|
|
* @var array<string, string>
|
|||
|
|
*/
|
|||
|
|
public readonly array $openGraph,
|
|||
|
|
/**
|
|||
|
|
* Twitter Card metadata (twitter:* properties)
|
|||
|
|
* Keys like "card", "site", "creator", "title", "description", "image", etc.
|
|||
|
|
*
|
|||
|
|
* @var array<string, string>
|
|||
|
|
*/
|
|||
|
|
public readonly array $twitterCard,
|
|||
|
|
/**
|
|||
|
|
* Additional meta tags not covered by specific fields
|
|||
|
|
* Keys are meta name/property attributes, values are content
|
|||
|
|
*
|
|||
|
|
* @var array<string, string>
|
|||
|
|
*/
|
|||
|
|
public readonly array $metaTags,
|
|||
|
|
/** @var array<HeaderMetadata> Extracted header elements with hierarchy */
|
|||
|
|
public readonly array $headers,
|
|||
|
|
/** @var array<LinkMetadata> Extracted hyperlinks with type classification */
|
|||
|
|
public readonly array $links,
|
|||
|
|
/** @var array<ImageMetadataType> Extracted images with source and dimensions */
|
|||
|
|
public readonly array $images,
|
|||
|
|
/** @var array<StructuredData> Extracted structured data blocks */
|
|||
|
|
public readonly array $structuredData,
|
|||
|
|
/** @var ?string Document title from `<title>` tag */
|
|||
|
|
public readonly ?string $title = null,
|
|||
|
|
/** @var ?string Document description from `<meta name="description">` tag */
|
|||
|
|
public readonly ?string $description = null,
|
|||
|
|
/** @var ?string Document author from `<meta name="author">` tag */
|
|||
|
|
public readonly ?string $author = null,
|
|||
|
|
/** @var ?string Canonical URL from `<link rel="canonical">` tag */
|
|||
|
|
public readonly ?string $canonicalUrl = null,
|
|||
|
|
/** @var ?string Base URL from `<base href="">` tag for resolving relative URLs */
|
|||
|
|
public readonly ?string $baseHref = null,
|
|||
|
|
/** @var ?string Document language from `lang` attribute */
|
|||
|
|
public readonly ?string $language = null,
|
|||
|
|
/** @var ?TextDirection Document text direction from `dir` attribute */
|
|||
|
|
public readonly ?TextDirection $textDirection = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* OCR processing metadata.
|
|||
|
|
*
|
|||
|
|
* Captures information about OCR processing configuration and results.
|
|||
|
|
*/
|
|||
|
|
final class OcrMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var string OCR language code(s) used */
|
|||
|
|
public readonly string $language,
|
|||
|
|
/** @var int Tesseract Page Segmentation Mode (PSM) */
|
|||
|
|
public readonly int $psm,
|
|||
|
|
/** @var string Output format (e.g., "text", "hocr") */
|
|||
|
|
public readonly string $outputFormat,
|
|||
|
|
/** @var int Number of tables detected */
|
|||
|
|
public readonly int $tableCount,
|
|||
|
|
/** @var ?int */
|
|||
|
|
public readonly ?int $tableRows = null,
|
|||
|
|
/** @var ?int */
|
|||
|
|
public readonly ?int $tableCols = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Error metadata (for batch operations).
|
|||
|
|
*/
|
|||
|
|
final class ErrorMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var string */
|
|||
|
|
public readonly string $errorType,
|
|||
|
|
/** @var string */
|
|||
|
|
public readonly string $message
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* PowerPoint presentation metadata.
|
|||
|
|
*
|
|||
|
|
* Extracted from PPTX files containing slide counts and presentation details.
|
|||
|
|
*/
|
|||
|
|
final class PptxMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var int Total number of slides in the presentation */
|
|||
|
|
public readonly int $slideCount,
|
|||
|
|
/** @var array<string> Names of slides (if available) */
|
|||
|
|
public readonly array $slideNames,
|
|||
|
|
/** @var ?int Number of embedded images */
|
|||
|
|
public readonly ?int $imageCount = null,
|
|||
|
|
/** @var ?int Number of tables */
|
|||
|
|
public readonly ?int $tableCount = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* Word document metadata.
|
|||
|
|
*
|
|||
|
|
* Extracted from DOCX files using shared Office Open XML metadata extraction.
|
|||
|
|
* Integrates with `office_metadata` module for core/app/custom properties.
|
|||
|
|
*/
|
|||
|
|
final class DocxMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/**
|
|||
|
|
* Core properties from docProps/core.xml (Dublin Core metadata)
|
|||
|
|
*
|
|||
|
|
* Contains title, creator, subject, keywords, dates, etc.
|
|||
|
|
* Shared format across DOCX/PPTX/XLSX documents.
|
|||
|
|
*
|
|||
|
|
* @var ?CoreProperties
|
|||
|
|
*/
|
|||
|
|
public readonly ?CoreProperties $coreProperties = null,
|
|||
|
|
/**
|
|||
|
|
* Application properties from docProps/app.xml (Word-specific statistics)
|
|||
|
|
*
|
|||
|
|
* Contains word count, page count, paragraph count, editing time, etc.
|
|||
|
|
* DOCX-specific variant of Office application properties.
|
|||
|
|
*
|
|||
|
|
* @var ?DocxAppProperties
|
|||
|
|
*/
|
|||
|
|
public readonly ?DocxAppProperties $appProperties = null,
|
|||
|
|
/**
|
|||
|
|
* Custom properties from docProps/custom.xml (user-defined properties)
|
|||
|
|
*
|
|||
|
|
* Contains key-value pairs defined by users or applications.
|
|||
|
|
* Values can be strings, numbers, booleans, or dates.
|
|||
|
|
*
|
|||
|
|
* @var ?array<string, string>
|
|||
|
|
*/
|
|||
|
|
public readonly ?array $customProperties = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* CSV/TSV file metadata.
|
|||
|
|
*/
|
|||
|
|
final class CsvMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var int */
|
|||
|
|
public readonly int $rowCount,
|
|||
|
|
/** @var int */
|
|||
|
|
public readonly int $columnCount,
|
|||
|
|
/** @var bool */
|
|||
|
|
public readonly bool $hasHeader,
|
|||
|
|
/** @var ?string */
|
|||
|
|
public readonly ?string $delimiter = null,
|
|||
|
|
/** @var ?array<string> */
|
|||
|
|
public readonly ?array $columnTypes = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
* BibTeX bibliography metadata.
|
|||
|
|
*/
|
|||
|
|
final class BibtexMetadata
|
|||
|
|
{
|
|||
|
|
public function __construct(
|
|||
|
|
/** @var int Number of entries in the bibliography. */
|
|||
|
|
public readonly int $entryCount,
|
|||
|
|
/** @var array<string> */
|
|||
|
|
public readonly array $citationKeys,
|
|||
|
|
/** @var array<string> */
|
|||
|
|
public readonly array $authors,
|
|||
|
|
/** @var ?YearRange */
|
|||
|
|
public readonly ?YearRange $yearRange = null,
|
|||
|
|
/** @var ?array<string, int> */
|
|||
|
|
public readonly ?array $entryTypes = null
|
|||
|
|
) { }
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
 * Metadata describing a citation file (RIS, PubMed, EndNote).
 */
final class CitationMetadata
{
    public function __construct(
        /** @var int */
        public readonly int $citationCount,
        /** @var array<string> */
        public readonly array $authors,
        /** @var array<string> */
        public readonly array $dois,
        /** @var array<string> */
        public readonly array $keywords,
        /** @var ?string */
        public readonly ?string $format = null,
        /** @var ?YearRange */
        public readonly ?YearRange $yearRange = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Range of years found in bibliographic metadata.
 */
final class YearRange
{
    public function __construct(
        /** @var array<int> */
        public readonly array $years,
        /** @var ?int */
        public readonly ?int $min = null,
        /** @var ?int */
        public readonly ?int $max = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Metadata describing a FictionBook (FB2) document.
 */
final class FictionBookMetadata
{
    public function __construct(
        /** @var array<string> */
        public readonly array $genres,
        /** @var array<string> */
        public readonly array $sequences,
        /** @var ?string */
        public readonly ?string $annotation = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Metadata describing a dBASE (DBF) file.
 */
final class DbfMetadata
{
    public function __construct(
        /** @var int */
        public readonly int $recordCount,
        /** @var int */
        public readonly int $fieldCount,
        /** @var array<DbfFieldInfo> */
        public readonly array $fields,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Information about a single dBASE field.
 */
final class DbfFieldInfo
{
    public function __construct(
        /** @var string */
        public readonly string $name,
        /** @var string */
        public readonly string $fieldType,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Metadata for JATS (Journal Article Tag Suite) documents.
 */
final class JatsMetadata
{
    public function __construct(
        /** @var array<string, string> */
        public readonly array $historyDates,
        /** @var array<ContributorRole> */
        public readonly array $contributorRoles,
        /** @var ?string */
        public readonly ?string $copyright = null,
        /** @var ?string */
        public readonly ?string $license = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * A JATS contributor together with an optional role.
 */
final class ContributorRole
{
    public function __construct(
        /** @var string */
        public readonly string $name,
        /** @var ?string */
        public readonly ?string $role = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Metadata for EPUB documents (Dublin Core extensions).
 */
final class EpubMetadata
{
    public function __construct(
        /** @var ?string */
        public readonly ?string $coverage = null,
        /** @var ?string */
        public readonly ?string $dcFormat = null,
        /** @var ?string */
        public readonly ?string $relation = null,
        /** @var ?string */
        public readonly ?string $source = null,
        /** @var ?string */
        public readonly ?string $dcType = null,
        /** @var ?string */
        public readonly ?string $coverImage = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Metadata for an Outlook PST archive.
 */
final class PstMetadata
{
    public function __construct(
        /** @var int */
        public readonly int $messageCount,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Confidence scores attached to an OCR element.
 *
 * Detection confidence (how sure we are that text exists at this location)
 * is kept apart from recognition confidence (how sure we are about the
 * actual text content).
 */
final class OcrConfidence
{
    public function __construct(
        /**
         * Recognition confidence: certainty about the text content.
         *
         * Range: 0.0 to 1.0.
         *
         * @var float
         */
        public readonly float $recognition,
        /**
         * Detection confidence: certainty of the OCR engine that text exists here.
         *
         * PaddleOCR provides this as `box_score`; Tesseract doesn't have a
         * direct equivalent. Range: 0.0 to 1.0 (or null if not available).
         *
         * @var ?float
         */
        public readonly ?float $detection = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Rotation details for an OCR element.
 */
final class OcrRotation
{
    public function __construct(
        /** @var float Angle of rotation in degrees (0, 90, 180, 270 for PaddleOCR). */
        public readonly float $angleDegrees,
        /** @var ?float Confidence score of the rotation detection. */
        public readonly ?float $confidence = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Unified OCR element: a piece of detected text together with its full metadata.
 *
 * Primary type for structured OCR output; it preserves all information
 * from both the Tesseract and PaddleOCR backends.
 */
final class OcrElement
{
    public function __construct(
        /** @var string Recognized text content. */
        public readonly string $text,
        /** @var OcrBoundingGeometry Bounding geometry (rectangle or quadrilateral). */
        public readonly OcrBoundingGeometry $geometry,
        /** @var OcrConfidence Detection and recognition confidence scores. */
        public readonly OcrConfidence $confidence,
        /** @var OcrElementLevel Hierarchical level (word, line, block, page). */
        public readonly OcrElementLevel $level,
        /** @var int Page number (1-indexed). */
        public readonly int $pageNumber,
        /** @var array<string, string> Backend-specific metadata that doesn't fit the unified schema. */
        public readonly array $backendMetadata,
        /** @var ?OcrRotation Rotation information (if detected). */
        public readonly ?OcrRotation $rotation = null,
        /**
         * ID of the parent element, used for hierarchical relationships.
         *
         * Only used for Tesseract output, which has a word -> line -> block hierarchy.
         *
         * @var ?string
         */
        public readonly ?string $parentId = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Configuration for OCR element extraction.
 *
 * Governs how OCR elements are extracted and filtered.
 */
final class OcrElementConfig
{
    public function __construct(
        /**
         * Whether OCR elements are included in the extraction result.
         *
         * When true, the `ocr_elements` field in `ExtractionResult` will be populated.
         *
         * @var bool
         */
        public readonly bool $includeElements,
        /**
         * Minimum hierarchical level to include.
         *
         * Elements below this level (e.g., words when min_level is Line) will be excluded.
         *
         * @var OcrElementLevel
         */
        public readonly OcrElementLevel $minLevel,
        /**
         * Minimum recognition confidence threshold (0.0-1.0).
         *
         * Elements whose confidence is below this threshold will be filtered out.
         *
         * @var float
         */
        public readonly float $minConfidence,
        /**
         * Whether hierarchical relationships between elements are built.
         *
         * When true, `parent_id` fields will be populated based on spatial
         * containment. Only meaningful for Tesseract output.
         *
         * @var bool
         */
        public readonly bool $buildHierarchy,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Unified page structure for documents.
 *
 * Covers different page types (PDF pages, PPTX slides, Excel sheets) and
 * carries character offset boundaries for chunk-to-page mapping.
 */
final class PageStructure
{
    public function __construct(
        /** @var int Total number of pages/slides/sheets */
        public readonly int $totalCount,
        /** @var PageUnitType Type of paginated unit */
        public readonly PageUnitType $unitType,
        /**
         * Character offset boundaries for each page.
         *
         * Maps character ranges in the extracted content to page numbers;
         * used for chunk page range calculation.
         *
         * NOTE(review): the PageBoundary stub describes its offsets as byte
         * offsets (`byteStart` / `byteEnd`) — confirm which unit applies here.
         *
         * @var ?array<PageBoundary>
         */
        public readonly ?array $boundaries = null,
        /** @var ?array<PageInfo> Detailed per-page metadata (optional, only when needed) */
        public readonly ?array $pages = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Byte offset boundary for a page.
 *
 * Records where a specific page's content starts and ends in the main
 * content string, which enables mapping from byte positions to page numbers.
 * Offsets are guaranteed to be at valid UTF-8 character boundaries when
 * using standard String methods (push_str, push, etc.).
 */
final class PageBoundary
{
    public function __construct(
        /** @var int Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive) */
        public readonly int $byteStart,
        /** @var int Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive) */
        public readonly int $byteEnd,
        /** @var int Page number (1-indexed) */
        public readonly int $pageNumber,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Metadata for an individual page/slide/sheet.
 *
 * Holds per-page information including dimensions, content counts,
 * and visibility state (for presentations).
 */
final class PageInfo
{
    public function __construct(
        /** @var int Page number (1-indexed) */
        public readonly int $number,
        /**
         * Whether this page contains non-trivial vector graphics (paths, shapes, curves).
         *
         * Signals the presence of vector-drawn content such as charts, diagrams,
         * or geometric shapes (e.g., from Adobe InDesign, LaTeX TikZ). Such content
         * is invisible to `ExtractionResult.images` because it is not embedded as
         * raster XObjects. Set to `true` when the path count exceeds a heuristic
         * threshold, telling downstream consumers that they may want to rasterize
         * the page to capture this content.
         *
         * Only populated for PDFs.
         *
         * NOTE(review): the upstream description says `None` for other document
         * types, but this stub declares a non-nullable bool — confirm what value
         * non-PDF documents actually report.
         *
         * @var bool
         */
        public readonly bool $hasVectorGraphics,
        /** @var ?string Page title (usually for presentations) */
        public readonly ?string $title = null,
        /** @var ?array<float> Dimensions in points (PDF) or pixels (images): (width, height) */
        public readonly ?array $dimensions = null,
        /** @var ?int Number of images on this page */
        public readonly ?int $imageCount = null,
        /** @var ?int Number of tables on this page */
        public readonly ?int $tableCount = null,
        /** @var ?bool Whether this page is hidden (e.g., in presentations) */
        public readonly ?bool $hidden = null,
        /**
         * Whether this page is blank (no meaningful text, no images, no tables).
         *
         * A page counts as blank if it has fewer than 3 non-whitespace characters
         * and contains no tables or images. Useful for filtering out empty pages
         * in scanned documents or PDFs with blank separator pages.
         *
         * @var ?bool
         */
        public readonly ?bool $isBlank = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Content of a single page/slide.
 *
 * With page extraction enabled, documents are split into per-page content,
 * and the associated tables and images are mapped to each page.
 *
 * # Performance
 *
 * Arc-wrapped tables and images are used for memory efficiency:
 * - `Vec<Arc<Table>>` enables zero-copy sharing of table data
 * - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
 * - Exact JSON compatibility is maintained via custom Serialize/Deserialize
 *
 * This lowers memory overhead for documents with shared tables/images
 * by avoiding redundant copies during serialization.
 */
final class PageContent
{
    public function __construct(
        /** @var int Page number (1-indexed) */
        public readonly int $pageNumber,
        /** @var string Text content for this page */
        public readonly string $content,
        /**
         * Tables found on this page (uses Arc for memory efficiency).
         *
         * Serialized as Vec<Table> for JSON compatibility while keeping
         * Arc semantics in-memory for zero-copy sharing.
         *
         * @var array<Table>
         */
        public readonly array $tables,
        /**
         * Indices into `ExtractionResult.images` for images found on this page.
         *
         * Every value is a zero-based index into the top-level `images` collection.
         * Only populated when `extract_images = true` in the extraction config.
         *
         * @var array<int>
         */
        public readonly array $imageIndices,
        /**
         * Hierarchy information for the page (when hierarchy extraction is enabled).
         *
         * Holds text hierarchy levels (H1-H6) extracted from the page content.
         *
         * @var ?PageHierarchy
         */
        public readonly ?PageHierarchy $hierarchy = null,
        /**
         * Whether this page is blank (no meaningful text content).
         *
         * Determined during extraction based on text content analysis.
         * A page is blank if it has fewer than 3 non-whitespace characters
         * and contains no tables or images.
         *
         * @var ?bool
         */
        public readonly ?bool $isBlank = null,
        /**
         * Layout detection regions for this page (when layout detection is enabled).
         *
         * Holds detected layout regions with class, confidence, bounding box,
         * and area fraction. Only populated when layout detection is configured.
         *
         * @var ?array<LayoutRegion>
         */
        public readonly ?array $layoutRegions = null,
        /**
         * Speaker notes for this slide (PPTX only).
         *
         * Holds the text from the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
         * Only populated when the source is a PPTX file and notes are present.
         *
         * @var ?string
         */
        public readonly ?string $speakerNotes = null,
        /**
         * Name of the section this slide belongs to (PPTX only).
         *
         * PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
         * `ppt/presentation.xml`). Only populated when the source is a PPTX file and
         * the slide belongs to a named section.
         *
         * @var ?string
         */
        public readonly ?string $sectionName = null,
        /**
         * Sheet name for this page (XLSX/ODS only).
         *
         * Each spreadsheet sheet maps to one `PageContent` entry. This field carries
         * the sheet's display name as it appears in the workbook. It is null for all
         * non-spreadsheet formats and for sheets with an empty name.
         *
         * @var ?string
         */
        public readonly ?string $sheetName = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * A layout region detected on a page.
 *
 * With layout detection enabled, each page may carry layout regions that
 * identify different content types (text, pictures, tables, etc.)
 * along with confidence scores and spatial positions.
 */
final class LayoutRegion
{
    public function __construct(
        /** @var string Layout class name (e.g. "picture", "table", "text", "section_header"). */
        public readonly string $className,
        /** @var float Confidence score from the layout detection model (0.0 to 1.0). */
        public readonly float $confidence,
        /** @var BoundingBox Bounding box in document coordinate space. */
        public readonly BoundingBox $boundingBox,
        /** @var float Fraction of the page area covered by this region (0.0 to 1.0). */
        public readonly float $areaFraction,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Page hierarchy structure holding heading levels and block information.
 *
 * Used when PDF text hierarchy extraction is enabled. Holds hierarchical
 * blocks with heading levels (H1-H6) for semantic document structure.
 */
final class PageHierarchy
{
    public function __construct(
        /** @var int Number of hierarchy blocks on this page */
        public readonly int $blockCount,
        /** @var array<HierarchicalBlock> Hierarchical blocks with heading levels */
        public readonly array $blocks,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * A text block with an assigned hierarchy level.
 *
 * Represents a block of text carrying semantic heading information extracted
 * from font size clustering and hierarchical analysis.
 */
final class HierarchicalBlock
{
    public function __construct(
        /** @var string The text content of this block */
        public readonly string $text,
        /** @var float The font size of the text in this block */
        public readonly float $fontSize,
        /**
         * The hierarchy level of this block (H1-H6 or Body).
         *
         * Levels correspond to HTML heading tags:
         * - "h1": Top-level heading
         * - "h2": Secondary heading
         * - "h3": Tertiary heading
         * - "h4": Quaternary heading
         * - "h5": Quinary heading
         * - "h6": Senary heading
         * - "body": Body text (no heading level)
         *
         * @var string
         */
        public readonly string $level,
        /**
         * Bounding box information for the block.
         *
         * Holds coordinates as (left, top, right, bottom) in PDF units.
         *
         * @var ?array<float>
         */
        public readonly ?array $bbox = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * One changed cell within a table.
 *
 * Defined here (rather than only in `crate.diff`) so that `RevisionDelta` can
 * reference it unconditionally, without requiring the `diff` Cargo feature.
 * `crate.diff` re-exports this type verbatim.
 */
final class CellChange
{
    public function __construct(
        /** @var int Zero-based row index. */
        public readonly int $row,
        /** @var int Zero-based column index. */
        public readonly int $col,
        /** @var string Value before the change. */
        public readonly string $from,
        /** @var string Value after the change. */
        public readonly string $to,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * One tracked change embedded in a document.
 *
 * Populated by per-format extractors that understand change-tracking metadata
 * (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
 * extractor defaults to `ExtractionResult.revisions = None` until a
 * format-specific implementation is added.
 */
final class DocumentRevision
{
    public function __construct(
        /**
         * Format-specific revision identifier.
         *
         * For DOCX this is the value of the `w:id` attribute on the change
         * element (e.g. `"42"`). When the attribute is absent, a synthetic
         * fallback is generated (`"docx-ins-0"`, `"docx-del-3"`, …).
         *
         * @var string
         */
        public readonly string $revisionId,
        /** @var RevisionKind Semantic kind of this revision. */
        public readonly RevisionKind $kind,
        /** @var RevisionDelta The content changes that make up this revision. */
        public readonly RevisionDelta $delta,
        /** @var ?string Display name of the author who made this change, when available. */
        public readonly ?string $author = null,
        /**
         * ISO-8601 timestamp of the change, when available.
         *
         * Kept as a plain string so this type remains FFI-friendly and
         * unconditionally available without the `chrono` optional dep.
         * DOCX populates this from the `w:date` attribute (e.g.
         * `"2024-03-15T10:30:00Z"`).
         *
         * @var ?string
         */
        public readonly ?string $timestamp = null,
        /**
         * Best-effort document location for this revision.
         *
         * Resolution is format-dependent and the value may be null when the
         * location cannot be determined (e.g. changes inside table cells
         * before table-cell anchor support is added).
         *
         * @var ?RevisionAnchor
         */
        public readonly ?RevisionAnchor $anchor = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * The content changes that make up a single revision.
 *
 * For insertions and deletions, the `content` field carries the added/removed
 * lines as `DiffLine.Added` / `DiffLine.Removed` entries. For format
 * changes, `content` is empty — the property diff is left as a TODO for a
 * later enrichment pass.
 */
final class RevisionDelta
{
    public function __construct(
        /** @var array<DiffLine> Line-level content changes for this revision. */
        public readonly array $content,
        /** @var array<CellChange> Cell-level table changes for this revision. */
        public readonly array $tableChanges,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Extracted table structure.
 *
 * Represents a table that was detected in and extracted from a document
 * (PDF, image, etc.). Tables are converted to both structured cell data and
 * Markdown format.
 */
final class Table
{
    public function __construct(
        /** @var array<array<string>> Table cells as a 2D vector (rows × columns) */
        public readonly array $cells,
        /** @var string Markdown representation of the table */
        public readonly string $markdown,
        /** @var int Page number where the table was found (1-indexed) */
        public readonly int $pageNumber,
        /**
         * Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
         * Only populated for PDF-extracted tables when position data is available.
         *
         * @var ?BoundingBox
         */
        public readonly ?BoundingBox $boundingBox = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Individual table cell with content and optional styling.
 *
 * Extension point for future rich table support with cell-level metadata.
 */
final class TableCell
{
    public function __construct(
        /** @var string Cell content as text */
        public readonly string $content,
        /** @var int Row span (number of rows this cell spans) */
        public readonly int $rowSpan,
        /** @var int Column span (number of columns this cell spans) */
        public readonly int $colSpan,
        /** @var bool Whether this is a header cell */
        public readonly bool $isHeader,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * A URI extracted from a document.
 *
 * Represents any link, reference, or resource pointer found during extraction.
 * The `kind` field gives the semantic classification of the URI, and `label`
 * carries optional human-readable display text.
 */
final class ExtractedUri
{
    public function __construct(
        /** @var string The URL or path string. */
        public readonly string $url,
        /** @var UriKind Semantic classification of the URI. */
        public readonly UriKind $kind,
        /** @var ?string Optional display text / label for the link. */
        public readonly ?string $label = null,
        /** @var ?int Optional page number where the URI was found (1-indexed). */
        public readonly ?int $page = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Response of a MIME type detection.
 */
final class DetectResponse
{
    public function __construct(
        /** @var string Detected MIME type */
        public readonly string $mimeType,
        /** @var ?string Original filename (if provided) */
        public readonly ?string $filename = null,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Options that control how two `ExtractionResult` values are compared.
 */
final class DiffOptions
{
    public function __construct(
        /** @var bool Include metadata changes in the diff. Default: `true`. */
        public readonly bool $includeMetadata,
        /** @var bool Include embedded-children changes in the diff. Default: `true`. */
        public readonly bool $includeEmbedded,
        /**
         * Truncate content to this many characters before diffing.
         *
         * Handy for very large documents where only the first N characters matter.
         * Null means no truncation.
         *
         * @var ?int
         */
        public readonly ?int $maxContentChars = null,
    ) {}

    /**
     * Stub only: this PHP body always throws. The thrown message states that
     * the real implementation is provided by the native extension.
     *
     * @throws \RuntimeException Always, when this stub body is executed.
     */
    public static function default(): DiffOptions
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * The full diff between two `ExtractionResult` values.
 */
final class ExtractionDiff
{
    public function __construct(
        /**
         * Unified-diff hunks for the `content` field.
         *
         * Empty when the content is identical.
         *
         * @var array<DiffHunk>
         */
        public readonly array $contentDiff,
        /** @var array<Table> Tables present in `b` but not in `a` (by index position, excess right-side tables). */
        public readonly array $tablesAdded,
        /** @var array<Table> Tables present in `a` but not in `b` (by index position, excess left-side tables). */
        public readonly array $tablesRemoved,
        /** @var array<TableDiff> Cell-level changes for table pairs that share the same index and dimensions. */
        public readonly array $tablesChanged,
        /**
         * Metadata difference, encoded as a JSON object with three top-level keys:
         * `added` (keys present in `b` but not `a`), `removed` (keys present in `a`
         * but not `b`), and `changed` (keys whose values differ — each entry is
         * `{ "from": <value-in-a>, "to": <value-in-b> }`).
         *
         * This is NOT RFC 6902 JSON Patch — a flatter shape was chosen deliberately
         * to avoid pulling in a json-patch crate. If RFC 6902 semantics (with JSON
         * Pointer paths) are needed, feed `a.metadata` and `b.metadata` to your
         * preferred json-patch impl directly.
         *
         * @var string
         */
        public readonly string $metadataChanged,
        /** @var EmbeddedChanges Changes to embedded archive children. */
        public readonly EmbeddedChanges $embeddedChanges,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * One contiguous hunk of a unified diff.
 */
final class DiffHunk
{
    public function __construct(
        /** @var int Starting line number in the old content (0-indexed). */
        public readonly int $fromLine,
        /** @var int Number of lines from the old content in this hunk. */
        public readonly int $fromCount,
        /** @var int Starting line number in the new content (0-indexed). */
        public readonly int $toLine,
        /** @var int Number of lines from the new content in this hunk. */
        public readonly int $toCount,
        /** @var array<DiffLine> Lines that make up this hunk. */
        public readonly array $lines,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Cell-level changes for a pair of tables sharing the same index.
 */
final class TableDiff
{
    public function __construct(
        /** @var int Zero-based index of the table in both `a.tables` and `b.tables`. */
        public readonly int $fromIndex,
        /** @var int Zero-based index in `b.tables` (equal to `from_index` for same-dimension tables). */
        public readonly int $toIndex,
        /** @var array<CellChange> Cell-level changes within the table. */
        public readonly array $cellChanges,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Changes to embedded archive children between two results.
 */
final class EmbeddedChanges
{
    public function __construct(
        /** @var array<ArchiveEntry> Children present in `b` but not in `a` (matched by `path`). */
        public readonly array $added,
        /** @var array<ArchiveEntry> Children present in `a` but not in `b` (matched by `path`). */
        public readonly array $removed,
        /**
         * Children present in both but with differing content (matched by `path`).
         *
         * Every entry holds the diff of the nested `ExtractionResult`.
         *
         * @var array<EmbeddedDiff>
         */
        public readonly array $changed,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Diff for one embedded archive entry that appears in both results.
 */
final class EmbeddedDiff
{
    public function __construct(
        /** @var string Archive-relative path identifying this entry. */
        public readonly string $path,
        /** @var ExtractionDiff The recursive diff of the entry's extraction result. */
        public readonly ExtractionDiff $diff,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Preset configurations for common RAG use cases.
 *
 * A preset bundles chunk size, overlap, and embedding model into an
 * optimized configuration for a specific scenario.
 *
 * All string fields are owned `String` for FFI compatibility — instances
 * are safe to clone and pass across language boundaries.
 */
final class EmbeddingPreset
{
    public function __construct(
        /** @var string */
        public readonly string $name,
        /** @var int */
        public readonly int $chunkSize,
        /** @var int */
        public readonly int $overlap,
        /** @var string HuggingFace repository name for the model. */
        public readonly string $modelRepo,
        /** @var string Pooling strategy: "cls" or "mean". */
        public readonly string $pooling,
        /** @var string Path to the ONNX model file within the repo. */
        public readonly string $modelFile,
        /** @var int */
        public readonly int $dimensions,
        /** @var string */
        public readonly string $description,
    ) {}
}
|
|||
|
|
|
|||
|
|
/**
 * Parameters that apply only to the YAKE algorithm.
 */
final class YakeParams
{
    /**
     * @param int $windowSize Window size for co-occurrence analysis (default: 2); it sets the
     *                        context window used when computing co-occurrence statistics.
     */
    public function __construct(
        public readonly int $windowSize,
    ) {
    }

    /**
     * Stub only: the native extension provides the real implementation.
     */
    public static function default(): YakeParams
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Parameters that apply only to the RAKE algorithm.
 */
final class RakeParams
{
    /**
     * @param int $minWordLength     Minimum word length to consider (default: 1).
     * @param int $maxWordsPerPhrase Maximum words in a keyword phrase (default: 3).
     */
    public function __construct(
        public readonly int $minWordLength,
        public readonly int $maxWordsPerPhrase,
    ) {
    }

    /**
     * Stub only: the native extension provides the real implementation.
     */
    public static function default(): RakeParams
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Configuration for keyword extraction.
 */
final class KeywordConfig
{
    /**
     * @param KeywordAlgorithm $algorithm   Algorithm to use for extraction.
     * @param int              $maxKeywords Maximum number of keywords to extract (default: 10).
     * @param float            $minScore    Minimum score threshold (0.0-1.0, default: 0.0). Keywords scoring
     *                                      below it are filtered out. Score ranges differ between algorithms.
     * @param array<int>       $ngramRange  N-gram range as (min, max): (1, 1) = unigrams only,
     *                                      (1, 2) = unigrams and bigrams,
     *                                      (1, 3) = unigrams, bigrams, and trigrams (default).
     * @param ?string          $language    Language code for stopword filtering (e.g., "en", "de", "fr");
     *                                      when null, no stopword filtering is applied.
     * @param ?YakeParams      $yakeParams  YAKE-specific tuning parameters.
     * @param ?RakeParams      $rakeParams  RAKE-specific tuning parameters.
     */
    public function __construct(
        public readonly KeywordAlgorithm $algorithm,
        public readonly int $maxKeywords,
        public readonly float $minScore,
        public readonly array $ngramRange,
        public readonly ?string $language = null,
        public readonly ?YakeParams $yakeParams = null,
        public readonly ?RakeParams $rakeParams = null,
    ) {
    }

    /**
     * Stub only: the native extension provides the real implementation.
     */
    public static function default(): KeywordConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * A single extracted keyword together with its metadata.
 */
final class Keyword
{
    /**
     * @param string           $text      The keyword text.
     * @param float            $score     Relevance score (higher is better, algorithm-specific range).
     * @param KeywordAlgorithm $algorithm Algorithm that extracted this keyword.
     * @param ?array<int>      $positions Optional positions where the keyword appears in the text
     *                                    (character offsets).
     */
    public function __construct(
        public readonly string $text,
        public readonly float $score,
        public readonly KeywordAlgorithm $algorithm,
        public readonly ?array $positions = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Settings for the PaddleOCR backend.
 *
 * Configures PaddleOCR text detection and recognition with multi-language support.
 * Configuration follows a builder pattern through the `with*` methods. Those methods
 * and `default()` are stubs in this file; the native extension supplies the real
 * implementations.
 */
final class PaddleOcrConfig
{
    /**
     * @param string  $language             Language code (e.g., "en", "ch", "jpn", "kor", "deu", "fra").
     * @param bool    $useAngleCls          Enable angle classification for rotated text (default: false).
     *                                      Can misfire on short text regions, rotating crops incorrectly
     *                                      before recognition.
     * @param bool    $enableTableDetection Enable table structure detection (default: false).
     * @param float   $detDbThresh          DB threshold for text detection (default: 0.3). Range: 0.0-1.0;
     *                                      higher values require more confident detections.
     * @param float   $detDbBoxThresh       Box threshold for text bounding box refinement (default: 0.5).
     *                                      Range: 0.0-1.0.
     * @param float   $detDbUnclipRatio     Unclip ratio for expanding text bounding boxes (default: 1.6);
     *                                      controls the expansion of detected text regions.
     * @param int     $detLimitSideLen      Maximum side length for the detection image (default: 960).
     *                                      Larger images may be resized to this limit for faster inference.
     * @param int     $recBatchNum          Batch size for recognition inference (default: 6): the number of
     *                                      text regions processed simultaneously.
     * @param int     $padding              Padding in pixels added around the image before detection
     *                                      (default: 10). Large values can include surrounding content
     *                                      like table gridlines.
     * @param float   $dropScore            Minimum recognition confidence score for text lines (default: 0.5).
     *                                      Regions below this threshold are discarded. Matches PaddleOCR
     *                                      Python's `drop_score` parameter. Range: 0.0-1.0.
     * @param string  $modelTier            Model tier controlling the size/accuracy trade-off:
     *                                      `"mobile"` (default) — lightweight models (~4.5MB detection,
     *                                      ~16.5MB recognition), fast download and inference;
     *                                      `"server"` — large, high-accuracy models (~88MB detection,
     *                                      ~84MB recognition), best for GPU or complex documents.
     * @param ?string $cacheDir             Optional custom cache directory for model files.
     */
    public function __construct(
        public readonly string $language,
        public readonly bool $useAngleCls,
        public readonly bool $enableTableDetection,
        public readonly float $detDbThresh,
        public readonly float $detDbBoxThresh,
        public readonly float $detDbUnclipRatio,
        public readonly int $detLimitSideLen,
        public readonly int $recBatchNum,
        public readonly int $padding,
        public readonly float $dropScore,
        public readonly string $modelTier,
        public readonly ?string $cacheDir = null,
    ) {
    }

    /** Builder-style counterpart of `$cacheDir` (stub). */
    public function withCacheDir(string $path): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style counterpart of `$enableTableDetection` (stub). */
    public function withTableDetection(bool $enable): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style counterpart of `$useAngleCls` (stub). */
    public function withAngleCls(bool $enable): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style counterpart of `$detDbThresh` (stub). */
    public function withDetDbThresh(float $threshold): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style counterpart of `$detDbBoxThresh` (stub). */
    public function withDetDbBoxThresh(float $threshold): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style counterpart of `$detDbUnclipRatio` (stub). */
    public function withDetDbUnclipRatio(float $ratio): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style counterpart of `$detLimitSideLen` (stub). */
    public function withDetLimitSideLen(int $length): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style counterpart of `$recBatchNum` (stub). */
    public function withRecBatchNum(int $batch_size): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style counterpart of `$dropScore` (stub). */
    public function withDropScore(float $score): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style counterpart of `$padding` (stub). */
    public function withPadding(int $padding): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /** Builder-style counterpart of `$modelTier` (stub). */
    public function withModelTier(string $tier): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }

    /**
     * Stub only: the native extension provides the real implementation.
     */
    public static function default(): PaddleOcrConfig
    {
        throw new \RuntimeException('Not implemented — provided by the native extension.');
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Bundle of paths to every model needed for OCR (kept for backward compatibility).
 */
final class ModelPaths
{
    /**
     * @param string $detModel Path to the detection model directory.
     * @param string $clsModel Path to the classification model directory.
     * @param string $recModel Path to the recognition model directory.
     * @param string $dictFile Path to the character dictionary file.
     */
    public function __construct(
        public readonly string $detModel,
        public readonly string $clsModel,
        public readonly string $recModel,
        public readonly string $dictFile,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Outcome of document orientation detection.
 */
final class OrientationResult
{
    /**
     * @param int   $degrees    Detected orientation in degrees (0, 90, 180, or 270).
     * @param float $confidence Confidence score (0.0-1.0).
     */
    public function __construct(
        public readonly int $degrees,
        public readonly float $confidence,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Bounding box in original image coordinates.
 *
 * (x1, y1) is the top-left corner and (x2, y2) the bottom-right corner.
 */
final class BBox
{
    /**
     * @param float $x1
     * @param float $y1
     * @param float $x2
     * @param float $y2
     */
    public function __construct(
        public readonly float $x1,
        public readonly float $y1,
        public readonly float $x2,
        public readonly float $y2,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * One layout detection result.
 */
final class LayoutDetection
{
    /**
     * @param LayoutClass $className
     * @param float       $confidence
     * @param BBox        $bbox
     */
    public function __construct(
        public readonly LayoutClass $className,
        public readonly float $confidence,
        public readonly BBox $bbox,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Pre-computed table markdown for one table detection region.
 *
 * Produced by the TATR-based table structure recognizer and surfaced as part of
 * layout-aware OCR results. Upstream keeps the struct under `layout-types`
 * (pure Rust) so that consumers who do not enable `layout-detection` (ORT) can
 * still reference the type in their own code.
 */
final class RecognizedTable
{
    /**
     * @param BBox                 $detectionBbox Detection bbox this table corresponds to (for matching).
     * @param array<array<string>> $cells         Table cells as a 2D vector (rows × columns).
     * @param string               $markdown      Rendered markdown table.
     */
    public function __construct(
        public readonly BBox $detectionBbox,
        public readonly array $cells,
        public readonly string $markdown,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Page-level detection result: every detection plus the page metadata.
 */
final class DetectionResult
{
    /**
     * @param int                    $pageWidth
     * @param int                    $pageHeight
     * @param array<LayoutDetection> $detections
     */
    public function __construct(
        public readonly int $pageWidth,
        public readonly int $pageHeight,
        public readonly array $detections,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Descriptor of an embedded file extracted from the PDF name tree.
 */
final class EmbeddedFile
{
    /**
     * @param string  $name           The filename as stored in the PDF name tree.
     * @param string  $data           Raw file bytes from the embedded stream (already decompressed by lopdf).
     * @param int     $compressedSize Compressed byte count of the original stream (before decompression).
     *                                Callers use it to compute the decompression ratio and detect
     *                                zip-bomb-style attacks that embed a tiny compressed stream expanding
     *                                to gigabytes of data.
     * @param ?string $mimeType       MIME type if specified in the filespec, otherwise null.
     */
    public function __construct(
        public readonly string $name,
        public readonly string $data,
        public readonly int $compressedSize,
        public readonly ?string $mimeType = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Metadata that exists only for PDF documents.
 *
 * Holds the PDF-specific fields that are not part of the common `Metadata`
 * structure. Common fields such as title, authors, keywords, and dates live at
 * the `Metadata` level.
 */
final class PdfMetadata
{
    /**
     * @param ?string $pdfVersion  PDF version (e.g., "1.7", "2.0").
     * @param ?string $producer    PDF producer (application that created the PDF).
     * @param ?bool   $isEncrypted Whether the PDF is encrypted/password-protected.
     * @param ?int    $width       First page width in points (1/72 inch).
     * @param ?int    $height      First page height in points (1/72 inch).
     * @param ?int    $pageCount   Total number of pages in the PDF document.
     */
    public function __construct(
        public readonly ?string $pdfVersion = null,
        public readonly ?string $producer = null,
        public readonly ?bool $isEncrypted = null,
        public readonly ?int $width = null,
        public readonly ?int $height = null,
        public readonly ?int $pageCount = null,
    ) {
    }
}
|
|||
|
|
|
|||
|
|
/**
 * Execution providers that can be selected for ONNX inference.
 *
 * String-backed; every backing value equals its case name.
 */
enum ExecutionProviderType: string
{
    case Auto     = 'Auto';
    case Cpu      = 'Cpu';
    case CoreMl   = 'CoreMl';
    case Cuda     = 'Cuda';
    case TensorRt = 'TensorRt';
}
|
|||
|
|
|
|||
|
|
/**
 * Output format identifiers.
 *
 * String-backed; every backing value equals its case name.
 */
enum OutputFormat: string
{
    case Plain      = 'Plain';
    case Markdown   = 'Markdown';
    case Djot       = 'Djot';
    case Html       = 'Html';
    case Json       = 'Json';
    case Structured = 'Structured';
    case Custom     = 'Custom';
}
|
|||
|
|
|
|||
|
|
/**
 * HTML theme identifiers.
 *
 * String-backed; every backing value equals its case name.
 */
enum HtmlTheme: string
{
    case Default  = 'Default';
    case GitHub   = 'GitHub';
    case Dark     = 'Dark';
    case Light    = 'Light';
    case Unstyled = 'Unstyled';
}
|
|||
|
|
|
|||
|
|
/**
 * Table model identifiers.
 *
 * String-backed with snake_case backing values.
 */
enum TableModel: string
{
    case Tatr           = 'tatr';
    case SlanetWired    = 'slanet_wired';
    case SlanetWireless = 'slanet_wireless';
    case SlanetPlus     = 'slanet_plus';
    case SlanetAuto     = 'slanet_auto';
    case Disabled       = 'disabled';
}
|
|||
|
|
|
|||
|
|
/**
 * Chunker type identifiers.
 *
 * String-backed; every backing value equals its case name.
 */
enum ChunkerType: string
{
    case Text     = 'Text';
    case Markdown = 'Markdown';
    case Yaml     = 'Yaml';
    case Semantic = 'Semantic';
}
|
|||
|
|
|
|||
|
|
/**
 * Describes how chunk size is measured.
 *
 * The default is `Characters` (Unicode character count). With token-based
 * sizing, chunks are sized by token count according to the specified tokenizer.
 *
 * Token-based sizing relies on HuggingFace tokenizers loaded at runtime. Any
 * tokenizer available on HuggingFace Hub can be used, including
 * OpenAI-compatible ones (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
 *
 * This stub declares the class without any members.
 */
final class ChunkSizing
{
}
|
|||
|
|
|
|||
|
|
/**
 * Embedding model types that Kreuzberg supports.
 *
 * This stub declares the class without any members.
 */
final class EmbeddingModelType
{
}
|
|||
|
|
|
|||
|
|
/**
 * Code content mode identifiers.
 *
 * String-backed with lowercase backing values.
 */
enum CodeContentMode: string
{
    case Chunks    = 'chunks';
    case Raw       = 'raw';
    case Structure = 'structure';
}
|
|||
|
|
|
|||
|
|
/**
 * List type identifiers.
 *
 * String-backed; every backing value equals its case name.
 */
enum ListType: string
{
    case Bullet   = 'Bullet';
    case Numbered = 'Numbered';
    case Lettered = 'Lettered';
    case Indented = 'Indented';
}
|
|||
|
|
|
|||
|
|
/**
 * OCR backend identifiers.
 *
 * String-backed; every backing value equals its case name.
 */
enum OcrBackendType: string
{
    case Tesseract = 'Tesseract';
    case EasyOCR   = 'EasyOCR';
    case PaddleOCR = 'PaddleOCR';
    case Custom    = 'Custom';
}
|
|||
|
|
|
|||
|
|
/**
 * Processing stage identifiers.
 *
 * String-backed; every backing value equals its case name.
 */
enum ProcessingStage: string
{
    case Early  = 'Early';
    case Middle = 'Middle';
    case Late   = 'Late';
}
|
|||
|
|
|
|||
|
|
/**
 * Reduction level identifiers.
 *
 * String-backed; every backing value equals its case name.
 */
enum ReductionLevel: string
{
    case Off        = 'Off';
    case Light      = 'Light';
    case Moderate   = 'Moderate';
    case Aggressive = 'Aggressive';
    case Maximum    = 'Maximum';
}
|
|||
|
|
|
|||
|
|
/**
 * PDF annotation type identifiers.
 *
 * String-backed with snake_case backing values.
 */
enum PdfAnnotationType: string
{
    case Text      = 'text';
    case Highlight = 'highlight';
    case Link      = 'link';
    case Stamp     = 'stamp';
    case Underline = 'underline';
    case StrikeOut = 'strike_out';
    case Other     = 'other';
}
|
|||
|
|
|
|||
|
|
/**
 * Block-level type identifiers.
 *
 * String-backed with snake_case backing values.
 */
enum BlockType: string
{
    case Paragraph             = 'paragraph';
    case Heading               = 'heading';
    case Blockquote            = 'blockquote';
    case CodeBlock             = 'code_block';
    case ListItem              = 'list_item';
    case OrderedList           = 'ordered_list';
    case BulletList            = 'bullet_list';
    case TaskList              = 'task_list';
    case DefinitionList        = 'definition_list';
    case DefinitionTerm        = 'definition_term';
    case DefinitionDescription = 'definition_description';
    case Div                   = 'div';
    case Section               = 'section';
    case ThematicBreak         = 'thematic_break';
    case RawBlock              = 'raw_block';
    case MathDisplay           = 'math_display';
}
|
|||
|
|
|
|||
|
|
/**
 * Inline-level type identifiers.
 *
 * String-backed with snake_case backing values.
 */
enum InlineType: string
{
    case Text        = 'text';
    case Strong      = 'strong';
    case Emphasis    = 'emphasis';
    case Highlight   = 'highlight';
    case Subscript   = 'subscript';
    case Superscript = 'superscript';
    case Insert      = 'insert';
    case Delete      = 'delete';
    case Code        = 'code';
    case Link        = 'link';
    case Image       = 'image';
    case Span        = 'span';
    case Math        = 'math';
    case RawInline   = 'raw_inline';
    case FootnoteRef = 'footnote_ref';
    case Symbol      = 'symbol';
}
|
|||
|
|
|
|||
|
|
/**
 * Relationship kind identifiers.
 *
 * String-backed with snake_case backing values.
 */
enum RelationshipKind: string
{
    case FootnoteReference = 'footnote_reference';
    case CitationReference = 'citation_reference';
    case InternalLink      = 'internal_link';
    case Caption           = 'caption';
    case Label             = 'label';
    case TocEntry          = 'toc_entry';
    case CrossReference    = 'cross_reference';
}
|
|||
|
|
|
|||
|
|
/**
 * Content layer identifiers.
 *
 * String-backed with lowercase backing values.
 */
enum ContentLayer: string
{
    case Body     = 'body';
    case Header   = 'header';
    case Footer   = 'footer';
    case Footnote = 'footnote';
}
|
|||
|
|
|
|||
|
|
/**
 * Tagged enum for node content, where each variant carries only type-specific data.
 *
 * Upstream uses `#[serde(tag = "node_type")]` so the tag does not collide with the
 * "type" keyword in the Go/Java/TypeScript bindings.
 *
 * This stub declares the class without any members.
 */
final class NodeContent
{
}
|
|||
|
|
|
|||
|
|
/**
 * Kinds of inline text annotation.
 *
 * This stub declares the class without any members.
 */
final class AnnotationKind
{
}
|
|||
|
|
|
|||
|
|
/**
 * Extraction method identifiers.
 *
 * String-backed with lowercase backing values.
 */
enum ExtractionMethod: string
{
    case Native = 'native';
    case Ocr    = 'ocr';
    case Mixed  = 'mixed';
}
|
|||
|
|
|
|||
|
|
/**
 * Chunk type identifiers.
 *
 * String-backed with snake_case backing values.
 */
enum ChunkType: string
{
    case Heading         = 'heading';
    case PartyList       = 'party_list';
    case Definitions     = 'definitions';
    case OperativeClause = 'operative_clause';
    case SignatureBlock  = 'signature_block';
    case Schedule        = 'schedule';
    case TableLike       = 'table_like';
    case Formula         = 'formula';
    case CodeBlock       = 'code_block';
    case Image           = 'image';
    case OrgChart        = 'org_chart';
    case Diagram         = 'diagram';
    case Unknown         = 'unknown';
}
|
|||
|
|
|
|||
|
|
/**
 * Image kind identifiers.
 *
 * String-backed with snake_case backing values.
 */
enum ImageKind: string
{
    case Photograph   = 'photograph';
    case Diagram      = 'diagram';
    case Chart        = 'chart';
    case Drawing      = 'drawing';
    case TextBlock    = 'text_block';
    case Decoration   = 'decoration';
    case Logo         = 'logo';
    case Icon         = 'icon';
    case TileFragment = 'tile_fragment';
    case Mask         = 'mask';
    case PageRaster   = 'page_raster';
    case Unknown      = 'unknown';
}
|
|||
|
|
|
|||
|
|
/**
 * Result format identifiers.
 *
 * String-backed with snake_case backing values.
 */
enum ResultFormat: string
{
    case Unified      = 'unified';
    case ElementBased = 'element_based';
}
|
|||
|
|
|
|||
|
|
/**
 * Element type identifiers.
 *
 * String-backed with snake_case backing values.
 */
enum ElementType: string
{
    case Title         = 'title';
    case NarrativeText = 'narrative_text';
    case Heading       = 'heading';
    case ListItem      = 'list_item';
    case Table         = 'table';
    case Image         = 'image';
    case PageBreak     = 'page_break';
    case CodeBlock     = 'code_block';
    case BlockQuote    = 'block_quote';
    case Footer        = 'footer';
    case Header        = 'header';
}
|
|||
|
|
|
|||
|
|
/**
 * Format-specific metadata, modelled upstream as a discriminated union.
 *
 * An extraction result can carry only one format type, which keeps the metadata
 * type-safe and free of nested optionals.
 *
 * This stub declares the class without any members.
 */
final class FormatMetadata
{
}
|
|||
|
|
|
|||
|
|
/**
 * Text direction identifiers.
 *
 * String-backed; the backing values are the short forms `ltr`, `rtl`, and `auto`.
 */
enum TextDirection: string
{
    case LeftToRight = 'ltr';
    case RightToLeft = 'rtl';
    case Auto        = 'auto';
}
|
|||
|
|
|
|||
|
|
/**
 * Link type identifiers.
 *
 * String-backed; every backing value equals its case name.
 */
enum LinkType: string
{
    case Anchor   = 'Anchor';
    case Internal = 'Internal';
    case External = 'External';
    case Email    = 'Email';
    case Phone    = 'Phone';
    case Other    = 'Other';
}
|
|||
|
|
|
|||
|
|
/**
 * Image type identifiers.
 *
 * String-backed. NOTE(review): the backing values mix kebab-case (`data-uri`,
 * `inline-svg`) with PascalCase (`External`, `Relative`); presumably this mirrors
 * the native extension's serialisation — confirm before normalising.
 */
enum ImageType: string
{
    case DataUri   = 'data-uri';
    case InlineSvg = 'inline-svg';
    case External  = 'External';
    case Relative  = 'Relative';
}
|
|||
|
|
|
|||
|
|
/**
 * Structured data type identifiers.
 *
 * String-backed. NOTE(review): the backing values are cased inconsistently
 * (`json-ld`, `Microdata`, `rdfa`); presumably this mirrors the native
 * extension's serialisation — confirm before normalising.
 */
enum StructuredDataType: string
{
    case JsonLd    = 'json-ld';
    case Microdata = 'Microdata';
    case RDFa      = 'rdfa';
}
|
|||
|
|
|
|||
|
|
/**
 * Bounding geometry of an OCR element.
 *
 * Covers both axis-aligned rectangles (from Tesseract) and 4-point
 * quadrilaterals (from PaddleOCR and rotated text detection).
 *
 * This stub declares the class without any members.
 */
final class OcrBoundingGeometry
{
}
|
|||
|
|
|
|||
|
|
/**
 * OCR element level identifiers.
 *
 * String-backed with lowercase backing values.
 */
enum OcrElementLevel: string
{
    case Word  = 'word';
    case Line  = 'line';
    case Block = 'block';
    case Page  = 'page';
}
|
|||
|
|
|
|||
|
|
/**
 * Page unit type identifiers.
 *
 * String-backed with lowercase backing values.
 */
enum PageUnitType: string
{
    case Page  = 'page';
    case Slide = 'slide';
    case Sheet = 'sheet';
}
|
|||
|
|
|
|||
|
|
/**
 * One line of a unified-diff hunk.
 *
 * Upstream defines it outside `crate.diff` so that `RevisionDelta` can reference
 * it unconditionally, without requiring the `diff` Cargo feature; `crate.diff`
 * re-exports the type verbatim.
 *
 * This stub declares the class without any members.
 */
final class DiffLine
{
}
|
|||
|
|
|
|||
|
|
/**
 * Revision kind identifiers.
 *
 * String-backed with snake_case backing values.
 */
enum RevisionKind: string
{
    case Insertion    = 'insertion';
    case Deletion     = 'deletion';
    case FormatChange = 'format_change';
    case Comment      = 'comment';
}
|
|||
|
|
|
|||
|
|
/**
 * Best-effort location of a revision within the document.
 *
 * This stub declares the class without any members.
 */
final class RevisionAnchor
{
}
|
|||
|
|
|
|||
|
|
/**
 * URI kind identifiers.
 *
 * String-backed with lowercase backing values.
 */
enum UriKind: string
{
    case Hyperlink = 'hyperlink';
    case Image     = 'image';
    case Anchor    = 'anchor';
    case Citation  = 'citation';
    case Reference = 'reference';
    case Email     = 'email';
}
|
|||
|
|
|
|||
|
|
/**
 * Keyword extraction algorithms.
 *
 * String-backed; every backing value equals its case name.
 */
enum KeywordAlgorithm: string
{
    case Yake = 'Yake';
    case Rake = 'Rake';
}
|
|||
|
|
|
|||
|
|
/**
 * PSM mode identifiers.
 *
 * String-backed; every backing value equals its case name.
 */
enum PSMMode: string
{
    case OsdOnly             = 'OsdOnly';
    case AutoOsd             = 'AutoOsd';
    case AutoOnly            = 'AutoOnly';
    case Auto                = 'Auto';
    case SingleColumn        = 'SingleColumn';
    case SingleBlockVertical = 'SingleBlockVertical';
    case SingleBlock         = 'SingleBlock';
    case SingleLine          = 'SingleLine';
    case SingleWord          = 'SingleWord';
    case CircleWord          = 'CircleWord';
    case SingleChar          = 'SingleChar';
}
|
|||
|
|
|
|||
|
|
/**
 * Language and script identifiers for PaddleOCR.
 *
 * String-backed; every backing value equals its case name.
 */
enum PaddleLanguage: string
{
    case English            = 'English';
    case Chinese            = 'Chinese';
    case Japanese           = 'Japanese';
    case Korean             = 'Korean';
    case German             = 'German';
    case French             = 'French';
    case Latin              = 'Latin';
    case Cyrillic           = 'Cyrillic';
    case TraditionalChinese = 'TraditionalChinese';
    case Thai               = 'Thai';
    case Greek              = 'Greek';
    case EastSlavic         = 'EastSlavic';
    case Arabic             = 'Arabic';
    case Devanagari         = 'Devanagari';
    case Tamil              = 'Tamil';
    case Telugu             = 'Telugu';
}
|
|||
|
|
|
|||
|
|
/**
 * Layout class identifiers assigned to layout detections.
 *
 * String-backed with snake_case backing values.
 */
enum LayoutClass: string
{
    case Caption            = 'caption';
    case Footnote           = 'footnote';
    case Formula            = 'formula';
    case ListItem           = 'list_item';
    case PageFooter         = 'page_footer';
    case PageHeader         = 'page_header';
    case Picture            = 'picture';
    case SectionHeader      = 'section_header';
    case Table              = 'table';
    case Text               = 'text';
    case Title              = 'title';
    case DocumentIndex      = 'document_index';
    case Code               = 'code';
    case CheckboxSelected   = 'checkbox_selected';
    case CheckboxUnselected = 'checkbox_unselected';
    case Form               = 'form';
    case KeyValueRegion     = 'key_value_region';
}
|
|||
|
|
|
|||
|
|
class KreuzbergApi
|
|||
|
|
{
|
|||
|
|
public static function extractBytes(string $content, string $mime_type, \Kreuzberg\ExtractionConfig $config): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
public static function extractFile(string $path, ?string $mime_type = null, ?\Kreuzberg\ExtractionConfig $config = null): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
public static function extractFileSync(string $path, ?string $mime_type = null, ?\Kreuzberg\ExtractionConfig $config = null): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
public static function extractBytesSync(string $content, string $mime_type, \Kreuzberg\ExtractionConfig $config): \Kreuzberg\ExtractionResult { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
/**
|
|||
|
|
* @param array<\Kreuzberg\BatchFileItem> $items
|
|||
|
|
* @param \Kreuzberg\ExtractionConfig $config
|
|||
|
|
* @return array<\Kreuzberg\ExtractionResult>
|
|||
|
|
*/
|
|||
|
|
public static function batchExtractFilesSync(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
/**
|
|||
|
|
* @param array<\Kreuzberg\BatchBytesItem> $items
|
|||
|
|
* @param \Kreuzberg\ExtractionConfig $config
|
|||
|
|
* @return array<\Kreuzberg\ExtractionResult>
|
|||
|
|
*/
|
|||
|
|
public static function batchExtractBytesSync(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
/**
|
|||
|
|
* @param array<\Kreuzberg\BatchFileItem> $items
|
|||
|
|
* @param \Kreuzberg\ExtractionConfig $config
|
|||
|
|
* @return array<\Kreuzberg\ExtractionResult>
|
|||
|
|
*/
|
|||
|
|
public static function batchExtractFiles(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
/**
|
|||
|
|
* @param array<\Kreuzberg\BatchBytesItem> $items
|
|||
|
|
* @param \Kreuzberg\ExtractionConfig $config
|
|||
|
|
* @return array<\Kreuzberg\ExtractionResult>
|
|||
|
|
*/
|
|||
|
|
public static function batchExtractBytes(array $items, \Kreuzberg\ExtractionConfig $config): array { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
public static function detectMimeTypeFromBytes(string $content): string { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
/**
|
|||
|
|
* @param string $mime_type
|
|||
|
|
* @return array<string>
|
|||
|
|
*/
|
|||
|
|
public static function getExtensionsForMime(string $mime_type): array { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
public static function clearEmbeddingBackends(): void { }
|
|||
|
|
/**
|
|||
|
|
* @return array<string>
|
|||
|
|
*/
|
|||
|
|
public static function listEmbeddingBackends(): array { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
/**
|
|||
|
|
* @return array<string>
|
|||
|
|
*/
|
|||
|
|
public static function listDocumentExtractors(): array { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
public static function clearDocumentExtractors(): void { }
|
|||
|
|
/**
|
|||
|
|
* @return array<string>
|
|||
|
|
*/
|
|||
|
|
public static function listOcrBackends(): array { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
public static function clearOcrBackends(): void { }
|
|||
|
|
/**
|
|||
|
|
* @return array<string>
|
|||
|
|
*/
|
|||
|
|
public static function listPostProcessors(): array { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
public static function clearPostProcessors(): void { }
|
|||
|
|
/**
|
|||
|
|
* @return array<string>
|
|||
|
|
*/
|
|||
|
|
public static function listRenderers(): array { throw new \RuntimeException('Not implemented.'); }
|
|||
|
|
public static function clearRenderers(): void { }
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably returns the names of the registered validators —
 * confirm against the extension.
 *
 * @return array<string>
 *
 * @throws \RuntimeException Always, if this placeholder body is ever executed.
 */
public static function listValidators(): array
{
    throw new \RuntimeException('Not implemented.');
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably removes every registered validator — confirm
 * against the extension.
 */
public static function clearValidators(): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably diffs extraction result $a against $b under the
 * given options — confirm against the extension.
 *
 * @throws \RuntimeException Always, if this placeholder body is ever executed.
 */
public static function compare(
    \Kreuzberg\ExtractionResult $a,
    \Kreuzberg\ExtractionResult $b,
    \Kreuzberg\DiffOptions $opts
): \Kreuzberg\ExtractionDiff {
    throw new \RuntimeException('Not implemented.');
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably yields one embedding vector per input text; how the
 * "async" variant differs from embedTexts() is not visible here — confirm
 * against the extension.
 *
 * @param array<string> $texts
 * @param \Kreuzberg\EmbeddingConfig $config
 *
 * @return array<array<float>>
 *
 * @throws \RuntimeException Always, if this placeholder body is ever executed.
 */
public static function embedTextsAsync(
    array $texts,
    \Kreuzberg\EmbeddingConfig $config
): array {
    throw new \RuntimeException('Not implemented.');
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * Both $dpi and $password are optional and default to null.
 *
 * NOTE(review): presumably $pdf_bytes is the raw PDF document and the returned
 * string holds PNG image bytes; whether $page_index is zero-based is not
 * visible here — confirm against the extension.
 *
 * @throws \RuntimeException Always, if this placeholder body is ever executed.
 */
public static function renderPdfPageToPng(
    string $pdf_bytes,
    int $page_index,
    ?int $dpi = null,
    ?string $password = null
): string {
    throw new \RuntimeException('Not implemented.');
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably $check_exists controls whether $path must exist on
 * disk before detection — confirm against the extension.
 *
 * @throws \RuntimeException Always, if this placeholder body is ever executed.
 */
public static function detectMimeType(string $path, bool $check_exists): string
{
    throw new \RuntimeException('Not implemented.');
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably yields one embedding vector per input text —
 * confirm against the extension.
 *
 * @param array<string> $texts
 * @param \Kreuzberg\EmbeddingConfig $config
 *
 * @return array<array<float>>
 *
 * @throws \RuntimeException Always, if this placeholder body is ever executed.
 */
public static function embedTexts(
    array $texts,
    \Kreuzberg\EmbeddingConfig $config
): array {
    throw new \RuntimeException('Not implemented.');
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * The declared return type is nullable.
 *
 * NOTE(review): presumably null means no preset is known under $name —
 * confirm against the extension.
 *
 * @throws \RuntimeException Always, if this placeholder body is ever executed.
 */
public static function getEmbeddingPreset(string $name): ?\Kreuzberg\EmbeddingPreset
{
    throw new \RuntimeException('Not implemented.');
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably returns the names of the available embedding
 * presets — confirm against the extension.
 *
 * @return array<string>
 *
 * @throws \RuntimeException Always, if this placeholder body is ever executed.
 */
public static function listEmbeddingPresets(): array
{
    throw new \RuntimeException('Not implemented.');
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably adds $backend to the set of OCR backends —
 * confirm against the extension.
 */
public static function registerOcrBackend(\Kreuzberg\OcrBackend $backend): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably removes the OCR backend registered under $name —
 * confirm against the extension.
 */
public static function unregisterOcrBackend(string $name): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
// clearOcrBackends() is already declared earlier in this class with the
// identical signature `public static function clearOcrBackends(): void`.
// Declaring a method twice in one class is a compile-time "Cannot redeclare"
// fatal error in PHP, so the redundant second declaration is dropped here.
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably adds $backend to the set of post-processors —
 * confirm against the extension.
 */
public static function registerPostProcessor(\Kreuzberg\PostProcessor $backend): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably removes the post-processor registered under $name —
 * confirm against the extension.
 */
public static function unregisterPostProcessor(string $name): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
// clearPostProcessors() is already declared earlier in this class with the
// identical signature `public static function clearPostProcessors(): void`.
// Declaring a method twice in one class is a compile-time "Cannot redeclare"
// fatal error in PHP, so the redundant second declaration is dropped here.
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably adds $backend to the set of validators — confirm
 * against the extension.
 */
public static function registerValidator(\Kreuzberg\Validator $backend): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably removes the validator registered under $name —
 * confirm against the extension.
 */
public static function unregisterValidator(string $name): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
// clearValidators() is already declared earlier in this class with the
// identical signature `public static function clearValidators(): void`.
// Declaring a method twice in one class is a compile-time "Cannot redeclare"
// fatal error in PHP, so the redundant second declaration is dropped here.
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably adds $backend to the set of embedding backends —
 * confirm against the extension.
 */
public static function registerEmbeddingBackend(\Kreuzberg\EmbeddingBackend $backend): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably removes the embedding backend registered under
 * $name — confirm against the extension.
 */
public static function unregisterEmbeddingBackend(string $name): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
// clearEmbeddingBackends() is already declared earlier in this class with the
// identical signature `public static function clearEmbeddingBackends(): void`.
// Declaring a method twice in one class is a compile-time "Cannot redeclare"
// fatal error in PHP, so the redundant second declaration is dropped here.
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably adds $backend to the set of document extractors —
 * confirm against the extension.
 */
public static function registerDocumentExtractor(\Kreuzberg\DocumentExtractor $backend): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably removes the document extractor registered under
 * $name — confirm against the extension.
 */
public static function unregisterDocumentExtractor(string $name): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
// clearDocumentExtractors() is already declared earlier in this class with the
// identical signature `public static function clearDocumentExtractors(): void`.
// Declaring a method twice in one class is a compile-time "Cannot redeclare"
// fatal error in PHP, so the redundant second declaration is dropped here.
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably adds $backend to the set of renderers — confirm
 * against the extension.
 */
public static function registerRenderer(\Kreuzberg\Renderer $backend): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
/**
 * Static-analysis stub; the compiled native extension supplies the real
 * implementation at runtime.
 *
 * NOTE(review): presumably removes the renderer registered under $name —
 * confirm against the extension.
 */
public static function unregisterRenderer(string $name): void
{
    // Placeholder body: intentionally empty in this stub.
}
|
|||
|
|
// clearRenderers() is already declared earlier in this class with the
// identical signature `public static function clearRenderers(): void`.
// Declaring a method twice in one class is a compile-time "Cannot redeclare"
// fatal error in PHP, so the redundant second declaration is dropped here.
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
} // end namespace
|