Files
fil/packages/swift/Sources/Kreuzberg/Kreuzberg.swift

8478 lines
355 KiB
Swift
Raw Normal View History

2026-06-01 23:40:55 +02:00
// Generated by alef. Do not edit by hand.
// swift-format-ignore-file
import Foundation
import RustBridge
/// Summary statistics for a cache: a file count plus size and file-age figures.
///
/// Serialized with snake_case keys (see `CodingKeys`).
public struct CacheStats: Codable, Sendable, Hashable {
    /// Number of files counted.
    public let totalFiles: UInt
    /// Total size; the `Mb` suffix suggests megabytes (unit not confirmed here).
    public let totalSizeMb: Double
    /// Available space; the `Mb` suffix suggests megabytes (unit not confirmed here).
    public let availableSpaceMb: Double
    /// Age of the oldest file, in days per the property name.
    public let oldestFileAgeDays: Double
    /// Age of the newest file, in days per the property name.
    public let newestFileAgeDays: Double

    /// Creates a statistics value from its five components; every argument is required.
    public init(
        totalFiles: UInt,
        totalSizeMb: Double,
        availableSpaceMb: Double,
        oldestFileAgeDays: Double,
        newestFileAgeDays: Double
    ) {
        self.totalFiles = totalFiles
        self.totalSizeMb = totalSizeMb
        self.availableSpaceMb = availableSpaceMb
        self.oldestFileAgeDays = oldestFileAgeDays
        self.newestFileAgeDays = newestFileAgeDays
    }

    /// Maps each Swift property name to its snake_case serialized key.
    private enum CodingKeys: String, CodingKey {
        case totalFiles = "total_files"
        case totalSizeMb = "total_size_mb"
        case availableSpaceMb = "available_space_mb"
        case oldestFileAgeDays = "oldest_file_age_days"
        case newestFileAgeDays = "newest_file_age_days"
    }
}
// MARK: - Internal FFI conversions for CacheStats
internal extension CacheStats {
    /// Copies every field out of the bridge reference via its accessor of the same name.
    ///
    /// Declared `throws`, although no call in this body is marked `try`.
    init(_ rb: RustBridge.CacheStatsRef) throws {
        totalFiles = rb.totalFiles()
        totalSizeMb = rb.totalSizeMb()
        availableSpaceMb = rb.availableSpaceMb()
        oldestFileAgeDays = rb.oldestFileAgeDays()
        newestFileAgeDays = rb.newestFileAgeDays()
    }

    /// Builds the bridge-side `RustBridge.CacheStats`, passing the fields positionally
    /// in declaration order.
    func intoRust() throws -> RustBridge.CacheStats {
        RustBridge.CacheStats(
            totalFiles,
            totalSizeMb,
            availableSpaceMb,
            oldestFileAgeDays,
            newestFileAgeDays
        )
    }
}
/// Hardware acceleration configuration for ONNX Runtime models.
///
/// Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
/// for inference in layout detection and embedding generation.
///
/// # Example
///
/// ```rust
/// use kreuzberg::AccelerationConfig;
///
/// // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
/// let config = AccelerationConfig::default();
///
/// // Force CPU only
/// let config = AccelerationConfig {
/// provider: kreuzberg::ExecutionProviderType::Cpu,
/// ..Default::default()
/// };
/// ```
public struct AccelerationConfig: Codable, Sendable, Hashable {
    /// Execution provider to use for ONNX inference.
    public let provider: ExecutionProviderType
    /// GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
    public let deviceId: UInt32

    /// Creates an acceleration configuration.
    ///
    /// - Parameters:
    ///   - provider: Execution provider to use for inference.
    ///   - deviceId: GPU device ID. Defaults to `0`, the same value `init(from:)`
    ///     substitutes when the `device_id` key is absent.
    public init(provider: ExecutionProviderType, deviceId: UInt32 = 0) {
        self.provider = provider
        self.deviceId = deviceId
    }

    /// Maps each Swift property name to its snake_case serialized key.
    private enum CodingKeys: String, CodingKey {
        case provider = "provider"
        case deviceId = "device_id"
    }

    /// Decodes a configuration from a keyed container.
    ///
    /// `provider` is required; `device_id` falls back to `0` when missing or null.
    ///
    /// - Throws: Any error raised by the decoder, including a missing `provider` key.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.provider = try container.decode(ExecutionProviderType.self, forKey: .provider)
        self.deviceId = try container.decodeIfPresent(UInt32.self, forKey: .deviceId) ?? 0
    }
}
// MARK: - Internal FFI conversions for AccelerationConfig
internal extension AccelerationConfig {
    /// Builds the Swift value from a bridge reference.
    ///
    /// The provider arrives as a string and is mapped through
    /// `ExecutionProviderType(rawValue:)`.
    ///
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports a provider string
    ///   that has no matching `ExecutionProviderType` case. This initializer is already
    ///   `throws`, so the mismatch is surfaced as a catchable error rather than
    ///   terminating the process.
    init(_ rb: RustBridge.AccelerationConfigRef) throws {
        // Read the raw string once so the lookup and the error message agree.
        let rawProvider = rb.provider().toString()
        guard let provider = ExecutionProviderType(rawValue: rawProvider) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(
                    codingPath: [],
                    debugDescription: "Unknown ExecutionProviderType: \(rawProvider)"
                )
            )
        }
        self.provider = provider
        self.deviceId = rb.deviceId()
    }

    /// Builds the bridge-side `RustBridge.AccelerationConfig` from the converted provider
    /// and the device ID.
    ///
    /// - Throws: Whatever `provider.intoRust()` throws.
    func intoRust() throws -> RustBridge.AccelerationConfig {
        return RustBridge.AccelerationConfig(try self.provider.intoRust(), self.deviceId)
    }
}
/// Cross-extractor content filtering configuration.
///
/// Controls whether "furniture" content (headers, footers, page numbers,
/// watermarks, repeating text) is included in or stripped from extraction
/// results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
/// with format-specific implementation.
///
/// When `None` on `ExtractionConfig`, each extractor uses its current
/// default behavior unchanged.
public struct ContentFilterConfig: Codable, Sendable, Hashable {
    /// Include running headers in extraction output.
    ///
    /// - PDF: Disables top-margin furniture stripping and prevents the layout
    /// model from treating `PageHeader`-classified regions as furniture.
    /// - DOCX: Includes document headers in text output.
    /// - RTF/ODT: Headers already included; this is a no-op when true.
    /// - HTML/EPUB: Keeps `<header>` element content.
    ///
    /// Default: `false` (headers are stripped or excluded).
    public let includeHeaders: Bool
    /// Include running footers in extraction output.
    ///
    /// - PDF: Disables bottom-margin furniture stripping and prevents the layout
    /// model from treating `PageFooter`-classified regions as furniture.
    /// - DOCX: Includes document footers in text output.
    /// - RTF/ODT: Footers already included; this is a no-op when true.
    /// - HTML/EPUB: Keeps `<footer>` element content.
    ///
    /// Default: `false` (footers are stripped or excluded).
    public let includeFooters: Bool
    /// Enable the heuristic cross-page repeating text detector.
    ///
    /// When `true` (default), text that repeats verbatim across a supermajority
    /// of pages is classified as furniture and stripped. Disable this if brand
    /// names or repeated headings are being incorrectly removed by the heuristic.
    ///
    /// Note: when a layout-detection model is active, the model may independently
    /// classify page-header / page-footer regions as furniture on a per-page basis.
    /// To preserve those regions, set `include_headers = true`, `include_footers = true`,
    /// or both, in addition to disabling this flag.
    ///
    /// Primarily affects PDF extraction.
    ///
    /// Default: `true`.
    public let stripRepeatingText: Bool
    /// Include watermark text in extraction output.
    ///
    /// - PDF: Keeps watermark artifacts and arXiv identifiers.
    /// - Other formats: No effect currently.
    ///
    /// Default: `false` (watermarks are stripped).
    public let includeWatermarks: Bool

    /// Creates a content filter configuration.
    ///
    /// Each parameter defaults to the value `init(from:)` substitutes for a missing key,
    /// so `ContentFilterConfig()` equals the result of decoding an empty object.
    public init(
        includeHeaders: Bool = false,
        includeFooters: Bool = false,
        stripRepeatingText: Bool = true,
        includeWatermarks: Bool = false
    ) {
        self.includeHeaders = includeHeaders
        self.includeFooters = includeFooters
        self.stripRepeatingText = stripRepeatingText
        self.includeWatermarks = includeWatermarks
    }

    /// Maps each Swift property name to its snake_case serialized key.
    private enum CodingKeys: String, CodingKey {
        case includeHeaders = "include_headers"
        case includeFooters = "include_footers"
        case stripRepeatingText = "strip_repeating_text"
        case includeWatermarks = "include_watermarks"
    }

    /// Decodes a configuration, substituting the documented default for every key
    /// that is missing or null.
    ///
    /// - Throws: Any error raised by the decoder (for example a value of the wrong type).
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.includeHeaders = try container.decodeIfPresent(Bool.self, forKey: .includeHeaders) ?? false
        self.includeFooters = try container.decodeIfPresent(Bool.self, forKey: .includeFooters) ?? false
        self.stripRepeatingText = try container.decodeIfPresent(Bool.self, forKey: .stripRepeatingText) ?? true
        self.includeWatermarks = try container.decodeIfPresent(Bool.self, forKey: .includeWatermarks) ?? false
    }
}
// MARK: - Internal FFI conversions for ContentFilterConfig
internal extension ContentFilterConfig {
    /// Copies the four flags out of the bridge reference via its accessors of the same name.
    ///
    /// Declared `throws`, although no call in this body is marked `try`.
    init(_ rb: RustBridge.ContentFilterConfigRef) throws {
        includeHeaders = rb.includeHeaders()
        includeFooters = rb.includeFooters()
        stripRepeatingText = rb.stripRepeatingText()
        includeWatermarks = rb.includeWatermarks()
    }

    /// Builds the bridge-side `RustBridge.ContentFilterConfig`, passing the flags
    /// positionally in declaration order.
    func intoRust() throws -> RustBridge.ContentFilterConfig {
        RustBridge.ContentFilterConfig(
            includeHeaders,
            includeFooters,
            stripRepeatingText,
            includeWatermarks
        )
    }
}
/// Configuration for email extraction.
public struct EmailConfig: Codable, Sendable, Hashable {
    /// Windows codepage number to use when an MSG file contains no codepage property.
    /// Defaults to `None`, which falls back to windows-1252.
    ///
    /// If an unrecognized or invalid codepage number is supplied (including 0),
    /// the behavior silently falls back to windows-1252 the same as when the
    /// MSG file itself contains an unrecognized codepage. No error or warning is
    /// emitted. Users should verify output when supplying unusual values.
    ///
    /// Common values:
    /// - 1250: Central European (Polish, Czech, Hungarian, etc.)
    /// - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
    /// - 1252: Western European (default)
    /// - 1253: Greek
    /// - 1254: Turkish
    /// - 1255: Hebrew
    /// - 1256: Arabic
    /// - 932: Japanese (Shift-JIS)
    /// - 936: Simplified Chinese (GBK)
    public let msgFallbackCodepage: UInt32?

    /// Creates an email configuration; omitting the argument leaves the codepage `nil`.
    public init(msgFallbackCodepage: UInt32? = nil) {
        self.msgFallbackCodepage = msgFallbackCodepage
    }

    /// Maps the Swift property name to its snake_case serialized key.
    private enum CodingKeys: String, CodingKey {
        case msgFallbackCodepage = "msg_fallback_codepage"
    }

    /// Decodes a configuration; a missing or null `msg_fallback_codepage` yields `nil`.
    ///
    /// - Throws: Any error raised by the decoder.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        // `decodeIfPresent` already produces an optional, so no fallback is needed.
        let codepage = try values.decodeIfPresent(UInt32.self, forKey: .msgFallbackCodepage)
        self.msgFallbackCodepage = codepage
    }
}
// MARK: - Internal FFI conversions for EmailConfig
internal extension EmailConfig {
    /// Copies the optional fallback codepage out of the bridge reference.
    ///
    /// Declared `throws`, although no call in this body is marked `try`.
    init(_ rb: RustBridge.EmailConfigRef) throws {
        msgFallbackCodepage = rb.msgFallbackCodepage()
    }

    /// Builds the bridge-side `RustBridge.EmailConfig` from the optional codepage.
    func intoRust() throws -> RustBridge.EmailConfig {
        RustBridge.EmailConfig(msgFallbackCodepage)
    }
}
/// Main extraction configuration.
///
/// This struct contains all configuration options for the extraction process.
/// It can be loaded from TOML, YAML, or JSON files, or created programmatically.
///
/// # Example
///
/// ```rust
/// use kreuzberg::core::config::ExtractionConfig;
///
/// // Create with defaults
/// let config = ExtractionConfig::default();
///
/// // Load from TOML file
/// // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
/// ```
///
/// In Swift this name is a type alias for the bridge type `RustBridge.ExtractionConfig`.
public typealias ExtractionConfig = RustBridge.ExtractionConfig
/// Per-file extraction configuration overrides for batch processing.
///
/// All fields are `Option<T>` — `None` means "use the batch-level default."
/// This type is used with `batch_extract_files` and
/// `batch_extract_bytes` to allow heterogeneous
/// extraction settings within a single batch.
///
/// # Excluded Fields
///
/// The following `ExtractionConfig` fields are batch-level only and
/// cannot be overridden per file:
/// - `max_concurrent_extractions` — controls batch parallelism
/// - `use_cache` — global caching policy
/// - `acceleration` — shared ONNX execution provider
/// - `security_limits` — global archive security policy
///
/// # Example
///
/// ```rust
/// use kreuzberg::FileExtractionConfig;
///
/// // Override just OCR forcing for a specific file
/// let config = FileExtractionConfig {
/// force_ocr: Some(true),
/// ..Default::default()
/// };
/// ```
///
/// In Swift this name is a type alias for the bridge type `RustBridge.FileExtractionConfig`.
public typealias FileExtractionConfig = RustBridge.FileExtractionConfig
/// Batch item for byte array extraction.
///
/// Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
/// to represent a single item in a batch extraction job.
///
/// In Swift this name is a type alias for the bridge type `RustBridge.BatchBytesItem`.
public typealias BatchBytesItem = RustBridge.BatchBytesItem
/// Batch item for file extraction.
///
/// Used with `batch_extract_files` and `batch_extract_files_sync`
/// to represent a single file in a batch extraction job.
///
/// In Swift this name is a type alias for the bridge type `RustBridge.BatchFileItem`.
public typealias BatchFileItem = RustBridge.BatchFileItem
/// Image extraction configuration.
public struct ImageExtractionConfig: Codable, Sendable, Hashable {
    /// Extract images from documents
    public let extractImages: Bool
    /// Target DPI for image normalization
    public let targetDpi: Int32
    /// Maximum dimension for images (width or height)
    public let maxImageDimension: Int32
    /// Whether to inject image reference placeholders into markdown output.
    /// When `true` (default), image references like `![Image 1](embedded:p1_i0)`
    /// are appended to the markdown. Set to `false` to extract images as data
    /// without polluting the markdown output.
    public let injectPlaceholders: Bool
    /// Automatically adjust DPI based on image content
    public let autoAdjustDpi: Bool
    /// Minimum DPI threshold
    public let minDpi: Int32
    /// Maximum DPI threshold
    public let maxDpi: Int32
    /// Maximum number of image objects to extract per PDF page.
    ///
    /// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
    /// can trigger extremely long or indefinite extraction times when every image
    /// object on a dense page is decoded individually via the PDF extractor. Setting this
    /// limit causes kreuzberg to stop collecting individual images once the count
    /// per page reaches the cap and emit a warning instead.
    ///
    /// `None` (default) means no limit: all images are extracted.
    public let maxImagesPerPage: UInt32?
    /// When `true` (default), extracted images are classified by kind and grouped
    /// into clusters where they appear to belong to one figure.
    public let classify: Bool
    /// When `true`, full-page renders produced during OCR preprocessing are captured
    /// and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`.
    ///
    /// **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
    /// document-level OCR bypass is active (whole-document backend). When OCR is
    /// enabled and this flag is set but the active backend skips per-page rendering,
    /// a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
    ///
    /// Defaults to `false`. Enable when downstream consumers need page thumbnails
    /// (e.g. citation previews, visual grounding).
    public let includePageRasters: Bool
    /// Run OCR on extracted images and include the recognized text in the document content.
    ///
    /// When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
    /// are processed with the configured OCR backend. Set to `false` to extract images
    /// without OCR processing, even when OCR is enabled.
    public let runOcrOnImages: Bool
    /// When `true`, image OCR results are rendered as plain text without the
    /// `![...](...)` markdown placeholder. Only takes effect when `run_ocr_on_images`
    /// is also `true`.
    public let ocrTextOnly: Bool
    /// When `true` and `ocr_text_only` is `false`, append the OCR text after
    /// the image placeholder in the rendered output.
    public let appendOcrText: Bool

    /// Creates an image extraction configuration.
    ///
    /// Each parameter defaults to the value `init(from:)` substitutes for a missing key,
    /// so `ImageExtractionConfig()` equals the result of decoding an empty object and
    /// callers only need to name the settings they want to change.
    public init(
        extractImages: Bool = true,
        targetDpi: Int32 = 300,
        maxImageDimension: Int32 = 4096,
        injectPlaceholders: Bool = true,
        autoAdjustDpi: Bool = true,
        minDpi: Int32 = 72,
        maxDpi: Int32 = 600,
        maxImagesPerPage: UInt32? = nil,
        classify: Bool = true,
        includePageRasters: Bool = false,
        runOcrOnImages: Bool = true,
        ocrTextOnly: Bool = false,
        appendOcrText: Bool = false
    ) {
        self.extractImages = extractImages
        self.targetDpi = targetDpi
        self.maxImageDimension = maxImageDimension
        self.injectPlaceholders = injectPlaceholders
        self.autoAdjustDpi = autoAdjustDpi
        self.minDpi = minDpi
        self.maxDpi = maxDpi
        self.maxImagesPerPage = maxImagesPerPage
        self.classify = classify
        self.includePageRasters = includePageRasters
        self.runOcrOnImages = runOcrOnImages
        self.ocrTextOnly = ocrTextOnly
        self.appendOcrText = appendOcrText
    }

    /// Maps each Swift property name to its snake_case serialized key.
    private enum CodingKeys: String, CodingKey {
        case extractImages = "extract_images"
        case targetDpi = "target_dpi"
        case maxImageDimension = "max_image_dimension"
        case injectPlaceholders = "inject_placeholders"
        case autoAdjustDpi = "auto_adjust_dpi"
        case minDpi = "min_dpi"
        case maxDpi = "max_dpi"
        case maxImagesPerPage = "max_images_per_page"
        case classify = "classify"
        case includePageRasters = "include_page_rasters"
        case runOcrOnImages = "run_ocr_on_images"
        case ocrTextOnly = "ocr_text_only"
        case appendOcrText = "append_ocr_text"
    }

    /// Decodes a configuration, substituting a default for every key that is missing or null.
    ///
    /// - Throws: Any error raised by the decoder (for example a value of the wrong type).
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.extractImages = try container.decodeIfPresent(Bool.self, forKey: .extractImages) ?? true
        self.targetDpi = try container.decodeIfPresent(Int32.self, forKey: .targetDpi) ?? 300
        self.maxImageDimension = try container.decodeIfPresent(Int32.self, forKey: .maxImageDimension) ?? 4096
        self.injectPlaceholders = try container.decodeIfPresent(Bool.self, forKey: .injectPlaceholders) ?? true
        self.autoAdjustDpi = try container.decodeIfPresent(Bool.self, forKey: .autoAdjustDpi) ?? true
        self.minDpi = try container.decodeIfPresent(Int32.self, forKey: .minDpi) ?? 72
        self.maxDpi = try container.decodeIfPresent(Int32.self, forKey: .maxDpi) ?? 600
        // Already optional: a missing or null key simply stays `nil`.
        self.maxImagesPerPage = try container.decodeIfPresent(UInt32.self, forKey: .maxImagesPerPage)
        self.classify = try container.decodeIfPresent(Bool.self, forKey: .classify) ?? true
        self.includePageRasters = try container.decodeIfPresent(Bool.self, forKey: .includePageRasters) ?? false
        self.runOcrOnImages = try container.decodeIfPresent(Bool.self, forKey: .runOcrOnImages) ?? true
        self.ocrTextOnly = try container.decodeIfPresent(Bool.self, forKey: .ocrTextOnly) ?? false
        self.appendOcrText = try container.decodeIfPresent(Bool.self, forKey: .appendOcrText) ?? false
    }
}
// MARK: - Internal FFI conversions for ImageExtractionConfig
internal extension ImageExtractionConfig {
    /// Copies every field out of the bridge reference via its accessor of the same name.
    ///
    /// Declared `throws`, although no call in this body is marked `try`.
    init(_ rb: RustBridge.ImageExtractionConfigRef) throws {
        extractImages = rb.extractImages()
        targetDpi = rb.targetDpi()
        maxImageDimension = rb.maxImageDimension()
        injectPlaceholders = rb.injectPlaceholders()
        autoAdjustDpi = rb.autoAdjustDpi()
        minDpi = rb.minDpi()
        maxDpi = rb.maxDpi()
        maxImagesPerPage = rb.maxImagesPerPage()
        classify = rb.classify()
        includePageRasters = rb.includePageRasters()
        runOcrOnImages = rb.runOcrOnImages()
        ocrTextOnly = rb.ocrTextOnly()
        appendOcrText = rb.appendOcrText()
    }

    /// Builds the bridge-side `RustBridge.ImageExtractionConfig`, passing the fields
    /// positionally in declaration order.
    func intoRust() throws -> RustBridge.ImageExtractionConfig {
        RustBridge.ImageExtractionConfig(
            extractImages,
            targetDpi,
            maxImageDimension,
            injectPlaceholders,
            autoAdjustDpi,
            minDpi,
            maxDpi,
            maxImagesPerPage,
            classify,
            includePageRasters,
            runOcrOnImages,
            ocrTextOnly,
            appendOcrText
        )
    }
}
/// Token reduction configuration.
public struct TokenReductionOptions: Codable, Sendable, Hashable {
    /// Reduction mode: "off", "light", "moderate", "aggressive", "maximum"
    public let mode: String
    /// Preserve important words (capitalized, technical terms)
    public let preserveImportantWords: Bool

    /// Creates token reduction options.
    ///
    /// - Parameters:
    ///   - mode: Reduction mode string.
    ///   - preserveImportantWords: Defaults to `true`, the same value `init(from:)`
    ///     substitutes when the `preserve_important_words` key is absent.
    public init(mode: String, preserveImportantWords: Bool = true) {
        self.mode = mode
        self.preserveImportantWords = preserveImportantWords
    }

    /// Maps each Swift property name to its snake_case serialized key.
    private enum CodingKeys: String, CodingKey {
        case mode = "mode"
        case preserveImportantWords = "preserve_important_words"
    }

    /// Decodes the options, substituting a default for every key that is missing or null.
    ///
    /// - Throws: Any error raised by the decoder.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        // NOTE(review): the fallback is the empty string, which is not one of the mode
        // names listed on `mode` — confirm how an empty mode is interpreted downstream.
        self.mode = try container.decodeIfPresent(String.self, forKey: .mode) ?? ""
        self.preserveImportantWords = try container.decodeIfPresent(Bool.self, forKey: .preserveImportantWords) ?? true
    }
}
// MARK: - Internal FFI conversions for TokenReductionOptions
internal extension TokenReductionOptions {
    /// Copies both fields out of the bridge reference; the mode is converted with `toString()`.
    ///
    /// Declared `throws`, although no call in this body is marked `try`.
    init(_ rb: RustBridge.TokenReductionOptionsRef) throws {
        mode = rb.mode().toString()
        preserveImportantWords = rb.preserveImportantWords()
    }

    /// Builds the bridge-side `RustBridge.TokenReductionOptions`, wrapping the mode
    /// in a `RustString`.
    func intoRust() throws -> RustBridge.TokenReductionOptions {
        RustBridge.TokenReductionOptions(
            RustString(mode),
            preserveImportantWords
        )
    }
}
/// Language detection configuration.
public struct LanguageDetectionConfig: Codable, Sendable, Hashable {
    /// Enable language detection
    public let enabled: Bool
    /// Minimum confidence threshold (0.0-1.0)
    public let minConfidence: Double
    /// Detect multiple languages in the document
    public let detectMultiple: Bool

    /// Creates a language detection configuration.
    ///
    /// Each parameter defaults to the value `init(from:)` substitutes for a missing key,
    /// so `LanguageDetectionConfig()` equals the result of decoding an empty object.
    public init(enabled: Bool = true, minConfidence: Double = 0.8, detectMultiple: Bool = false) {
        self.enabled = enabled
        self.minConfidence = minConfidence
        self.detectMultiple = detectMultiple
    }

    /// Maps each Swift property name to its snake_case serialized key.
    private enum CodingKeys: String, CodingKey {
        case enabled = "enabled"
        case minConfidence = "min_confidence"
        case detectMultiple = "detect_multiple"
    }

    /// Decodes a configuration, substituting a default for every key that is missing or null.
    ///
    /// - Throws: Any error raised by the decoder.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.enabled = try container.decodeIfPresent(Bool.self, forKey: .enabled) ?? true
        self.minConfidence = try container.decodeIfPresent(Double.self, forKey: .minConfidence) ?? 0.8
        self.detectMultiple = try container.decodeIfPresent(Bool.self, forKey: .detectMultiple) ?? false
    }
}
// MARK: - Internal FFI conversions for LanguageDetectionConfig
internal extension LanguageDetectionConfig {
    /// Copies the three fields out of the bridge reference via its accessors of the same name.
    ///
    /// Declared `throws`, although no call in this body is marked `try`.
    init(_ rb: RustBridge.LanguageDetectionConfigRef) throws {
        enabled = rb.enabled()
        minConfidence = rb.minConfidence()
        detectMultiple = rb.detectMultiple()
    }

    /// Builds the bridge-side `RustBridge.LanguageDetectionConfig`, passing the fields
    /// positionally in declaration order.
    func intoRust() throws -> RustBridge.LanguageDetectionConfig {
        RustBridge.LanguageDetectionConfig(
            enabled,
            minConfidence,
            detectMultiple
        )
    }
}
/// Configuration for styled HTML output.
///
/// When set on [`ExtractionConfig::html_output`] alongside
/// `output_format = OutputFormat::Html`, the pipeline builds a
/// [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
/// the plain comrak-based renderer.
///
/// # Example
///
/// ```rust
/// use kreuzberg::core::config::{HtmlOutputConfig, HtmlTheme};
///
/// let config = HtmlOutputConfig {
/// theme: HtmlTheme::GitHub,
/// css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
/// ..Default::default()
/// };
/// ```
///
/// In Swift this name is a type alias for the bridge type `RustBridge.HtmlOutputConfig`.
public typealias HtmlOutputConfig = RustBridge.HtmlOutputConfig
/// Layout detection configuration.
///
/// Controls layout detection behavior in the extraction pipeline.
/// When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
/// is enabled for PDF extraction.
public struct LayoutDetectionConfig: Codable, Sendable, Hashable {
    /// Confidence threshold override (None = use model default).
    public let confidenceThreshold: Float?
    /// Whether to apply postprocessing heuristics (default: true).
    public let applyHeuristics: Bool
    /// Table structure recognition model.
    ///
    /// Controls which model is used for table cell detection within layout-detected
    /// table regions. Defaults to [`TableModel::Tatr`].
    public let tableModel: TableModel
    /// Hardware acceleration for ONNX models (layout detection + table structure).
    ///
    /// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
    /// is used for inference. Defaults to `None` (auto-select per platform).
    public let acceleration: AccelerationConfig?

    /// Creates a layout detection configuration.
    ///
    /// - Parameters:
    ///   - confidenceThreshold: Optional threshold override; `nil` by default.
    ///   - applyHeuristics: Defaults to `true`, the same value `init(from:)` substitutes
    ///     when the `apply_heuristics` key is absent.
    ///   - tableModel: Table structure recognition model (required).
    ///   - acceleration: Optional acceleration settings; `nil` by default.
    public init(
        confidenceThreshold: Float? = nil,
        applyHeuristics: Bool = true,
        tableModel: TableModel,
        acceleration: AccelerationConfig? = nil
    ) {
        self.confidenceThreshold = confidenceThreshold
        self.applyHeuristics = applyHeuristics
        self.tableModel = tableModel
        self.acceleration = acceleration
    }

    /// Maps each Swift property name to its snake_case serialized key.
    private enum CodingKeys: String, CodingKey {
        case confidenceThreshold = "confidence_threshold"
        case applyHeuristics = "apply_heuristics"
        case tableModel = "table_model"
        case acceleration = "acceleration"
    }

    /// Decodes a configuration from a keyed container.
    ///
    /// - Throws: Any error raised by the decoder, including a missing `table_model` key.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        // Optional fields: a missing or null key simply stays `nil`.
        self.confidenceThreshold = try container.decodeIfPresent(Float.self, forKey: .confidenceThreshold)
        self.applyHeuristics = try container.decodeIfPresent(Bool.self, forKey: .applyHeuristics) ?? true
        // NOTE(review): `table_model` is decoded as a required key even though the property
        // documentation describes a default — confirm whether a fallback is intended.
        self.tableModel = try container.decode(TableModel.self, forKey: .tableModel)
        self.acceleration = try container.decodeIfPresent(AccelerationConfig.self, forKey: .acceleration)
    }
}
// MARK: - Internal FFI conversions for LayoutDetectionConfig
internal extension LayoutDetectionConfig {
    /// Builds the Swift value from a bridge reference.
    ///
    /// The table model arrives as a string and is mapped through `TableModel(rawValue:)`;
    /// the optional acceleration settings are converted with `AccelerationConfig.init(_:)`.
    ///
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports a table-model string
    ///   that has no matching `TableModel` case (surfaced as a catchable error rather than
    ///   terminating the process), or any error thrown while converting `acceleration`.
    init(_ rb: RustBridge.LayoutDetectionConfigRef) throws {
        self.confidenceThreshold = rb.confidenceThreshold()
        self.applyHeuristics = rb.applyHeuristics()
        // Read the raw string once so the lookup and the error message agree.
        let rawTableModel = rb.tableModel().toString()
        guard let tableModel = TableModel(rawValue: rawTableModel) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(
                    codingPath: [],
                    debugDescription: "Unknown TableModel: \(rawTableModel)"
                )
            )
        }
        self.tableModel = tableModel
        self.acceleration = try rb.acceleration().map { try AccelerationConfig($0) }
    }

    /// Converts to the bridge type by JSON-encoding `self` and handing the text to
    /// `RustBridge.layoutDetectionConfigFromJson`.
    ///
    /// - Throws: Encoding errors from `JSONEncoder`, or whatever the bridge function throws.
    func intoRust() throws -> RustBridge.LayoutDetectionConfig {
        let data = try JSONEncoder().encode(self)
        // `String(decoding:as:)` cannot fail, so there is no path on which an empty
        // object would be sent in place of the real configuration.
        let json = String(decoding: data, as: UTF8.self)
        return try RustBridge.layoutDetectionConfigFromJson(json)
    }
}
/// Configuration for an LLM provider/model via liter-llm.
///
/// Each feature (VLM OCR, VLM embeddings, structured extraction) carries
/// its own `LlmConfig`, allowing different providers per feature.
///
/// # Example
///
/// ```toml
/// [structured_extraction.llm]
/// model = "openai/gpt-4o"
/// api_key = "sk-..." # or use KREUZBERG_LLM_API_KEY env var
/// ```
public struct LlmConfig: Codable, Sendable, Hashable {
    /// Provider/model string using liter-llm routing format.
    ///
    /// Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
    /// `"groq/llama-3.1-70b-versatile"`.
    public let model: String
    /// API key for the provider. When `None`, liter-llm falls back to
    /// the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
    public let apiKey: String?
    /// Custom base URL override for the provider endpoint.
    public let baseUrl: String?
    /// Request timeout in seconds (default: 60).
    public let timeoutSecs: UInt64?
    /// Maximum retry attempts (default: 3).
    public let maxRetries: UInt32?
    /// Sampling temperature for generation tasks.
    public let temperature: Double?
    /// Maximum tokens to generate.
    public let maxTokens: UInt64?

    /// Creates an LLM configuration; only `model` is required, every other
    /// argument defaults to `nil`.
    public init(
        model: String,
        apiKey: String? = nil,
        baseUrl: String? = nil,
        timeoutSecs: UInt64? = nil,
        maxRetries: UInt32? = nil,
        temperature: Double? = nil,
        maxTokens: UInt64? = nil
    ) {
        self.model = model
        self.apiKey = apiKey
        self.baseUrl = baseUrl
        self.timeoutSecs = timeoutSecs
        self.maxRetries = maxRetries
        self.temperature = temperature
        self.maxTokens = maxTokens
    }

    /// Maps each Swift property name to its snake_case serialized key.
    private enum CodingKeys: String, CodingKey {
        case model = "model"
        case apiKey = "api_key"
        case baseUrl = "base_url"
        case timeoutSecs = "timeout_secs"
        case maxRetries = "max_retries"
        case temperature = "temperature"
        case maxTokens = "max_tokens"
    }

    /// Decodes a configuration from a keyed container.
    ///
    /// A missing or null `model` becomes the empty string; every other missing or
    /// null key leaves its property `nil`.
    ///
    /// - Throws: Any error raised by the decoder.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        let decodedModel = try values.decodeIfPresent(String.self, forKey: .model)
        self.model = decodedModel ?? ""
        // The remaining properties are optional, so `decodeIfPresent` is used as-is.
        self.apiKey = try values.decodeIfPresent(String.self, forKey: .apiKey)
        self.baseUrl = try values.decodeIfPresent(String.self, forKey: .baseUrl)
        self.timeoutSecs = try values.decodeIfPresent(UInt64.self, forKey: .timeoutSecs)
        self.maxRetries = try values.decodeIfPresent(UInt32.self, forKey: .maxRetries)
        self.temperature = try values.decodeIfPresent(Double.self, forKey: .temperature)
        self.maxTokens = try values.decodeIfPresent(UInt64.self, forKey: .maxTokens)
    }
}
// MARK: - Internal FFI conversions for LlmConfig
internal extension LlmConfig {
    /// Copies every field out of the bridge reference; string values are converted
    /// with `toString()`, optional ones via optional chaining.
    ///
    /// Declared `throws`, although no call in this body is marked `try`.
    init(_ rb: RustBridge.LlmConfigRef) throws {
        model = rb.model().toString()
        apiKey = rb.apiKey()?.toString()
        baseUrl = rb.baseUrl()?.toString()
        timeoutSecs = rb.timeoutSecs()
        maxRetries = rb.maxRetries()
        temperature = rb.temperature()
        maxTokens = rb.maxTokens()
    }

    /// Builds the bridge-side `RustBridge.LlmConfig`, wrapping string values in
    /// `RustString` and passing the fields positionally in declaration order.
    func intoRust() throws -> RustBridge.LlmConfig {
        RustBridge.LlmConfig(
            RustString(model),
            apiKey.map(RustString.init),
            baseUrl.map(RustString.init),
            timeoutSecs,
            maxRetries,
            temperature,
            maxTokens
        )
    }
}
/// Configuration for LLM-based structured data extraction.
///
/// Sends extracted document content to a VLM with a JSON schema,
/// returning structured data that conforms to the schema.
///
/// # Example
///
/// ```toml
/// [structured_extraction]
/// schema_name = "invoice_data"
/// strict = true
///
/// [structured_extraction.schema]
/// type = "object"
/// properties.vendor = { type = "string" }
/// properties.total = { type = "number" }
/// required = ["vendor", "total"]
///
/// [structured_extraction.llm]
/// model = "openai/gpt-4o"
/// ```
///
/// In Swift this name is a type alias for the bridge type `RustBridge.StructuredExtractionConfig`.
public typealias StructuredExtractionConfig = RustBridge.StructuredExtractionConfig
/// Quality thresholds for OCR fallback decisions and pipeline quality gating.
///
/// All fields default to the values that match the previous hardcoded behavior,
/// so `OcrQualityThresholds::default()` preserves existing semantics exactly.
public struct OcrQualityThresholds: Codable, Sendable, Hashable {
    /// Minimum total non-whitespace characters to consider text substantive.
    public let minTotalNonWhitespace: UInt
    /// Minimum non-whitespace characters per page on average.
    public let minNonWhitespacePerPage: Double
    /// Minimum character count for a word to be "meaningful".
    public let minMeaningfulWordLen: UInt
    /// Minimum count of meaningful words before text is accepted.
    public let minMeaningfulWords: UInt
    /// Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
    public let minAlnumRatio: Double
    /// Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
    public let minGarbageChars: UInt
    /// Maximum fraction of short (1-2 char) words before text is considered fragmented.
    public let maxFragmentedWordRatio: Double
    /// Critical fragmentation threshold: triggers OCR regardless of meaningful words.
    /// Normal English text has ~20-30% short words. 80%+ is definitive garbage.
    public let criticalFragmentedWordRatio: Double
    /// Minimum average word length. Below this with enough words indicates garbled extraction.
    public let minAvgWordLength: Double
    /// Minimum word count before average word length check applies.
    public let minWordsForAvgLengthCheck: UInt
    /// Minimum consecutive word repetition ratio to detect column scrambling.
    public let minConsecutiveRepeatRatio: Double
    /// Minimum word count before consecutive repetition check is applied.
    public let minWordsForRepeatCheck: UInt
    /// Minimum character count for "substantive markdown" OCR skip gate.
    public let substantiveMinChars: UInt
    /// Minimum character count for "non-text content" OCR skip gate.
    public let nonTextMinChars: UInt
    /// Alphanumeric+whitespace ratio threshold for skip decisions.
    public let alnumWsRatioThreshold: Double
    /// Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
    /// If the result from a backend scores below this, try the next backend.
    public let pipelineMinQuality: Double

    /// Creates a set of quality thresholds.
    ///
    /// Each parameter defaults to the value `init(from:)` substitutes for a missing key,
    /// so `OcrQualityThresholds()` equals the result of decoding an empty object and
    /// callers only need to name the thresholds they want to change.
    public init(
        minTotalNonWhitespace: UInt = 64,
        minNonWhitespacePerPage: Double = 32.0,
        minMeaningfulWordLen: UInt = 4,
        minMeaningfulWords: UInt = 3,
        minAlnumRatio: Double = 0.3,
        minGarbageChars: UInt = 5,
        maxFragmentedWordRatio: Double = 0.6,
        criticalFragmentedWordRatio: Double = 0.8,
        minAvgWordLength: Double = 2.0,
        minWordsForAvgLengthCheck: UInt = 50,
        minConsecutiveRepeatRatio: Double = 0.08,
        minWordsForRepeatCheck: UInt = 50,
        substantiveMinChars: UInt = 100,
        nonTextMinChars: UInt = 20,
        alnumWsRatioThreshold: Double = 0.4,
        pipelineMinQuality: Double = 0.5
    ) {
        self.minTotalNonWhitespace = minTotalNonWhitespace
        self.minNonWhitespacePerPage = minNonWhitespacePerPage
        self.minMeaningfulWordLen = minMeaningfulWordLen
        self.minMeaningfulWords = minMeaningfulWords
        self.minAlnumRatio = minAlnumRatio
        self.minGarbageChars = minGarbageChars
        self.maxFragmentedWordRatio = maxFragmentedWordRatio
        self.criticalFragmentedWordRatio = criticalFragmentedWordRatio
        self.minAvgWordLength = minAvgWordLength
        self.minWordsForAvgLengthCheck = minWordsForAvgLengthCheck
        self.minConsecutiveRepeatRatio = minConsecutiveRepeatRatio
        self.minWordsForRepeatCheck = minWordsForRepeatCheck
        self.substantiveMinChars = substantiveMinChars
        self.nonTextMinChars = nonTextMinChars
        self.alnumWsRatioThreshold = alnumWsRatioThreshold
        self.pipelineMinQuality = pipelineMinQuality
    }

    /// Maps each Swift property name to its snake_case serialized key.
    private enum CodingKeys: String, CodingKey {
        case minTotalNonWhitespace = "min_total_non_whitespace"
        case minNonWhitespacePerPage = "min_non_whitespace_per_page"
        case minMeaningfulWordLen = "min_meaningful_word_len"
        case minMeaningfulWords = "min_meaningful_words"
        case minAlnumRatio = "min_alnum_ratio"
        case minGarbageChars = "min_garbage_chars"
        case maxFragmentedWordRatio = "max_fragmented_word_ratio"
        case criticalFragmentedWordRatio = "critical_fragmented_word_ratio"
        case minAvgWordLength = "min_avg_word_length"
        case minWordsForAvgLengthCheck = "min_words_for_avg_length_check"
        case minConsecutiveRepeatRatio = "min_consecutive_repeat_ratio"
        case minWordsForRepeatCheck = "min_words_for_repeat_check"
        case substantiveMinChars = "substantive_min_chars"
        case nonTextMinChars = "non_text_min_chars"
        case alnumWsRatioThreshold = "alnum_ws_ratio_threshold"
        case pipelineMinQuality = "pipeline_min_quality"
    }

    /// Decodes the thresholds, substituting a default for every key that is missing or null.
    ///
    /// - Throws: Any error raised by the decoder (for example a value of the wrong type).
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.minTotalNonWhitespace = try container.decodeIfPresent(UInt.self, forKey: .minTotalNonWhitespace) ?? 64
        self.minNonWhitespacePerPage = try container.decodeIfPresent(Double.self, forKey: .minNonWhitespacePerPage) ?? 32.0
        self.minMeaningfulWordLen = try container.decodeIfPresent(UInt.self, forKey: .minMeaningfulWordLen) ?? 4
        self.minMeaningfulWords = try container.decodeIfPresent(UInt.self, forKey: .minMeaningfulWords) ?? 3
        self.minAlnumRatio = try container.decodeIfPresent(Double.self, forKey: .minAlnumRatio) ?? 0.3
        self.minGarbageChars = try container.decodeIfPresent(UInt.self, forKey: .minGarbageChars) ?? 5
        self.maxFragmentedWordRatio = try container.decodeIfPresent(Double.self, forKey: .maxFragmentedWordRatio) ?? 0.6
        self.criticalFragmentedWordRatio = try container.decodeIfPresent(Double.self, forKey: .criticalFragmentedWordRatio) ?? 0.8
        self.minAvgWordLength = try container.decodeIfPresent(Double.self, forKey: .minAvgWordLength) ?? 2.0
        self.minWordsForAvgLengthCheck = try container.decodeIfPresent(UInt.self, forKey: .minWordsForAvgLengthCheck) ?? 50
        self.minConsecutiveRepeatRatio = try container.decodeIfPresent(Double.self, forKey: .minConsecutiveRepeatRatio) ?? 0.08
        self.minWordsForRepeatCheck = try container.decodeIfPresent(UInt.self, forKey: .minWordsForRepeatCheck) ?? 50
        self.substantiveMinChars = try container.decodeIfPresent(UInt.self, forKey: .substantiveMinChars) ?? 100
        self.nonTextMinChars = try container.decodeIfPresent(UInt.self, forKey: .nonTextMinChars) ?? 20
        self.alnumWsRatioThreshold = try container.decodeIfPresent(Double.self, forKey: .alnumWsRatioThreshold) ?? 0.4
        self.pipelineMinQuality = try container.decodeIfPresent(Double.self, forKey: .pipelineMinQuality) ?? 0.5
    }
}
// MARK: - Internal FFI conversions for OcrQualityThresholds
internal extension OcrQualityThresholds {
    /// Copies every threshold out of a bridged Rust reference via its accessor methods.
    init(_ source: RustBridge.OcrQualityThresholdsRef) throws {
        minTotalNonWhitespace = source.minTotalNonWhitespace()
        minNonWhitespacePerPage = source.minNonWhitespacePerPage()
        minMeaningfulWordLen = source.minMeaningfulWordLen()
        minMeaningfulWords = source.minMeaningfulWords()
        minAlnumRatio = source.minAlnumRatio()
        minGarbageChars = source.minGarbageChars()
        maxFragmentedWordRatio = source.maxFragmentedWordRatio()
        criticalFragmentedWordRatio = source.criticalFragmentedWordRatio()
        minAvgWordLength = source.minAvgWordLength()
        minWordsForAvgLengthCheck = source.minWordsForAvgLengthCheck()
        minConsecutiveRepeatRatio = source.minConsecutiveRepeatRatio()
        minWordsForRepeatCheck = source.minWordsForRepeatCheck()
        substantiveMinChars = source.substantiveMinChars()
        nonTextMinChars = source.nonTextMinChars()
        alnumWsRatioThreshold = source.alnumWsRatioThreshold()
        pipelineMinQuality = source.pipelineMinQuality()
    }

    /// Builds the bridged Rust value.
    ///
    /// The bridge initializer is positional, so the arguments must stay in
    /// exactly this order.
    func intoRust() throws -> RustBridge.OcrQualityThresholds {
        RustBridge.OcrQualityThresholds(
            minTotalNonWhitespace,
            minNonWhitespacePerPage,
            minMeaningfulWordLen,
            minMeaningfulWords,
            minAlnumRatio,
            minGarbageChars,
            maxFragmentedWordRatio,
            criticalFragmentedWordRatio,
            minAvgWordLength,
            minWordsForAvgLengthCheck,
            minConsecutiveRepeatRatio,
            minWordsForRepeatCheck,
            substantiveMinChars,
            nonTextMinChars,
            alnumWsRatioThreshold,
            pipelineMinQuality
        )
    }
}
/// A single backend stage in the OCR pipeline.
///
/// Direct alias of the bridged `RustBridge.OcrPipelineStage` type; no Swift-side
/// wrapper struct is generated for it.
public typealias OcrPipelineStage = RustBridge.OcrPipelineStage
/// Multi-backend OCR pipeline with quality-based fallback.
///
/// Backends are tried in priority order (highest first). After each backend
/// produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
/// the result is accepted. Otherwise the next backend is tried.
///
/// Direct alias of the bridged `RustBridge.OcrPipelineConfig` type.
public typealias OcrPipelineConfig = RustBridge.OcrPipelineConfig
/// OCR configuration.
///
/// Direct alias of the bridged `RustBridge.OcrConfig` type.
public typealias OcrConfig = RustBridge.OcrConfig
/// Page extraction and tracking configuration.
///
/// Controls how pages are extracted, tracked, and represented in the extraction results.
/// When `None`, page tracking is disabled.
///
/// Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
/// when page boundaries are available and chunking is configured.
public struct PageConfig: Codable, Sendable, Hashable {
    /// Extract pages as separate array (ExtractionResult.pages)
    public let extractPages: Bool

    /// Insert page markers in main content string
    public let insertPageMarkers: Bool

    /// Page marker format (use {page_num} placeholder)
    /// Default: "\n\n<!-- PAGE {page_num} -->\n\n"
    public let markerFormat: String

    /// Creates a configuration from explicit values; no defaults are applied.
    public init(
        extractPages: Bool,
        insertPageMarkers: Bool,
        markerFormat: String
    ) {
        self.extractPages = extractPages
        self.insertPageMarkers = insertPageMarkers
        self.markerFormat = markerFormat
    }

    /// Serialized (snake_case) names of the properties.
    private enum CodingKeys: String, CodingKey {
        case extractPages = "extract_pages"
        case insertPageMarkers = "insert_page_markers"
        case markerFormat = "marker_format"
    }

    /// Decodes a configuration, substituting defaults for keys that are absent or null:
    /// both flags default to `false` and the marker format to the HTML-comment template.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        extractPages = try values.decodeIfPresent(Bool.self, forKey: .extractPages) ?? false
        insertPageMarkers = try values.decodeIfPresent(Bool.self, forKey: .insertPageMarkers) ?? false
        markerFormat = try values.decodeIfPresent(String.self, forKey: .markerFormat) ?? "\n\n<!-- PAGE {page_num} -->\n\n"
    }
}
// MARK: - Internal FFI conversions for PageConfig
internal extension PageConfig {
    /// Copies the three settings out of a bridged Rust reference,
    /// converting the marker format to a Swift `String`.
    init(_ source: RustBridge.PageConfigRef) throws {
        extractPages = source.extractPages()
        insertPageMarkers = source.insertPageMarkers()
        markerFormat = source.markerFormat().toString()
    }

    /// Builds the bridged Rust value; arguments are positional.
    func intoRust() throws -> RustBridge.PageConfig {
        let format = RustString(markerFormat)
        return RustBridge.PageConfig(extractPages, insertPageMarkers, format)
    }
}
/// PDF-specific configuration.
public struct PdfConfig: Codable, Sendable, Hashable {
    /// Extract images from PDF
    public let extractImages: Bool

    /// Extract tables from PDF.
    ///
    /// When `true` (default), runs pdf_oxide's native grid detector and, if it
    /// finds nothing, falls back to the heuristic text-layer reconstruction in
    /// `pdf::oxide::table::extract_tables_heuristic`. Set to `false` to skip
    /// both passes; `tables` will then be empty in the result.
    public let extractTables: Bool

    /// List of passwords to try when opening encrypted PDFs
    public let passwords: [String]?

    /// Extract PDF metadata
    public let extractMetadata: Bool

    /// Hierarchy extraction configuration (None = hierarchy extraction disabled)
    public let hierarchy: HierarchyConfig?

    /// Extract PDF annotations (text notes, highlights, links, stamps).
    /// Default: false
    public let extractAnnotations: Bool

    /// Top margin fraction (0.0-1.0) of page height to exclude headers/running heads.
    /// Default: 0.06 (6%)
    public let topMarginFraction: Float?

    /// Bottom margin fraction (0.0-1.0) of page height to exclude footers/page numbers.
    /// Default: 0.05 (5%)
    public let bottomMarginFraction: Float?

    /// Allow single-column pseudo tables in extraction results.
    ///
    /// By default, tables with fewer than 2 columns (layout-guided) or 3 columns
    /// (heuristic) are rejected. When `true`, the minimum column count is relaxed
    /// to 1, allowing single-column structured data (glossaries, itemized lists)
    /// to be emitted as tables. Other quality filters (density, sparsity, prose
    /// detection) still apply.
    public let allowSingleColumnTables: Bool

    /// Perform OCR on inline images extracted from PDF pages and attach the
    /// recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
    /// to be available; if `ExtractionConfig.ocr` is `None` the extractor
    /// falls back to `TesseractConfig::default()`. Per-image failures degrade
    /// gracefully (the image is returned without OCR text rather than failing
    /// the whole extraction). Default: `false`.
    public let ocrInlineImages: Bool

    /// Creates a configuration from explicit values.
    ///
    /// Only the optional parameters have defaults (`nil`); every flag must be supplied.
    public init(
        extractImages: Bool,
        extractTables: Bool,
        passwords: [String]? = nil,
        extractMetadata: Bool,
        hierarchy: HierarchyConfig? = nil,
        extractAnnotations: Bool,
        topMarginFraction: Float? = nil,
        bottomMarginFraction: Float? = nil,
        allowSingleColumnTables: Bool,
        ocrInlineImages: Bool
    ) {
        self.extractImages = extractImages
        self.extractTables = extractTables
        self.passwords = passwords
        self.extractMetadata = extractMetadata
        self.hierarchy = hierarchy
        self.extractAnnotations = extractAnnotations
        self.topMarginFraction = topMarginFraction
        self.bottomMarginFraction = bottomMarginFraction
        self.allowSingleColumnTables = allowSingleColumnTables
        self.ocrInlineImages = ocrInlineImages
    }

    /// Serialized (snake_case) names of the properties.
    private enum CodingKeys: String, CodingKey {
        case extractImages = "extract_images"
        case extractTables = "extract_tables"
        case passwords
        case extractMetadata = "extract_metadata"
        case hierarchy
        case extractAnnotations = "extract_annotations"
        case topMarginFraction = "top_margin_fraction"
        case bottomMarginFraction = "bottom_margin_fraction"
        case allowSingleColumnTables = "allow_single_column_tables"
        case ocrInlineImages = "ocr_inline_images"
    }

    /// Decodes a configuration, tolerating missing keys.
    ///
    /// Absent (or null) flags fall back to the literals below (`extractTables` and
    /// `extractMetadata` to `true`, the other flags to `false`); absent optional
    /// values stay `nil`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        extractImages = try values.decodeIfPresent(Bool.self, forKey: .extractImages) ?? false
        extractTables = try values.decodeIfPresent(Bool.self, forKey: .extractTables) ?? true
        passwords = try values.decodeIfPresent([String].self, forKey: .passwords)
        extractMetadata = try values.decodeIfPresent(Bool.self, forKey: .extractMetadata) ?? true
        hierarchy = try values.decodeIfPresent(HierarchyConfig.self, forKey: .hierarchy)
        extractAnnotations = try values.decodeIfPresent(Bool.self, forKey: .extractAnnotations) ?? false
        topMarginFraction = try values.decodeIfPresent(Float.self, forKey: .topMarginFraction)
        bottomMarginFraction = try values.decodeIfPresent(Float.self, forKey: .bottomMarginFraction)
        allowSingleColumnTables = try values.decodeIfPresent(Bool.self, forKey: .allowSingleColumnTables) ?? false
        ocrInlineImages = try values.decodeIfPresent(Bool.self, forKey: .ocrInlineImages) ?? false
    }
}
// MARK: - Internal FFI conversions for PdfConfig
internal extension PdfConfig {
    /// Copies every setting out of a bridged Rust reference.
    ///
    /// Passwords are converted element-wise to Swift strings and the optional
    /// hierarchy configuration is converted through `HierarchyConfig.init(_:)`.
    /// - Throws: Whatever the nested `HierarchyConfig` conversion throws.
    init(_ rb: RustBridge.PdfConfigRef) throws {
        self.extractImages = rb.extractImages()
        self.extractTables = rb.extractTables()
        self.passwords = rb.passwords()?.map { $0.as_str().toString() }
        self.extractMetadata = rb.extractMetadata()
        self.hierarchy = try rb.hierarchy().map { try HierarchyConfig($0) }
        self.extractAnnotations = rb.extractAnnotations()
        self.topMarginFraction = rb.topMarginFraction()
        self.bottomMarginFraction = rb.bottomMarginFraction()
        self.allowSingleColumnTables = rb.allowSingleColumnTables()
        self.ocrInlineImages = rb.ocrInlineImages()
    }

    /// Converts this value to its Rust counterpart by round-tripping through JSON.
    ///
    /// - Throws: An encoding error from `JSONEncoder`, or whatever
    ///   `RustBridge.pdfConfigFromJson` throws for the produced JSON.
    func intoRust() throws -> RustBridge.PdfConfig {
        let data = try JSONEncoder().encode(self)
        // Decode the encoder's UTF-8 output non-failably. The previous
        // `String(data:encoding:) ?? "{}"` kept a fallback that, if ever taken,
        // would have handed Rust an empty object instead of the caller's settings.
        let json = String(decoding: data, as: UTF8.self)
        return try RustBridge.pdfConfigFromJson(json)
    }
}
/// Hierarchy extraction configuration for PDF text structure analysis.
///
/// Enables extraction of document hierarchy levels (H1-H6) based on font size
/// clustering and semantic analysis. When enabled, hierarchical blocks are
/// included in page content.
public struct HierarchyConfig: Codable, Sendable, Hashable {
    /// Enable hierarchy extraction
    public let enabled: Bool

    /// Number of font size clusters to use for hierarchy levels (1-7)
    ///
    /// Default: 6, which provides H1-H6 heading levels with body text.
    /// Larger values create more fine-grained hierarchy levels.
    public let kClusters: UInt

    /// Include bounding box information in hierarchy blocks
    public let includeBbox: Bool

    /// OCR coverage threshold for smart OCR triggering (0.0-1.0)
    ///
    /// Determines when OCR should be triggered based on text block coverage.
    /// OCR is triggered when text blocks cover less than this fraction of the page.
    /// Default: 0.5 (trigger OCR if less than 50% of page has text)
    public let ocrCoverageThreshold: Float?

    /// Creates a configuration from explicit values.
    ///
    /// - Parameters:
    ///   - enabled: Whether hierarchy extraction runs.
    ///   - kClusters: Number of font-size clusters.
    ///   - includeBbox: Whether bounding boxes are included in hierarchy blocks.
    ///   - ocrCoverageThreshold: Optional OCR coverage threshold; `nil` when omitted.
    public init(enabled: Bool, kClusters: UInt, includeBbox: Bool, ocrCoverageThreshold: Float? = nil) {
        self.enabled = enabled
        self.kClusters = kClusters
        self.includeBbox = includeBbox
        self.ocrCoverageThreshold = ocrCoverageThreshold
    }

    /// Serialized (snake_case) names of the properties.
    private enum CodingKeys: String, CodingKey {
        case enabled = "enabled"
        case kClusters = "k_clusters"
        case includeBbox = "include_bbox"
        case ocrCoverageThreshold = "ocr_coverage_threshold"
    }

    /// Decodes a configuration, substituting defaults for keys that are absent or null.
    ///
    /// Defaults: `enabled` = `true`, `kClusters` = 6, `includeBbox` = `true`,
    /// `ocrCoverageThreshold` = `nil`.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.enabled = try container.decodeIfPresent(Bool.self, forKey: .enabled) ?? true
        // 6 clusters is the documented default (H1-H6); the fallback used to be 3,
        // which contradicted the property documentation above.
        self.kClusters = try container.decodeIfPresent(UInt.self, forKey: .kClusters) ?? 6
        self.includeBbox = try container.decodeIfPresent(Bool.self, forKey: .includeBbox) ?? true
        self.ocrCoverageThreshold = try container.decodeIfPresent(Float.self, forKey: .ocrCoverageThreshold)
    }
}
// MARK: - Internal FFI conversions for HierarchyConfig
internal extension HierarchyConfig {
    /// Copies the four settings out of a bridged Rust reference via its accessors.
    init(_ source: RustBridge.HierarchyConfigRef) throws {
        enabled = source.enabled()
        kClusters = source.kClusters()
        includeBbox = source.includeBbox()
        ocrCoverageThreshold = source.ocrCoverageThreshold()
    }

    /// Builds the bridged Rust value; arguments are positional.
    func intoRust() throws -> RustBridge.HierarchyConfig {
        RustBridge.HierarchyConfig(
            enabled,
            kClusters,
            includeBbox,
            ocrCoverageThreshold
        )
    }
}
/// Post-processor configuration.
public struct PostProcessorConfig: Codable, Sendable, Hashable {
    /// Enable post-processors
    public let enabled: Bool

    /// Whitelist of processor names to run (None = all enabled)
    public let enabledProcessors: [String]?

    /// Blacklist of processor names to skip (None = none disabled)
    public let disabledProcessors: [String]?

    /// Pre-computed AHashSet for O(1) enabled processor lookup
    public let enabledSet: [String]?

    /// Pre-computed AHashSet for O(1) disabled processor lookup
    public let disabledSet: [String]?

    /// Creates a configuration from explicit values; every list defaults to `nil`.
    public init(
        enabled: Bool,
        enabledProcessors: [String]? = nil,
        disabledProcessors: [String]? = nil,
        enabledSet: [String]? = nil,
        disabledSet: [String]? = nil
    ) {
        self.enabled = enabled
        self.enabledProcessors = enabledProcessors
        self.disabledProcessors = disabledProcessors
        self.enabledSet = enabledSet
        self.disabledSet = disabledSet
    }

    /// Serialized (snake_case) names of the properties.
    private enum CodingKeys: String, CodingKey {
        case enabled
        case enabledProcessors = "enabled_processors"
        case disabledProcessors = "disabled_processors"
        case enabledSet = "enabled_set"
        case disabledSet = "disabled_set"
    }

    /// Decodes a configuration, tolerating missing keys:
    /// `enabled` falls back to `true` and every list stays `nil`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        enabled = try values.decodeIfPresent(Bool.self, forKey: .enabled) ?? true
        enabledProcessors = try values.decodeIfPresent([String].self, forKey: .enabledProcessors)
        disabledProcessors = try values.decodeIfPresent([String].self, forKey: .disabledProcessors)
        enabledSet = try values.decodeIfPresent([String].self, forKey: .enabledSet)
        disabledSet = try values.decodeIfPresent([String].self, forKey: .disabledSet)
    }
}
// MARK: - Internal FFI conversions for PostProcessorConfig
internal extension PostProcessorConfig {
    /// Copies every setting out of a bridged Rust reference, converting each
    /// optional list of Rust strings element-wise to Swift strings.
    init(_ rb: RustBridge.PostProcessorConfigRef) throws {
        self.enabled = rb.enabled()
        self.enabledProcessors = rb.enabledProcessors()?.map { $0.as_str().toString() }
        self.disabledProcessors = rb.disabledProcessors()?.map { $0.as_str().toString() }
        self.enabledSet = rb.enabledSet()?.map { $0.as_str().toString() }
        self.disabledSet = rb.disabledSet()?.map { $0.as_str().toString() }
    }

    /// Converts this value to its Rust counterpart by round-tripping through JSON.
    ///
    /// - Throws: An encoding error from `JSONEncoder`, or whatever
    ///   `RustBridge.postProcessorConfigFromJson` throws for the produced JSON.
    func intoRust() throws -> RustBridge.PostProcessorConfig {
        let data = try JSONEncoder().encode(self)
        // Decode the encoder's UTF-8 output non-failably. The previous
        // `String(data:encoding:) ?? "{}"` kept a fallback that, if ever taken,
        // would have handed Rust an empty object instead of the caller's settings.
        let json = String(decoding: data, as: UTF8.self)
        return try RustBridge.postProcessorConfigFromJson(json)
    }
}
/// Chunking configuration.
///
/// Configures text chunking for document content, including chunk size,
/// overlap, trimming behavior, and optional embeddings.
///
/// The example below is Rust, carried over from the upstream documentation.
/// Use `..Default::default()` when constructing to allow for future field additions:
/// ```rust
/// let config = ChunkingConfig {
/// max_characters: 500,
/// ..Default::default()
/// };
/// ```
///
/// Direct alias of the bridged `RustBridge.ChunkingConfig` type.
public typealias ChunkingConfig = RustBridge.ChunkingConfig
/// Embedding configuration for text chunks.
///
/// Configures embedding generation using ONNX models via the vendored embedding engine.
/// Requires the `embeddings` feature to be enabled.
///
/// Direct alias of the bridged `RustBridge.EmbeddingConfig` type.
public typealias EmbeddingConfig = RustBridge.EmbeddingConfig
/// Configuration for tree-sitter language pack integration.
///
/// Controls grammar download behavior and code analysis options.
///
/// # Example (TOML)
///
/// ```toml
/// [tree_sitter]
/// languages = ["python", "rust"]
/// groups = ["web"]
///
/// [tree_sitter.process]
/// structure = true
/// comments = true
/// docstrings = true
/// ```
///
/// Direct alias of the bridged `RustBridge.TreeSitterConfig` type.
public typealias TreeSitterConfig = RustBridge.TreeSitterConfig
/// Processing options for tree-sitter code analysis.
///
/// Controls which analysis features are enabled when extracting code files.
public struct TreeSitterProcessConfig: Codable, Sendable, Hashable {
    /// Extract structural items (functions, classes, structs, etc.). Default: true.
    public let structure: Bool

    /// Extract import statements. Default: true.
    public let imports: Bool

    /// Extract export statements. Default: true.
    public let exports: Bool

    /// Extract comments. Default: false.
    public let comments: Bool

    /// Extract docstrings. Default: false.
    public let docstrings: Bool

    /// Extract symbol definitions. Default: false.
    public let symbols: Bool

    /// Include parse diagnostics. Default: false.
    public let diagnostics: Bool

    /// Maximum chunk size in bytes. `None` disables chunking.
    public let chunkMaxSize: UInt?

    /// Content rendering mode for code extraction.
    public let contentMode: CodeContentMode

    /// Creates a configuration from explicit values; only `chunkMaxSize` has a default (`nil`).
    public init(
        structure: Bool,
        imports: Bool,
        exports: Bool,
        comments: Bool,
        docstrings: Bool,
        symbols: Bool,
        diagnostics: Bool,
        chunkMaxSize: UInt? = nil,
        contentMode: CodeContentMode
    ) {
        self.structure = structure
        self.imports = imports
        self.exports = exports
        self.comments = comments
        self.docstrings = docstrings
        self.symbols = symbols
        self.diagnostics = diagnostics
        self.chunkMaxSize = chunkMaxSize
        self.contentMode = contentMode
    }

    /// Serialized names of the properties; only the two multi-word keys need renaming.
    private enum CodingKeys: String, CodingKey {
        case structure
        case imports
        case exports
        case comments
        case docstrings
        case symbols
        case diagnostics
        case chunkMaxSize = "chunk_max_size"
        case contentMode = "content_mode"
    }

    /// Decodes a configuration.
    ///
    /// The boolean flags fall back to their documented defaults when absent and
    /// `chunkMaxSize` stays `nil`. `content_mode` has no fallback: decoding throws
    /// if that key is missing.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        structure = try values.decodeIfPresent(Bool.self, forKey: .structure) ?? true
        imports = try values.decodeIfPresent(Bool.self, forKey: .imports) ?? true
        exports = try values.decodeIfPresent(Bool.self, forKey: .exports) ?? true
        comments = try values.decodeIfPresent(Bool.self, forKey: .comments) ?? false
        docstrings = try values.decodeIfPresent(Bool.self, forKey: .docstrings) ?? false
        symbols = try values.decodeIfPresent(Bool.self, forKey: .symbols) ?? false
        diagnostics = try values.decodeIfPresent(Bool.self, forKey: .diagnostics) ?? false
        chunkMaxSize = try values.decodeIfPresent(UInt.self, forKey: .chunkMaxSize)
        contentMode = try values.decode(CodeContentMode.self, forKey: .contentMode)
    }
}
// MARK: - Internal FFI conversions for TreeSitterProcessConfig
internal extension TreeSitterProcessConfig {
    /// Copies every setting out of a bridged Rust reference.
    ///
    /// The content mode arrives as a string and is mapped onto `CodeContentMode`
    /// by raw value; an unrecognised string aborts the process via `fatalError`.
    init(_ source: RustBridge.TreeSitterProcessConfigRef) throws {
        structure = source.structure()
        imports = source.imports()
        exports = source.exports()
        comments = source.comments()
        docstrings = source.docstrings()
        symbols = source.symbols()
        diagnostics = source.diagnostics()
        chunkMaxSize = source.chunkMaxSize()

        let rawMode = source.contentMode().toString()
        guard let mode = CodeContentMode(rawValue: rawMode) else {
            fatalError("Unknown CodeContentMode: \(rawMode)")
        }
        contentMode = mode
    }

    /// Builds the bridged Rust value; arguments are positional.
    /// - Throws: Whatever `CodeContentMode.intoRust()` throws.
    func intoRust() throws -> RustBridge.TreeSitterProcessConfig {
        let bridgedMode = try contentMode.intoRust()
        return RustBridge.TreeSitterProcessConfig(
            structure,
            imports,
            exports,
            comments,
            docstrings,
            symbols,
            diagnostics,
            chunkMaxSize,
            bridgedMode
        )
    }
}
/// A supported document format entry.
///
/// Represents a file extension and its corresponding MIME type that Kreuzberg can process.
public struct SupportedFormat: Codable, Sendable, Hashable {
    /// File extension (without leading dot), e.g., "pdf", "docx"
    public let `extension`: String

    /// MIME type string, e.g., "application/pdf"
    public let mimeType: String

    /// Creates an entry pairing a file extension with its MIME type.
    public init(`extension`: String, mimeType: String) {
        self.mimeType = mimeType
        self.`extension` = `extension`
    }

    /// Serialized names; `mimeType` is written as `mime_type`.
    private enum CodingKeys: String, CodingKey {
        case `extension`
        case mimeType = "mime_type"
    }
}
// MARK: - Internal FFI conversions for SupportedFormat
internal extension SupportedFormat {
    /// Copies the extension and MIME type out of a bridged Rust reference,
    /// converting both to Swift strings.
    init(_ rb: RustBridge.SupportedFormatRef) throws {
        self.`extension` = rb.extension_().toString()
        self.mimeType = rb.mimeType().toString()
    }

    /// Converts this value to its Rust counterpart by round-tripping through JSON.
    ///
    /// - Throws: An encoding error from `JSONEncoder`, or whatever
    ///   `RustBridge.supportedFormatFromJson` throws for the produced JSON.
    func intoRust() throws -> RustBridge.SupportedFormat {
        let data = try JSONEncoder().encode(self)
        // Decode the encoder's UTF-8 output non-failably instead of keeping a
        // `?? "{}"` fallback that would replace the real payload with an empty object.
        let json = String(decoding: data, as: UTF8.self)
        return try RustBridge.supportedFormatFromJson(json)
    }
}
/// API server configuration.
///
/// This struct holds all configuration options for the Kreuzberg API server,
/// including host/port settings, CORS configuration, and upload limits.
///
/// # Defaults
///
/// - `host`: "127.0.0.1" (localhost only)
/// - `port`: 8000
/// - `cors_origins`: empty vector (allows all origins)
/// - `max_request_body_bytes`: 104_857_600 (100 MB)
/// - `max_multipart_field_bytes`: 104_857_600 (100 MB)
public struct ServerConfig: Codable, Sendable, Hashable {
    /// Server host address (e.g., "127.0.0.1", "0.0.0.0")
    public let host: String

    /// Server port number
    public let port: UInt16

    /// CORS allowed origins. Empty vector means allow all origins.
    ///
    /// If this is an empty vector, the server will accept requests from any origin.
    /// If populated with specific origins (e.g., `"https://example.com"`), only
    /// those origins will be allowed.
    public let corsOrigins: [String]

    /// Maximum size of request body in bytes (default: 100 MB)
    public let maxRequestBodyBytes: UInt

    /// Maximum size of multipart fields in bytes (default: 100 MB)
    public let maxMultipartFieldBytes: UInt

    /// Creates a configuration from explicit values; no defaults are applied here.
    public init(host: String, port: UInt16, corsOrigins: [String], maxRequestBodyBytes: UInt, maxMultipartFieldBytes: UInt) {
        self.host = host
        self.port = port
        self.corsOrigins = corsOrigins
        self.maxRequestBodyBytes = maxRequestBodyBytes
        self.maxMultipartFieldBytes = maxMultipartFieldBytes
    }

    /// Serialized (snake_case) names of the properties.
    private enum CodingKeys: String, CodingKey {
        case host = "host"
        case port = "port"
        case corsOrigins = "cors_origins"
        case maxRequestBodyBytes = "max_request_body_bytes"
        case maxMultipartFieldBytes = "max_multipart_field_bytes"
    }

    /// Decodes a configuration, substituting the documented defaults for keys
    /// that are absent or null.
    ///
    /// Previously a missing key decoded to an empty host, port 0 and 0-byte
    /// limits, which contradicted the defaults listed on the type.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.host = try container.decodeIfPresent(String.self, forKey: .host) ?? "127.0.0.1"
        self.port = try container.decodeIfPresent(UInt16.self, forKey: .port) ?? 8000
        self.corsOrigins = try container.decodeIfPresent([String].self, forKey: .corsOrigins) ?? []
        // 104_857_600 bytes = 100 MB, for both upload limits.
        self.maxRequestBodyBytes = try container.decodeIfPresent(UInt.self, forKey: .maxRequestBodyBytes) ?? 104_857_600
        self.maxMultipartFieldBytes = try container.decodeIfPresent(UInt.self, forKey: .maxMultipartFieldBytes) ?? 104_857_600
    }
}
// MARK: - Internal FFI conversions for ServerConfig
internal extension ServerConfig {
    /// Copies every setting out of a bridged Rust reference, converting the host
    /// and each CORS origin to Swift strings.
    init(_ source: RustBridge.ServerConfigRef) throws {
        host = source.host().toString()
        port = source.port()
        corsOrigins = source.corsOrigins().map { $0.as_str().toString() }
        maxRequestBodyBytes = source.maxRequestBodyBytes()
        maxMultipartFieldBytes = source.maxMultipartFieldBytes()
    }

    /// Builds the bridged Rust value.
    ///
    /// The CORS origins are pushed one by one into a fresh `RustVec<RustString>`
    /// before the positional bridge initializer is called.
    func intoRust() throws -> RustBridge.ServerConfig {
        let origins = RustVec<RustString>()
        for origin in corsOrigins {
            origins.push(value: RustString(origin))
        }
        return RustBridge.ServerConfig(
            RustString(host),
            port,
            origins,
            maxRequestBodyBytes,
            maxMultipartFieldBytes
        )
    }
}
/// Direct alias of the bridged `RustBridge.StructuredDataResult` type; no
/// Swift-side wrapper struct is generated for it.
public typealias StructuredDataResult = RustBridge.StructuredDataResult
/// Application properties from docProps/app.xml for DOCX
///
/// Contains Word-specific document statistics and metadata.
public struct DocxAppProperties: Codable, Sendable, Hashable {
    /// Application name (e.g., "Microsoft Office Word")
    public let application: String?

    /// Application version
    public let appVersion: String?

    /// Template filename
    public let template: String?

    /// Total editing time in minutes
    public let totalTime: Int32?

    /// Number of pages
    public let pages: Int32?

    /// Number of words
    public let words: Int32?

    /// Number of characters (excluding spaces)
    public let characters: Int32?

    /// Number of characters (including spaces)
    public let charactersWithSpaces: Int32?

    /// Number of lines
    public let lines: Int32?

    /// Number of paragraphs
    public let paragraphs: Int32?

    /// Company name
    public let company: String?

    /// Document security level
    public let docSecurity: Int32?

    /// Scale crop flag
    public let scaleCrop: Bool?

    /// Links up to date flag
    public let linksUpToDate: Bool?

    /// Shared document flag
    public let sharedDoc: Bool?

    /// Hyperlinks changed flag
    public let hyperlinksChanged: Bool?

    /// Creates a property set; every field is optional and defaults to `nil`.
    public init(
        application: String? = nil,
        appVersion: String? = nil,
        template: String? = nil,
        totalTime: Int32? = nil,
        pages: Int32? = nil,
        words: Int32? = nil,
        characters: Int32? = nil,
        charactersWithSpaces: Int32? = nil,
        lines: Int32? = nil,
        paragraphs: Int32? = nil,
        company: String? = nil,
        docSecurity: Int32? = nil,
        scaleCrop: Bool? = nil,
        linksUpToDate: Bool? = nil,
        sharedDoc: Bool? = nil,
        hyperlinksChanged: Bool? = nil
    ) {
        self.application = application
        self.appVersion = appVersion
        self.template = template
        self.totalTime = totalTime
        self.pages = pages
        self.words = words
        self.characters = characters
        self.charactersWithSpaces = charactersWithSpaces
        self.lines = lines
        self.paragraphs = paragraphs
        self.company = company
        self.docSecurity = docSecurity
        self.scaleCrop = scaleCrop
        self.linksUpToDate = linksUpToDate
        self.sharedDoc = sharedDoc
        self.hyperlinksChanged = hyperlinksChanged
    }

    /// Serialized (snake_case) names of the properties.
    private enum CodingKeys: String, CodingKey {
        case application
        case appVersion = "app_version"
        case template
        case totalTime = "total_time"
        case pages
        case words
        case characters
        case charactersWithSpaces = "characters_with_spaces"
        case lines
        case paragraphs
        case company
        case docSecurity = "doc_security"
        case scaleCrop = "scale_crop"
        case linksUpToDate = "links_up_to_date"
        case sharedDoc = "shared_doc"
        case hyperlinksChanged = "hyperlinks_changed"
    }

    /// Decodes a property set; any key that is absent or null leaves its field `nil`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        application = try values.decodeIfPresent(String.self, forKey: .application)
        appVersion = try values.decodeIfPresent(String.self, forKey: .appVersion)
        template = try values.decodeIfPresent(String.self, forKey: .template)
        totalTime = try values.decodeIfPresent(Int32.self, forKey: .totalTime)
        pages = try values.decodeIfPresent(Int32.self, forKey: .pages)
        words = try values.decodeIfPresent(Int32.self, forKey: .words)
        characters = try values.decodeIfPresent(Int32.self, forKey: .characters)
        charactersWithSpaces = try values.decodeIfPresent(Int32.self, forKey: .charactersWithSpaces)
        lines = try values.decodeIfPresent(Int32.self, forKey: .lines)
        paragraphs = try values.decodeIfPresent(Int32.self, forKey: .paragraphs)
        company = try values.decodeIfPresent(String.self, forKey: .company)
        docSecurity = try values.decodeIfPresent(Int32.self, forKey: .docSecurity)
        scaleCrop = try values.decodeIfPresent(Bool.self, forKey: .scaleCrop)
        linksUpToDate = try values.decodeIfPresent(Bool.self, forKey: .linksUpToDate)
        sharedDoc = try values.decodeIfPresent(Bool.self, forKey: .sharedDoc)
        hyperlinksChanged = try values.decodeIfPresent(Bool.self, forKey: .hyperlinksChanged)
    }
}
// MARK: - Internal FFI conversions for DocxAppProperties
internal extension DocxAppProperties {
    /// Copies every property out of a bridged Rust reference, converting the
    /// optional Rust strings to Swift strings.
    init(_ source: RustBridge.DocxAppPropertiesRef) throws {
        application = source.application()?.toString()
        appVersion = source.appVersion()?.toString()
        template = source.template()?.toString()
        totalTime = source.totalTime()
        pages = source.pages()
        words = source.words()
        characters = source.characters()
        charactersWithSpaces = source.charactersWithSpaces()
        lines = source.lines()
        paragraphs = source.paragraphs()
        company = source.company()?.toString()
        docSecurity = source.docSecurity()
        scaleCrop = source.scaleCrop()
        linksUpToDate = source.linksUpToDate()
        sharedDoc = source.sharedDoc()
        hyperlinksChanged = source.hyperlinksChanged()
    }

    /// Builds the bridged Rust value.
    ///
    /// Optional strings are wrapped in `RustString`; the bridge initializer is
    /// positional, so the argument order below must not change.
    func intoRust() throws -> RustBridge.DocxAppProperties {
        RustBridge.DocxAppProperties(
            application.map(RustString.init),
            appVersion.map(RustString.init),
            template.map(RustString.init),
            totalTime,
            pages,
            words,
            characters,
            charactersWithSpaces,
            lines,
            paragraphs,
            company.map(RustString.init),
            docSecurity,
            scaleCrop,
            linksUpToDate,
            sharedDoc,
            hyperlinksChanged
        )
    }
}
/// Application properties from docProps/app.xml for XLSX
///
/// Contains Excel-specific document metadata.
public struct XlsxAppProperties: Codable, Sendable, Hashable {
    /// Application name (e.g., "Microsoft Excel")
    public let application: String?

    /// Application version
    public let appVersion: String?

    /// Document security level
    public let docSecurity: Int32?

    /// Scale crop flag
    public let scaleCrop: Bool?

    /// Links up to date flag
    public let linksUpToDate: Bool?

    /// Shared document flag
    public let sharedDoc: Bool?

    /// Hyperlinks changed flag
    public let hyperlinksChanged: Bool?

    /// Company name
    public let company: String?

    /// Worksheet names
    public let worksheetNames: [String]

    /// Creates a property set; every optional field defaults to `nil`, while
    /// `worksheetNames` must always be supplied.
    public init(
        application: String? = nil,
        appVersion: String? = nil,
        docSecurity: Int32? = nil,
        scaleCrop: Bool? = nil,
        linksUpToDate: Bool? = nil,
        sharedDoc: Bool? = nil,
        hyperlinksChanged: Bool? = nil,
        company: String? = nil,
        worksheetNames: [String]
    ) {
        self.application = application
        self.appVersion = appVersion
        self.docSecurity = docSecurity
        self.scaleCrop = scaleCrop
        self.linksUpToDate = linksUpToDate
        self.sharedDoc = sharedDoc
        self.hyperlinksChanged = hyperlinksChanged
        self.company = company
        self.worksheetNames = worksheetNames
    }

    /// Serialized (snake_case) names of the properties.
    private enum CodingKeys: String, CodingKey {
        case application
        case appVersion = "app_version"
        case docSecurity = "doc_security"
        case scaleCrop = "scale_crop"
        case linksUpToDate = "links_up_to_date"
        case sharedDoc = "shared_doc"
        case hyperlinksChanged = "hyperlinks_changed"
        case company
        case worksheetNames = "worksheet_names"
    }

    /// Decodes a property set; absent optional fields stay `nil` and an absent
    /// `worksheet_names` key yields an empty array.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        application = try values.decodeIfPresent(String.self, forKey: .application)
        appVersion = try values.decodeIfPresent(String.self, forKey: .appVersion)
        docSecurity = try values.decodeIfPresent(Int32.self, forKey: .docSecurity)
        scaleCrop = try values.decodeIfPresent(Bool.self, forKey: .scaleCrop)
        linksUpToDate = try values.decodeIfPresent(Bool.self, forKey: .linksUpToDate)
        sharedDoc = try values.decodeIfPresent(Bool.self, forKey: .sharedDoc)
        hyperlinksChanged = try values.decodeIfPresent(Bool.self, forKey: .hyperlinksChanged)
        company = try values.decodeIfPresent(String.self, forKey: .company)
        worksheetNames = try values.decodeIfPresent([String].self, forKey: .worksheetNames) ?? []
    }
}
// MARK: - Internal FFI conversions for XlsxAppProperties
internal extension XlsxAppProperties {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the field values.
    init(_ ref: RustBridge.XlsxAppPropertiesRef) throws {
        application = ref.application()?.toString()
        appVersion = ref.appVersion()?.toString()
        docSecurity = ref.docSecurity()
        scaleCrop = ref.scaleCrop()
        linksUpToDate = ref.linksUpToDate()
        sharedDoc = ref.sharedDoc()
        hyperlinksChanged = ref.hyperlinksChanged()
        company = ref.company()?.toString()
        worksheetNames = ref.worksheetNames().map { name in name.as_str().toString() }
    }

    /// Converts this value into its bridge counterpart.
    ///
    /// - Returns: A `RustBridge.XlsxAppProperties` built from the stored fields.
    func intoRust() throws -> RustBridge.XlsxAppProperties {
        // Worksheet names are copied one by one into a bridge vector.
        let names = RustVec<RustString>()
        worksheetNames.forEach { names.push(value: RustString($0)) }

        return RustBridge.XlsxAppProperties(
            application.map(RustString.init),
            appVersion.map(RustString.init),
            docSecurity,
            scaleCrop,
            linksUpToDate,
            sharedDoc,
            hyperlinksChanged,
            company.map(RustString.init),
            names
        )
    }
}
/// PowerPoint application properties read from `docProps/app.xml`.
///
/// Holds the PPTX-specific document metadata. Every key is optional when
/// decoding: an absent optional field decodes to `nil` and an absent
/// `slide_titles` decodes to an empty array.
public struct PptxAppProperties: Codable, Sendable, Hashable {
    /// Application name (e.g., "Microsoft Office PowerPoint")
    public let application: String?
    /// Application version
    public let appVersion: String?
    /// Total editing time in minutes
    public let totalTime: Int32?
    /// Company name
    public let company: String?
    /// Document security level
    public let docSecurity: Int32?
    /// Scale crop flag
    public let scaleCrop: Bool?
    /// Links up to date flag
    public let linksUpToDate: Bool?
    /// Shared document flag
    public let sharedDoc: Bool?
    /// Hyperlinks changed flag
    public let hyperlinksChanged: Bool?
    /// Number of slides
    public let slides: Int32?
    /// Number of notes
    public let notes: Int32?
    /// Number of hidden slides
    public let hiddenSlides: Int32?
    /// Number of multimedia clips
    public let multimediaClips: Int32?
    /// Presentation format (e.g., "Widescreen", "Standard")
    public let presentationFormat: String?
    /// Slide titles
    public let slideTitles: [String]

    /// Memberwise initializer; only `slideTitles` has no default.
    public init(
        application: String? = nil,
        appVersion: String? = nil,
        totalTime: Int32? = nil,
        company: String? = nil,
        docSecurity: Int32? = nil,
        scaleCrop: Bool? = nil,
        linksUpToDate: Bool? = nil,
        sharedDoc: Bool? = nil,
        hyperlinksChanged: Bool? = nil,
        slides: Int32? = nil,
        notes: Int32? = nil,
        hiddenSlides: Int32? = nil,
        multimediaClips: Int32? = nil,
        presentationFormat: String? = nil,
        slideTitles: [String]
    ) {
        self.application = application
        self.appVersion = appVersion
        self.totalTime = totalTime
        self.company = company
        self.docSecurity = docSecurity
        self.scaleCrop = scaleCrop
        self.linksUpToDate = linksUpToDate
        self.sharedDoc = sharedDoc
        self.hyperlinksChanged = hyperlinksChanged
        self.slides = slides
        self.notes = notes
        self.hiddenSlides = hiddenSlides
        self.multimediaClips = multimediaClips
        self.presentationFormat = presentationFormat
        self.slideTitles = slideTitles
    }

    /// Snake-case keys used in the serialized form.
    private enum CodingKeys: String, CodingKey {
        case application
        case appVersion = "app_version"
        case totalTime = "total_time"
        case company
        case docSecurity = "doc_security"
        case scaleCrop = "scale_crop"
        case linksUpToDate = "links_up_to_date"
        case sharedDoc = "shared_doc"
        case hyperlinksChanged = "hyperlinks_changed"
        case slides
        case notes
        case hiddenSlides = "hidden_slides"
        case multimediaClips = "multimedia_clips"
        case presentationFormat = "presentation_format"
        case slideTitles = "slide_titles"
    }

    /// Decodes the properties, tolerating any missing key.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        application = try values.decodeIfPresent(String.self, forKey: .application)
        appVersion = try values.decodeIfPresent(String.self, forKey: .appVersion)
        totalTime = try values.decodeIfPresent(Int32.self, forKey: .totalTime)
        company = try values.decodeIfPresent(String.self, forKey: .company)
        docSecurity = try values.decodeIfPresent(Int32.self, forKey: .docSecurity)
        scaleCrop = try values.decodeIfPresent(Bool.self, forKey: .scaleCrop)
        linksUpToDate = try values.decodeIfPresent(Bool.self, forKey: .linksUpToDate)
        sharedDoc = try values.decodeIfPresent(Bool.self, forKey: .sharedDoc)
        hyperlinksChanged = try values.decodeIfPresent(Bool.self, forKey: .hyperlinksChanged)
        slides = try values.decodeIfPresent(Int32.self, forKey: .slides)
        notes = try values.decodeIfPresent(Int32.self, forKey: .notes)
        hiddenSlides = try values.decodeIfPresent(Int32.self, forKey: .hiddenSlides)
        multimediaClips = try values.decodeIfPresent(Int32.self, forKey: .multimediaClips)
        presentationFormat = try values.decodeIfPresent(String.self, forKey: .presentationFormat)
        // A missing title list is treated as "no titles".
        slideTitles = try values.decodeIfPresent([String].self, forKey: .slideTitles) ?? []
    }
}
// MARK: - Internal FFI conversions for PptxAppProperties
internal extension PptxAppProperties {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the field values.
    init(_ ref: RustBridge.PptxAppPropertiesRef) throws {
        application = ref.application()?.toString()
        appVersion = ref.appVersion()?.toString()
        totalTime = ref.totalTime()
        company = ref.company()?.toString()
        docSecurity = ref.docSecurity()
        scaleCrop = ref.scaleCrop()
        linksUpToDate = ref.linksUpToDate()
        sharedDoc = ref.sharedDoc()
        hyperlinksChanged = ref.hyperlinksChanged()
        slides = ref.slides()
        notes = ref.notes()
        hiddenSlides = ref.hiddenSlides()
        multimediaClips = ref.multimediaClips()
        presentationFormat = ref.presentationFormat()?.toString()
        slideTitles = ref.slideTitles().map { title in title.as_str().toString() }
    }

    /// Converts this value into its bridge counterpart.
    ///
    /// - Returns: A `RustBridge.PptxAppProperties` built from the stored fields.
    func intoRust() throws -> RustBridge.PptxAppProperties {
        // Slide titles are copied one by one into a bridge vector.
        let titles = RustVec<RustString>()
        slideTitles.forEach { titles.push(value: RustString($0)) }

        return RustBridge.PptxAppProperties(
            application.map(RustString.init),
            appVersion.map(RustString.init),
            totalTime,
            company.map(RustString.init),
            docSecurity,
            scaleCrop,
            linksUpToDate,
            sharedDoc,
            hyperlinksChanged,
            slides,
            notes,
            hiddenSlides,
            multimediaClips,
            presentationFormat.map(RustString.init),
            titles
        )
    }
}
/// Core document metadata read from `docProps/core.xml`.
///
/// Covers the standard Dublin Core fields together with the Office-specific
/// extensions. Every field is an optional string, and every key may be absent
/// when decoding.
public struct CoreProperties: Codable, Sendable, Hashable {
    /// Document title
    public let title: String?
    /// Document subject/topic
    public let subject: String?
    /// Document creator/author
    public let creator: String?
    /// Keywords or tags
    public let keywords: String?
    /// Document description/abstract
    public let description: String?
    /// User who last modified the document
    public let lastModifiedBy: String?
    /// Revision number
    public let revision: String?
    /// Creation timestamp (ISO 8601)
    public let created: String?
    /// Last modification timestamp (ISO 8601)
    public let modified: String?
    /// Document category
    public let category: String?
    /// Content status (Draft, Final, etc.)
    public let contentStatus: String?
    /// Document language
    public let language: String?
    /// Unique identifier
    public let identifier: String?
    /// Document version
    public let version: String?
    /// Last print timestamp (ISO 8601)
    public let lastPrinted: String?

    /// Memberwise initializer; every argument defaults to `nil`.
    public init(
        title: String? = nil,
        subject: String? = nil,
        creator: String? = nil,
        keywords: String? = nil,
        description: String? = nil,
        lastModifiedBy: String? = nil,
        revision: String? = nil,
        created: String? = nil,
        modified: String? = nil,
        category: String? = nil,
        contentStatus: String? = nil,
        language: String? = nil,
        identifier: String? = nil,
        version: String? = nil,
        lastPrinted: String? = nil
    ) {
        self.title = title
        self.subject = subject
        self.creator = creator
        self.keywords = keywords
        self.description = description
        self.lastModifiedBy = lastModifiedBy
        self.revision = revision
        self.created = created
        self.modified = modified
        self.category = category
        self.contentStatus = contentStatus
        self.language = language
        self.identifier = identifier
        self.version = version
        self.lastPrinted = lastPrinted
    }

    /// Snake-case keys used in the serialized form.
    private enum CodingKeys: String, CodingKey {
        case title
        case subject
        case creator
        case keywords
        case description
        case lastModifiedBy = "last_modified_by"
        case revision
        case created
        case modified
        case category
        case contentStatus = "content_status"
        case language
        case identifier
        case version
        case lastPrinted = "last_printed"
    }

    /// Decodes the properties, leaving any missing key as `nil`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)

        // All fields share the same type, so one local reader covers them.
        func text(_ key: CodingKeys) throws -> String? {
            try values.decodeIfPresent(String.self, forKey: key)
        }

        title = try text(.title)
        subject = try text(.subject)
        creator = try text(.creator)
        keywords = try text(.keywords)
        description = try text(.description)
        lastModifiedBy = try text(.lastModifiedBy)
        revision = try text(.revision)
        created = try text(.created)
        modified = try text(.modified)
        category = try text(.category)
        contentStatus = try text(.contentStatus)
        language = try text(.language)
        identifier = try text(.identifier)
        version = try text(.version)
        lastPrinted = try text(.lastPrinted)
    }
}
// MARK: - Internal FFI conversions for CoreProperties
internal extension CoreProperties {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the field values.
    init(_ ref: RustBridge.CorePropertiesRef) throws {
        title = ref.title()?.toString()
        subject = ref.subject()?.toString()
        creator = ref.creator()?.toString()
        keywords = ref.keywords()?.toString()
        description = ref.description()?.toString()
        lastModifiedBy = ref.lastModifiedBy()?.toString()
        revision = ref.revision()?.toString()
        created = ref.created()?.toString()
        modified = ref.modified()?.toString()
        category = ref.category()?.toString()
        contentStatus = ref.contentStatus()?.toString()
        language = ref.language()?.toString()
        identifier = ref.identifier()?.toString()
        version = ref.version()?.toString()
        lastPrinted = ref.lastPrinted()?.toString()
    }

    /// Converts this value into its bridge counterpart.
    ///
    /// - Returns: A `RustBridge.CoreProperties` built from the stored fields.
    func intoRust() throws -> RustBridge.CoreProperties {
        return RustBridge.CoreProperties(
            title.map(RustString.init),
            subject.map(RustString.init),
            creator.map(RustString.init),
            keywords.map(RustString.init),
            description.map(RustString.init),
            lastModifiedBy.map(RustString.init),
            revision.map(RustString.init),
            created.map(RustString.init),
            modified.map(RustString.init),
            category.map(RustString.init),
            contentStatus.map(RustString.init),
            language.map(RustString.init),
            identifier.map(RustString.init),
            version.map(RustString.init),
            lastPrinted.map(RustString.init)
        )
    }
}
/// Configuration for security limits across extractors.
///
/// All limits are intentionally conservative to prevent DoS attacks
/// while still supporting legitimate documents.
///
/// The memberwise initializer and the decoder share one set of defaults, so
/// `SecurityLimits()` equals the value decoded from an empty object.
public struct SecurityLimits: Codable, Sendable, Hashable {
    /// Maximum uncompressed size for archives. Default: 524,288,000 bytes (500 MiB).
    public let maxArchiveSize: UInt
    /// Maximum compression ratio before flagging as potential bomb. Default: 100 (100:1).
    public let maxCompressionRatio: UInt
    /// Maximum number of files in archive. Default: 10,000.
    public let maxFilesInArchive: UInt
    /// Maximum nesting depth for structures. Default: 1,024.
    public let maxNestingDepth: UInt
    /// Maximum length of any single XML entity / attribute / token.
    /// Default: 1,048,576 bytes (1 MiB).
    ///
    /// This is a per-token cap, NOT a total cap: billion-laughs class
    /// attacks where a single entity expands to hundreds of MB are caught
    /// here, while normal long text content (a paragraph, a CDATA block) is
    /// caught by `max_content_size` instead.
    public let maxEntityLength: UInt
    /// Maximum string growth per document. Default: 104,857,600 bytes (100 MiB).
    public let maxContentSize: UInt
    /// Maximum iterations per operation. Default: 10,000,000.
    public let maxIterations: UInt
    /// Maximum XML depth. Default: 1,024 levels.
    public let maxXmlDepth: UInt
    /// Maximum cells per table. Default: 100,000.
    public let maxTableCells: UInt

    /// Creates a set of limits.
    ///
    /// Each argument defaults to the value the decoder uses when the
    /// corresponding key is missing, so callers only need to pass the limits
    /// they want to override.
    public init(
        maxArchiveSize: UInt = 524_288_000,
        maxCompressionRatio: UInt = 100,
        maxFilesInArchive: UInt = 10_000,
        maxNestingDepth: UInt = 1_024,
        maxEntityLength: UInt = 1_048_576,
        maxContentSize: UInt = 104_857_600,
        maxIterations: UInt = 10_000_000,
        maxXmlDepth: UInt = 1_024,
        maxTableCells: UInt = 100_000
    ) {
        self.maxArchiveSize = maxArchiveSize
        self.maxCompressionRatio = maxCompressionRatio
        self.maxFilesInArchive = maxFilesInArchive
        self.maxNestingDepth = maxNestingDepth
        self.maxEntityLength = maxEntityLength
        self.maxContentSize = maxContentSize
        self.maxIterations = maxIterations
        self.maxXmlDepth = maxXmlDepth
        self.maxTableCells = maxTableCells
    }

    /// Snake-case keys used in the serialized form.
    private enum CodingKeys: String, CodingKey {
        case maxArchiveSize = "max_archive_size"
        case maxCompressionRatio = "max_compression_ratio"
        case maxFilesInArchive = "max_files_in_archive"
        case maxNestingDepth = "max_nesting_depth"
        case maxEntityLength = "max_entity_length"
        case maxContentSize = "max_content_size"
        case maxIterations = "max_iterations"
        case maxXmlDepth = "max_xml_depth"
        case maxTableCells = "max_table_cells"
    }

    /// Decodes the limits, substituting the default for any missing key.
    ///
    /// - Throws: Any error raised by the decoder, e.g. for a value of the wrong type.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        // Single source of truth for the fallbacks: the initializer defaults.
        let fallback = SecurityLimits()

        self.maxArchiveSize = try container.decodeIfPresent(UInt.self, forKey: .maxArchiveSize) ?? fallback.maxArchiveSize
        self.maxCompressionRatio = try container.decodeIfPresent(UInt.self, forKey: .maxCompressionRatio) ?? fallback.maxCompressionRatio
        self.maxFilesInArchive = try container.decodeIfPresent(UInt.self, forKey: .maxFilesInArchive) ?? fallback.maxFilesInArchive
        self.maxNestingDepth = try container.decodeIfPresent(UInt.self, forKey: .maxNestingDepth) ?? fallback.maxNestingDepth
        self.maxEntityLength = try container.decodeIfPresent(UInt.self, forKey: .maxEntityLength) ?? fallback.maxEntityLength
        self.maxContentSize = try container.decodeIfPresent(UInt.self, forKey: .maxContentSize) ?? fallback.maxContentSize
        self.maxIterations = try container.decodeIfPresent(UInt.self, forKey: .maxIterations) ?? fallback.maxIterations
        self.maxXmlDepth = try container.decodeIfPresent(UInt.self, forKey: .maxXmlDepth) ?? fallback.maxXmlDepth
        self.maxTableCells = try container.decodeIfPresent(UInt.self, forKey: .maxTableCells) ?? fallback.maxTableCells
    }
}
// MARK: - Internal FFI conversions for SecurityLimits
internal extension SecurityLimits {
    /// Creates the Swift value by reading each limit from a bridge reference.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the limits.
    init(_ ref: RustBridge.SecurityLimitsRef) throws {
        maxArchiveSize = ref.maxArchiveSize()
        maxCompressionRatio = ref.maxCompressionRatio()
        maxFilesInArchive = ref.maxFilesInArchive()
        maxNestingDepth = ref.maxNestingDepth()
        maxEntityLength = ref.maxEntityLength()
        maxContentSize = ref.maxContentSize()
        maxIterations = ref.maxIterations()
        maxXmlDepth = ref.maxXmlDepth()
        maxTableCells = ref.maxTableCells()
    }

    /// Converts this value into its bridge counterpart.
    ///
    /// - Returns: A `RustBridge.SecurityLimits` built from the stored limits.
    func intoRust() throws -> RustBridge.SecurityLimits {
        return RustBridge.SecurityLimits(
            maxArchiveSize,
            maxCompressionRatio,
            maxFilesInArchive,
            maxNestingDepth,
            maxEntityLength,
            maxContentSize,
            maxIterations,
            maxXmlDepth,
            maxTableCells
        )
    }
}
/// Public alias for `RustBridge.TokenReductionConfig`.
///
/// Declared as a type alias, so this name and the bridge type are the same
/// type; no separate Swift struct or conversion exists for it here.
public typealias TokenReductionConfig = RustBridge.TokenReductionConfig
/// One annotation extracted from a page of a PDF document.
public struct PdfAnnotation: Codable, Sendable, Hashable {
    /// The type of annotation.
    public let annotationType: PdfAnnotationType
    /// Text content of the annotation (e.g., comment text, link URL).
    public let content: String?
    /// Page number where the annotation appears (1-indexed).
    public let pageNumber: UInt32
    /// Bounding box of the annotation on the page.
    public let boundingBox: BoundingBox?

    /// Memberwise initializer; `content` and `boundingBox` default to `nil`.
    public init(
        annotationType: PdfAnnotationType,
        content: String? = nil,
        pageNumber: UInt32,
        boundingBox: BoundingBox? = nil
    ) {
        self.annotationType = annotationType
        self.content = content
        self.pageNumber = pageNumber
        self.boundingBox = boundingBox
    }

    /// Snake-case keys used in the serialized form.
    private enum CodingKeys: String, CodingKey {
        case annotationType = "annotation_type"
        case content
        case pageNumber = "page_number"
        case boundingBox = "bounding_box"
    }
}
// MARK: - Internal FFI conversions for PdfAnnotation
internal extension PdfAnnotation {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter rb: Bridge reference whose accessors supply the field values.
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports an
    ///   annotation type that has no matching `PdfAnnotationType` case, plus
    ///   any error thrown while converting the bounding box.
    init(_ rb: RustBridge.PdfAnnotationRef) throws {
        // Read the raw type once; an unknown value is reported to the caller
        // through the existing `throws` channel instead of ending the process.
        let rawType = rb.annotationType().toString()
        guard let annotationType = PdfAnnotationType(rawValue: rawType) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(
                    codingPath: [],
                    debugDescription: "Unknown PdfAnnotationType: \(rawType)"
                )
            )
        }
        self.annotationType = annotationType
        self.content = rb.content()?.toString()
        self.pageNumber = rb.pageNumber()
        self.boundingBox = try rb.boundingBox().map { try BoundingBox($0) }
    }

    /// Converts this value into its bridge counterpart by way of JSON.
    ///
    /// The value is encoded with `JSONEncoder` and the resulting text is passed
    /// to `RustBridge.pdfAnnotationFromJson`.
    ///
    /// - Throws: Encoding errors, and any error from the bridge parser.
    func intoRust() throws -> RustBridge.PdfAnnotation {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.pdfAnnotationFromJson(json)
    }
}
/// Comprehensive Djot document structure with semantic preservation.
///
/// This type captures the full richness of Djot markup, including:
/// - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
/// - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
/// - Attributes (classes, IDs, key-value pairs)
/// - Links, images, footnotes
/// - Math expressions (inline and display)
/// - Tables with full structure
///
/// Available when the `djot` feature is enabled.
///
/// Declared as a type alias, so this name and `RustBridge.DjotContent` are the same type.
public typealias DjotContent = RustBridge.DjotContent
/// Block-level element in a Djot document.
///
/// Represents structural elements like headings, paragraphs, lists, code blocks, etc.
///
/// Declared as a type alias, so this name and `RustBridge.FormattedBlock` are the same type.
public typealias FormattedBlock = RustBridge.FormattedBlock
/// Inline element within a block.
///
/// Represents text with formatting, links, images, etc.
///
/// Declared as a type alias, so this name and `RustBridge.InlineElement` are the same type.
public typealias InlineElement = RustBridge.InlineElement
/// An image element of a Djot document.
///
/// Codable conformance is synthesized, so the serialized keys are the
/// property names themselves.
public struct DjotImage: Codable, Sendable, Hashable {
    /// Image source URL or path
    public let src: String
    /// Alternative text
    public let alt: String
    /// Optional title
    public let title: String?
    /// Element attributes
    public let attributes: String?

    /// Memberwise initializer; `title` and `attributes` default to `nil`.
    public init(
        src: String,
        alt: String,
        title: String? = nil,
        attributes: String? = nil
    ) {
        self.src = src
        self.alt = alt
        self.title = title
        self.attributes = attributes
    }
}
// MARK: - Internal FFI conversions for DjotImage
internal extension DjotImage {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the field values.
    init(_ ref: RustBridge.DjotImageRef) throws {
        src = ref.src().toString()
        alt = ref.alt().toString()
        title = ref.title()?.toString()
        attributes = ref.attributes()?.toString()
    }

    /// Converts this value into its bridge counterpart by way of JSON.
    ///
    /// - Throws: Encoding errors, and any error from `RustBridge.djotImageFromJson`.
    func intoRust() throws -> RustBridge.DjotImage {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, an empty JSON object is sent instead.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.djotImageFromJson("{}")
        }
        return try RustBridge.djotImageFromJson(payload)
    }
}
/// A link element of a Djot document.
///
/// Codable conformance is synthesized, so the serialized keys are the
/// property names themselves.
public struct DjotLink: Codable, Sendable, Hashable {
    /// Link URL
    public let url: String
    /// Link text content
    public let text: String
    /// Optional title
    public let title: String?
    /// Element attributes
    public let attributes: String?

    /// Memberwise initializer; `title` and `attributes` default to `nil`.
    public init(
        url: String,
        text: String,
        title: String? = nil,
        attributes: String? = nil
    ) {
        self.url = url
        self.text = text
        self.title = title
        self.attributes = attributes
    }
}
// MARK: - Internal FFI conversions for DjotLink
internal extension DjotLink {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the field values.
    init(_ ref: RustBridge.DjotLinkRef) throws {
        url = ref.url().toString()
        text = ref.text().toString()
        title = ref.title()?.toString()
        attributes = ref.attributes()?.toString()
    }

    /// Converts this value into its bridge counterpart by way of JSON.
    ///
    /// - Throws: Encoding errors, and any error from `RustBridge.djotLinkFromJson`.
    func intoRust() throws -> RustBridge.DjotLink {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, an empty JSON object is sent instead.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.djotLinkFromJson("{}")
        }
        return try RustBridge.djotLinkFromJson(payload)
    }
}
/// Footnote in Djot.
///
/// Declared as a type alias, so this name and `RustBridge.Footnote` are the same type.
public typealias Footnote = RustBridge.Footnote
/// Top-level structured document representation.
///
/// A flat array of nodes with index-based parent/child references forming a tree.
/// Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
/// to iterate over top-level content by layer.
///
/// # Validation
///
/// Call `validate()` after construction to verify all node indices are in bounds
/// and parent-child relationships are bidirectionally consistent.
///
/// Declared as a type alias, so this name and `RustBridge.DocumentStructure` are the same type.
/// NOTE(review): the member names above are written in Rust notation; confirm the
/// spellings exposed on the bridge type before relying on them from Swift.
public typealias DocumentStructure = RustBridge.DocumentStructure
/// A resolved link between two nodes of the document tree.
///
/// Codable conformance is synthesized, so the serialized keys are the
/// property names themselves.
public struct DocumentRelationship: Codable, Sendable, Hashable {
    /// Source node index (the referencing node).
    public let source: UInt32
    /// Target node index (the referenced node).
    public let target: UInt32
    /// Semantic kind of the relationship.
    public let kind: RelationshipKind

    /// Memberwise initializer.
    public init(
        source: UInt32,
        target: UInt32,
        kind: RelationshipKind
    ) {
        self.source = source
        self.target = target
        self.kind = kind
    }
}
// MARK: - Internal FFI conversions for DocumentRelationship
internal extension DocumentRelationship {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter rb: Bridge reference whose accessors supply the field values.
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports a kind
    ///   that has no matching `RelationshipKind` case.
    init(_ rb: RustBridge.DocumentRelationshipRef) throws {
        self.source = rb.source()
        self.target = rb.target()
        // Read the raw kind once; an unknown value is reported to the caller
        // through the existing `throws` channel instead of ending the process.
        let rawKind = rb.kind().toString()
        guard let kind = RelationshipKind(rawValue: rawKind) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(
                    codingPath: [],
                    debugDescription: "Unknown RelationshipKind: \(rawKind)"
                )
            )
        }
        self.kind = kind
    }

    /// Converts this value into its bridge counterpart by way of JSON.
    ///
    /// The value is encoded with `JSONEncoder` and the resulting text is passed
    /// to `RustBridge.documentRelationshipFromJson`.
    ///
    /// - Throws: Encoding errors, and any error from the bridge parser.
    func intoRust() throws -> RustBridge.DocumentRelationship {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.documentRelationshipFromJson(json)
    }
}
/// A single node in the document tree.
///
/// Each node has deterministic `id`, typed `content`, optional `parent`/`children`
/// for tree structure, and metadata like page number, bounding box, and content layer.
///
/// Declared as a type alias, so this name and `RustBridge.DocumentNode` are the same type.
public typealias DocumentNode = RustBridge.DocumentNode
/// A table expressed as a grid with per-cell metadata.
///
/// Keeps the row and column counts next to a flat list of positioned cells.
/// When decoding, a missing count becomes `0` and a missing cell list becomes
/// an empty array.
public struct TableGrid: Codable, Sendable, Hashable {
    /// Number of rows in the table.
    public let rows: UInt32
    /// Number of columns in the table.
    public let cols: UInt32
    /// All cells in row-major order.
    public let cells: [GridCell]

    /// Memberwise initializer.
    public init(
        rows: UInt32,
        cols: UInt32,
        cells: [GridCell]
    ) {
        self.rows = rows
        self.cols = cols
        self.cells = cells
    }

    /// Keys used in the serialized form (identical to the property names).
    private enum CodingKeys: String, CodingKey {
        case rows
        case cols
        case cells
    }

    /// Decodes the grid, tolerating any missing key.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        let decodedRows = try values.decodeIfPresent(UInt32.self, forKey: .rows)
        let decodedCols = try values.decodeIfPresent(UInt32.self, forKey: .cols)
        let decodedCells = try values.decodeIfPresent([GridCell].self, forKey: .cells)
        rows = decodedRows ?? 0
        cols = decodedCols ?? 0
        cells = decodedCells ?? []
    }
}
// MARK: - Internal FFI conversions for TableGrid
internal extension TableGrid {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the field values.
    /// - Throws: Any error raised while converting an individual cell.
    init(_ ref: RustBridge.TableGridRef) throws {
        rows = ref.rows()
        cols = ref.cols()
        cells = try ref.cells().map { cell in try GridCell(cell) }
    }

    /// Converts this value into its bridge counterpart.
    ///
    /// - Throws: Any error raised while converting an individual cell.
    func intoRust() throws -> RustBridge.TableGrid {
        // Each cell is converted and appended to a bridge vector in order.
        let bridgedCells = RustVec<RustBridge.GridCell>()
        try cells.forEach { cell in
            bridgedCells.push(value: try cell.intoRust())
        }
        return RustBridge.TableGrid(rows, cols, bridgedCells)
    }
}
/// One cell of a table grid, with its position and span information.
public struct GridCell: Codable, Sendable, Hashable {
    /// Cell text content.
    public let content: String
    /// Zero-indexed row position.
    public let row: UInt32
    /// Zero-indexed column position.
    public let col: UInt32
    /// Number of rows this cell spans.
    public let rowSpan: UInt32
    /// Number of columns this cell spans.
    public let colSpan: UInt32
    /// Whether this is a header cell.
    public let isHeader: Bool
    /// Bounding box for this cell (if available).
    public let bbox: BoundingBox?

    /// Memberwise initializer; `bbox` defaults to `nil`.
    public init(
        content: String,
        row: UInt32,
        col: UInt32,
        rowSpan: UInt32,
        colSpan: UInt32,
        isHeader: Bool,
        bbox: BoundingBox? = nil
    ) {
        self.content = content
        self.row = row
        self.col = col
        self.rowSpan = rowSpan
        self.colSpan = colSpan
        self.isHeader = isHeader
        self.bbox = bbox
    }

    /// Snake-case keys used in the serialized form.
    private enum CodingKeys: String, CodingKey {
        case content
        case row
        case col
        case rowSpan = "row_span"
        case colSpan = "col_span"
        case isHeader = "is_header"
        case bbox
    }
}
// MARK: - Internal FFI conversions for GridCell
internal extension GridCell {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the field values.
    /// - Throws: Any error raised while converting the bounding box.
    init(_ ref: RustBridge.GridCellRef) throws {
        content = ref.content().toString()
        row = ref.row()
        col = ref.col()
        rowSpan = ref.rowSpan()
        colSpan = ref.colSpan()
        isHeader = ref.isHeader()
        // The bounding box is only converted when the bridge supplies one.
        if let bridgedBox = ref.bbox() {
            bbox = try BoundingBox(bridgedBox)
        } else {
            bbox = nil
        }
    }

    /// Converts this value into its bridge counterpart by way of JSON.
    ///
    /// - Throws: Encoding errors, and any error from `RustBridge.gridCellFromJson`.
    func intoRust() throws -> RustBridge.GridCell {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, an empty JSON object is sent instead.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.gridCellFromJson("{}")
        }
        return try RustBridge.gridCellFromJson(payload)
    }
}
/// Inline text annotation: byte-range based formatting and links.
///
/// An annotation refers to byte offsets within the text content of its node,
/// which allows formatted regions to be identified precisely.
public struct TextAnnotation: Codable, Sendable, Hashable {
    /// Start byte offset in the node's text content (inclusive).
    public let start: UInt32
    /// End byte offset in the node's text content (exclusive).
    public let end: UInt32
    /// Annotation type.
    public let kind: AnnotationKind

    /// Memberwise initializer.
    public init(
        start: UInt32,
        end: UInt32,
        kind: AnnotationKind
    ) {
        self.start = start
        self.end = end
        self.kind = kind
    }
}
// MARK: - Internal FFI conversions for TextAnnotation
internal extension TextAnnotation {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// The kind arrives as JSON text and is decoded with `JSONDecoder`.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the field values.
    /// - Throws: Any decoding error raised for the kind.
    init(_ ref: RustBridge.TextAnnotationRef) throws {
        start = ref.start()
        end = ref.end()

        let kindText = ref.kind().toString()
        // If the text cannot be turned into UTF-8 bytes, JSON `null` is decoded instead.
        let kindBytes = kindText.data(using: .utf8) ?? Data("null".utf8)
        kind = try JSONDecoder().decode(AnnotationKind.self, from: kindBytes)
    }

    /// Converts this value into its bridge counterpart by way of JSON.
    ///
    /// - Throws: Encoding errors, and any error from `RustBridge.textAnnotationFromJson`.
    func intoRust() throws -> RustBridge.TextAnnotation {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, an empty JSON object is sent instead.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.textAnnotationFromJson("{}")
        }
        return try RustBridge.textAnnotationFromJson(payload)
    }
}
/// General extraction result used by the core extraction API.
///
/// This is the main result type returned by all extraction functions.
///
/// Declared as a type alias, so this name and `RustBridge.ExtractionResult` are the same type.
public typealias ExtractionResult = RustBridge.ExtractionResult
/// A single file extracted from an archive.
///
/// When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
/// enabled, each processable file produces its own full `ExtractionResult`.
///
/// Declared as a type alias, so this name and `RustBridge.ArchiveEntry` are the same type.
public typealias ArchiveEntry = RustBridge.ArchiveEntry
/// A warning raised by a processing pipeline stage that did not stop extraction.
///
/// Records failures of optional features: extraction still completes, but the
/// result may be degraded.
public struct ProcessingWarning: Codable, Sendable, Hashable {
    /// The pipeline stage or feature that produced this warning
    /// (e.g., "embedding", "chunking", "language_detection", "output_format").
    public let source: String
    /// Human-readable description of what went wrong.
    public let message: String

    /// Memberwise initializer.
    public init(
        source: String,
        message: String
    ) {
        self.source = source
        self.message = message
    }
}
// MARK: - Internal FFI conversions for ProcessingWarning
internal extension ProcessingWarning {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the field values.
    init(_ ref: RustBridge.ProcessingWarningRef) throws {
        source = ref.source().toString()
        message = ref.message().toString()
    }

    /// Converts this value into its bridge counterpart by way of JSON.
    ///
    /// - Throws: Encoding errors, and any error from `RustBridge.processingWarningFromJson`.
    func intoRust() throws -> RustBridge.ProcessingWarning {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, an empty JSON object is sent instead.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.processingWarningFromJson("{}")
        }
        return try RustBridge.processingWarningFromJson(payload)
    }
}
/// Token usage and cost figures for one LLM call made during extraction.
///
/// Filled in when VLM OCR, structured extraction, or LLM-based embeddings
/// are used. One extraction can yield several entries when it makes several
/// LLM calls (e.g. VLM OCR + structured extraction).
///
/// When decoding, a missing `model` or `source` becomes an empty string and
/// every other missing key becomes `nil`.
public struct LlmUsage: Codable, Sendable, Hashable {
    /// The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514").
    public let model: String
    /// The pipeline stage that triggered this LLM call
    /// (e.g. "vlm_ocr", "structured_extraction", "embeddings").
    public let source: String
    /// Number of input/prompt tokens consumed.
    public let inputTokens: UInt64?
    /// Number of output/completion tokens generated.
    public let outputTokens: UInt64?
    /// Total tokens (input + output).
    public let totalTokens: UInt64?
    /// Estimated cost in USD based on the provider's published pricing.
    public let estimatedCost: Double?
    /// Why the model stopped generating (e.g. "stop", "length", "content_filter").
    public let finishReason: String?

    /// Memberwise initializer; everything except `model` and `source` defaults to `nil`.
    public init(
        model: String,
        source: String,
        inputTokens: UInt64? = nil,
        outputTokens: UInt64? = nil,
        totalTokens: UInt64? = nil,
        estimatedCost: Double? = nil,
        finishReason: String? = nil
    ) {
        self.model = model
        self.source = source
        self.inputTokens = inputTokens
        self.outputTokens = outputTokens
        self.totalTokens = totalTokens
        self.estimatedCost = estimatedCost
        self.finishReason = finishReason
    }

    /// Snake-case keys used in the serialized form.
    private enum CodingKeys: String, CodingKey {
        case model
        case source
        case inputTokens = "input_tokens"
        case outputTokens = "output_tokens"
        case totalTokens = "total_tokens"
        case estimatedCost = "estimated_cost"
        case finishReason = "finish_reason"
    }

    /// Decodes the usage record, tolerating any missing key.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)

        // The three token counters share a type, so one local reader covers them.
        func count(_ key: CodingKeys) throws -> UInt64? {
            try values.decodeIfPresent(UInt64.self, forKey: key)
        }

        model = try values.decodeIfPresent(String.self, forKey: .model) ?? ""
        source = try values.decodeIfPresent(String.self, forKey: .source) ?? ""
        inputTokens = try count(.inputTokens)
        outputTokens = try count(.outputTokens)
        totalTokens = try count(.totalTokens)
        estimatedCost = try values.decodeIfPresent(Double.self, forKey: .estimatedCost)
        finishReason = try values.decodeIfPresent(String.self, forKey: .finishReason)
    }
}
// MARK: - Internal FFI conversions for LlmUsage
internal extension LlmUsage {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter ref: Bridge reference whose accessors supply the field values.
    init(_ ref: RustBridge.LlmUsageRef) throws {
        model = ref.model().toString()
        source = ref.source().toString()
        inputTokens = ref.inputTokens()
        outputTokens = ref.outputTokens()
        totalTokens = ref.totalTokens()
        estimatedCost = ref.estimatedCost()
        finishReason = ref.finishReason()?.toString()
    }

    /// Converts this value into its bridge counterpart.
    ///
    /// - Returns: A `RustBridge.LlmUsage` built from the stored fields.
    func intoRust() throws -> RustBridge.LlmUsage {
        return RustBridge.LlmUsage(
            RustString(model),
            RustString(source),
            inputTokens,
            outputTokens,
            totalTokens,
            estimatedCost,
            finishReason.map(RustString.init)
        )
    }
}
/// A chunk of text together with an optional embedding and its metadata.
///
/// Chunks are produced when chunking is enabled in `ExtractionConfig`. A chunk
/// carries its text, an embedding vector when embedding generation is
/// configured, and metadata describing where it sits in the document.
public struct Chunk: Codable, Sendable, Hashable {
    /// The text content of this chunk.
    public let content: String
    /// Semantic structural classification of this chunk.
    ///
    /// Assigned by the heuristic classifier based on content patterns and
    /// heading context. Defaults to `ChunkType::Unknown` when no rule matches.
    public let chunkType: ChunkType
    /// Optional embedding vector for this chunk.
    ///
    /// Only populated when `EmbeddingConfig` is provided in chunking configuration.
    /// The dimensionality depends on the chosen embedding model.
    public let embedding: [Float]?
    /// Metadata about this chunk's position and properties.
    public let metadata: ChunkMetadata

    /// Memberwise initializer; `embedding` defaults to `nil`.
    public init(
        content: String,
        chunkType: ChunkType,
        embedding: [Float]? = nil,
        metadata: ChunkMetadata
    ) {
        self.content = content
        self.chunkType = chunkType
        self.embedding = embedding
        self.metadata = metadata
    }

    /// Snake-case keys used in the serialized form.
    private enum CodingKeys: String, CodingKey {
        case content
        case chunkType = "chunk_type"
        case embedding
        case metadata
    }
}
// MARK: - Internal FFI conversions for Chunk
internal extension Chunk {
    /// Creates the Swift value by reading each field from a bridge reference.
    ///
    /// - Parameter rb: Bridge reference whose accessors supply the field values.
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports a chunk
    ///   type that has no matching `ChunkType` case, plus any error thrown
    ///   while converting the metadata.
    init(_ rb: RustBridge.ChunkRef) throws {
        self.content = rb.content().toString()
        // Read the raw type once; an unknown value is reported to the caller
        // through the existing `throws` channel instead of ending the process.
        let rawChunkType = rb.chunkType().toString()
        guard let chunkType = ChunkType(rawValue: rawChunkType) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(
                    codingPath: [],
                    debugDescription: "Unknown ChunkType: \(rawChunkType)"
                )
            )
        }
        self.chunkType = chunkType
        self.embedding = rb.embedding().map { Array($0) }
        self.metadata = try ChunkMetadata(rb.metadata())
    }

    /// Converts this value into its bridge counterpart by way of JSON.
    ///
    /// The value is encoded with `JSONEncoder` and the resulting text is passed
    /// to `RustBridge.chunkFromJson`.
    ///
    /// - Throws: Encoding errors, and any error from the bridge parser.
    func intoRust() throws -> RustBridge.Chunk {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.chunkFromJson(json)
    }
}
/// Heading context for a chunk within a Markdown document.
///
/// Holds the heading hierarchy from the document root down to the section
/// that contains the chunk.
public struct HeadingContext: Codable, Sendable, Hashable {
    /// The heading hierarchy from document root to this chunk's section.
    /// Index 0 is the outermost (h1), last element is the most specific.
    public let headings: [HeadingLevel]

    /// Creates a heading context from an ordered heading hierarchy.
    ///
    /// - Parameter headings: Headings ordered from outermost to most specific.
    public init(headings: [HeadingLevel]) {
        self.headings = headings
    }
}
// MARK: - Internal FFI conversions for HeadingContext
internal extension HeadingContext {
    /// Builds a `HeadingContext` by converting each bridged heading in order.
    ///
    /// - Parameter bridged: Bridge-side heading context to read from.
    /// - Throws: Any error thrown while converting an individual `HeadingLevel`.
    init(_ bridged: RustBridge.HeadingContextRef) throws {
        let levels = try bridged.headings().map { entry in
            try HeadingLevel(entry)
        }
        self.headings = levels
    }

    /// Converts this value into its bridge representation by way of JSON.
    ///
    /// - Returns: The value produced by `RustBridge.headingContextFromJson`.
    /// - Throws: Any error from `JSONEncoder.encode` or from the bridge parser.
    func intoRust() throws -> RustBridge.HeadingContext {
        let encoded = try JSONEncoder().encode(self)
        // Hand the bridge an empty object if the bytes cannot be read as UTF-8 text.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.headingContextFromJson("{}")
        }
        return try RustBridge.headingContextFromJson(payload)
    }
}
/// A single heading in the hierarchy.
public struct HeadingLevel: Codable, Sendable, Hashable {
    /// Heading depth (1 = h1, 2 = h2, etc.)
    public let level: UInt8

    /// The text content of the heading.
    public let text: String

    /// Creates a heading entry.
    ///
    /// - Parameters:
    ///   - level: Heading depth (1 = h1, 2 = h2, etc.).
    ///   - text: Text of the heading.
    public init(
        level: UInt8,
        text: String
    ) {
        self.level = level
        self.text = text
    }
}
// MARK: - Internal FFI conversions for HeadingLevel
internal extension HeadingLevel {
    /// Builds a `HeadingLevel` by copying both fields out of a bridge reference.
    ///
    /// Declared `throws`, although no call in the body is a throwing one.
    ///
    /// - Parameter bridged: Bridge-side heading to read from.
    init(_ bridged: RustBridge.HeadingLevelRef) throws {
        let depth = bridged.level()
        let title = bridged.text().toString()
        self.level = depth
        self.text = title
    }

    /// Converts this value into its bridge representation by way of JSON.
    ///
    /// - Returns: The value produced by `RustBridge.headingLevelFromJson`.
    /// - Throws: Any error from `JSONEncoder.encode` or from the bridge parser.
    func intoRust() throws -> RustBridge.HeadingLevel {
        let encoded = try JSONEncoder().encode(self)
        // Hand the bridge an empty object if the bytes cannot be read as UTF-8 text.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.headingLevelFromJson("{}")
        }
        return try RustBridge.headingLevelFromJson(payload)
    }
}
/// Metadata about a chunk's position in the original document.
public struct ChunkMetadata: Codable, Sendable, Hashable {
    /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
    public let byteStart: UInt

    /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
    public let byteEnd: UInt

    /// Number of tokens in this chunk (if available).
    ///
    /// This is calculated by the embedding model's tokenizer if embeddings are enabled.
    public let tokenCount: UInt?

    /// Zero-based index of this chunk in the document.
    public let chunkIndex: UInt

    /// Total number of chunks in the document.
    public let totalChunks: UInt

    /// First page number this chunk spans (1-indexed).
    ///
    /// Only populated when page tracking is enabled in extraction configuration.
    public let firstPage: UInt32?

    /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
    ///
    /// Only populated when page tracking is enabled in extraction configuration.
    public let lastPage: UInt32?

    /// Heading context when using Markdown chunker.
    ///
    /// Contains the heading hierarchy this chunk falls under.
    /// Only populated when `ChunkerType::Markdown` is used.
    public let headingContext: HeadingContext?

    /// Indices into `ExtractionResult.images` for images on pages covered by this chunk.
    ///
    /// Contains zero-based indices into the top-level `images` collection for every
    /// image whose `page_number` falls within `[first_page, last_page]`.
    /// Empty when image extraction is disabled or the chunk spans no pages with images.
    public let imageIndices: [UInt32]

    /// Creates chunk metadata from its parts.
    ///
    /// `tokenCount`, `firstPage`, `lastPage` and `headingContext` default to `nil`;
    /// every other argument is required.
    public init(
        byteStart: UInt,
        byteEnd: UInt,
        tokenCount: UInt? = nil,
        chunkIndex: UInt,
        totalChunks: UInt,
        firstPage: UInt32? = nil,
        lastPage: UInt32? = nil,
        headingContext: HeadingContext? = nil,
        imageIndices: [UInt32]
    ) {
        self.byteStart = byteStart
        self.byteEnd = byteEnd
        self.tokenCount = tokenCount
        self.chunkIndex = chunkIndex
        self.totalChunks = totalChunks
        self.firstPage = firstPage
        self.lastPage = lastPage
        self.headingContext = headingContext
        self.imageIndices = imageIndices
    }

    /// Maps each camelCase property to its snake_case JSON key.
    private enum CodingKeys: String, CodingKey {
        case byteStart = "byte_start"
        case byteEnd = "byte_end"
        case tokenCount = "token_count"
        case chunkIndex = "chunk_index"
        case totalChunks = "total_chunks"
        case firstPage = "first_page"
        case lastPage = "last_page"
        case headingContext = "heading_context"
        case imageIndices = "image_indices"
    }
}
// MARK: - Internal FFI conversions for ChunkMetadata
internal extension ChunkMetadata {
    /// Builds `ChunkMetadata` by copying every field out of a bridge reference.
    ///
    /// - Parameter source: Bridge-side metadata to read from.
    /// - Throws: Any error thrown while converting the optional `HeadingContext`.
    init(_ source: RustBridge.ChunkMetadataRef) throws {
        self.byteStart = source.byteStart()
        self.byteEnd = source.byteEnd()
        self.tokenCount = source.tokenCount()
        self.chunkIndex = source.chunkIndex()
        self.totalChunks = source.totalChunks()
        self.firstPage = source.firstPage()
        self.lastPage = source.lastPage()
        // Only convert the heading context when the bridge supplies one.
        if let bridgedContext = source.headingContext() {
            self.headingContext = try HeadingContext(bridgedContext)
        } else {
            self.headingContext = nil
        }
        self.imageIndices = Array(source.imageIndices())
    }

    /// Converts this value into its bridge representation by way of JSON.
    ///
    /// - Returns: The value produced by `RustBridge.chunkMetadataFromJson`.
    /// - Throws: Any error from `JSONEncoder.encode` or from the bridge parser.
    func intoRust() throws -> RustBridge.ChunkMetadata {
        let encoded = try JSONEncoder().encode(self)
        // Hand the bridge an empty object if the bytes cannot be read as UTF-8 text.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.chunkMetadataFromJson("{}")
        }
        return try RustBridge.chunkMetadataFromJson(payload)
    }
}
/// Extracted image from a document.
///
/// Contains raw image data, metadata, and optional nested OCR results.
/// Raw bytes allow cross-language compatibility - users can convert to
/// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
///
/// In Swift this name is a direct alias for the bridge type `RustBridge.ExtractedImage`.
public typealias ExtractedImage = RustBridge.ExtractedImage
/// Bounding box coordinates for element positioning.
public struct BoundingBox: Codable, Sendable, Hashable {
    /// Left x-coordinate
    public let x0: Double

    /// Bottom y-coordinate
    public let y0: Double

    /// Right x-coordinate
    public let x1: Double

    /// Top y-coordinate
    public let y1: Double

    /// Creates a bounding box from its four coordinates.
    public init(
        x0: Double,
        y0: Double,
        x1: Double,
        y1: Double
    ) {
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
    }

    /// JSON key names; each matches its property name.
    private enum CodingKeys: String, CodingKey {
        case x0
        case y0
        case x1
        case y1
    }

    /// Decodes a bounding box, substituting `0` for any coordinate that is absent or null.
    ///
    /// - Throws: Decoding errors for a missing keyed container or a value of the wrong type.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)

        // Reads one coordinate, falling back to zero when the key is missing.
        func coordinate(_ key: CodingKeys) throws -> Double {
            return try values.decodeIfPresent(Double.self, forKey: key) ?? 0
        }

        self.x0 = try coordinate(.x0)
        self.y0 = try coordinate(.y0)
        self.x1 = try coordinate(.x1)
        self.y1 = try coordinate(.y1)
    }
}
// MARK: - Internal FFI conversions for BoundingBox
internal extension BoundingBox {
    /// Builds a `BoundingBox` by copying the four coordinates out of a bridge reference.
    ///
    /// Declared `throws`, although no call in the body is a throwing one.
    ///
    /// - Parameter bridged: Bridge-side bounding box to read from.
    init(_ bridged: RustBridge.BoundingBoxRef) throws {
        let left = bridged.x0()
        let bottom = bridged.y0()
        let right = bridged.x1()
        let top = bridged.y1()
        self.x0 = left
        self.y0 = bottom
        self.x1 = right
        self.y1 = top
    }

    /// Converts this value into its bridge representation.
    ///
    /// The coordinates are passed positionally in the order `x0`, `y0`, `x1`, `y1`.
    func intoRust() throws -> RustBridge.BoundingBox {
        RustBridge.BoundingBox(x0, y0, x1, y1)
    }
}
/// Metadata for a semantic element.
///
/// Direct alias for the bridge type `RustBridge.ElementMetadata`.
public typealias ElementMetadata = RustBridge.ElementMetadata
/// Semantic element extracted from document.
///
/// Represents a logical unit of content with semantic classification,
/// unique identifier, and metadata for tracking origin and position.
///
/// Direct alias for the bridge type `RustBridge.Element`.
public typealias Element = RustBridge.Element
/// Excel workbook representation.
///
/// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
/// extracted content and metadata.
///
/// Direct alias for the bridge type `RustBridge.ExcelWorkbook`.
public typealias ExcelWorkbook = RustBridge.ExcelWorkbook
/// Single Excel worksheet.
///
/// Represents one sheet from an Excel workbook with its content
/// converted to Markdown format and dimensional statistics.
public struct ExcelSheet: Codable, Sendable, Hashable {
    /// Sheet name as it appears in Excel
    public let name: String

    /// Sheet content converted to Markdown tables
    public let markdown: String

    /// Number of rows
    public let rowCount: UInt

    /// Number of columns
    public let colCount: UInt

    /// Total number of non-empty cells
    public let cellCount: UInt

    /// Pre-extracted table cells (2D vector of cell values)
    /// Populated during markdown generation to avoid re-parsing markdown.
    /// None for empty sheets.
    public let tableCells: [[String]]?

    /// Creates a sheet description from its parts; `tableCells` defaults to `nil`.
    public init(
        name: String,
        markdown: String,
        rowCount: UInt,
        colCount: UInt,
        cellCount: UInt,
        tableCells: [[String]]? = nil
    ) {
        self.name = name
        self.markdown = markdown
        self.rowCount = rowCount
        self.colCount = colCount
        self.cellCount = cellCount
        self.tableCells = tableCells
    }

    /// JSON key names; the count and cell properties use snake_case keys.
    private enum CodingKeys: String, CodingKey {
        case name
        case markdown
        case rowCount = "row_count"
        case colCount = "col_count"
        case cellCount = "cell_count"
        case tableCells = "table_cells"
    }
}
// MARK: - Internal FFI conversions for ExcelSheet
internal extension ExcelSheet {
    /// Builds an `ExcelSheet` from a bridge reference.
    ///
    /// Scalar and string fields are copied directly. The table cells are read as
    /// JSON text and decoded; when the bridge supplies no text, the literal
    /// `null` is decoded instead.
    ///
    /// - Parameter source: Bridge-side sheet to read from.
    /// - Throws: Any error from `JSONDecoder.decode` while parsing the table cells.
    init(_ source: RustBridge.ExcelSheetRef) throws {
        self.name = source.name().toString()
        self.markdown = source.markdown().toString()
        self.rowCount = source.rowCount()
        self.colCount = source.colCount()
        self.cellCount = source.cellCount()

        let cellsJson = source.tableCells()?.toString() ?? "null"
        let cellsData = cellsJson.data(using: .utf8) ?? Data("null".utf8)
        self.tableCells = try JSONDecoder().decode([[String]]?.self, from: cellsData)
    }

    /// Converts this value into its bridge representation by way of JSON.
    ///
    /// - Returns: The value produced by `RustBridge.excelSheetFromJson`.
    /// - Throws: Any error from `JSONEncoder.encode` or from the bridge parser.
    func intoRust() throws -> RustBridge.ExcelSheet {
        let encoded = try JSONEncoder().encode(self)
        // Hand the bridge an empty object if the bytes cannot be read as UTF-8 text.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.excelSheetFromJson("{}")
        }
        return try RustBridge.excelSheetFromJson(payload)
    }
}
/// XML extraction result.
///
/// Contains extracted text content from XML files along with
/// structural statistics about the XML document.
public struct XmlExtractionResult: Codable, Sendable, Hashable {
    /// Extracted text content (XML structure filtered out)
    public let content: String

    /// Total number of XML elements processed
    public let elementCount: UInt

    /// List of unique element names found (sorted)
    public let uniqueElements: [String]

    /// Creates an XML extraction result from its parts.
    public init(
        content: String,
        elementCount: UInt,
        uniqueElements: [String]
    ) {
        self.content = content
        self.elementCount = elementCount
        self.uniqueElements = uniqueElements
    }

    /// JSON key names; the two statistics use snake_case keys.
    private enum CodingKeys: String, CodingKey {
        case content
        case elementCount = "element_count"
        case uniqueElements = "unique_elements"
    }
}
// MARK: - Internal FFI conversions for XmlExtractionResult
internal extension XmlExtractionResult {
    /// Builds an `XmlExtractionResult` by copying every field out of a bridge reference.
    ///
    /// Declared `throws`, although no call in the body is a throwing one.
    ///
    /// - Parameter source: Bridge-side result to read from.
    init(_ source: RustBridge.XmlExtractionResultRef) throws {
        self.content = source.content().toString()
        self.elementCount = source.elementCount()
        // Each bridged name is converted to a Swift `String` individually.
        let names = source.uniqueElements().map { entry in
            entry.as_str().toString()
        }
        self.uniqueElements = names
    }

    /// Converts this value into its bridge representation by way of JSON.
    ///
    /// - Returns: The value produced by `RustBridge.xmlExtractionResultFromJson`.
    /// - Throws: Any error from `JSONEncoder.encode` or from the bridge parser.
    func intoRust() throws -> RustBridge.XmlExtractionResult {
        let encoded = try JSONEncoder().encode(self)
        // Hand the bridge an empty object if the bytes cannot be read as UTF-8 text.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.xmlExtractionResultFromJson("{}")
        }
        return try RustBridge.xmlExtractionResultFromJson(payload)
    }
}
/// Plain text and Markdown extraction result.
///
/// Contains the extracted text along with statistics and,
/// for Markdown files, structural elements like headers and links.
public struct TextExtractionResult: Codable, Sendable, Hashable {
    /// Extracted text content
    public let content: String

    /// Number of lines
    public let lineCount: UInt

    /// Number of words
    public let wordCount: UInt

    /// Number of characters
    public let characterCount: UInt

    /// Markdown headers (text only, Markdown files only)
    public let headers: [String]?

    /// Markdown links as (text, URL) tuples (Markdown files only)
    public let links: [[String]]?

    /// Code blocks as (language, code) tuples (Markdown files only)
    public let codeBlocks: [[String]]?

    /// Creates a text extraction result from its parts.
    ///
    /// `headers`, `links` and `codeBlocks` default to `nil`.
    public init(
        content: String,
        lineCount: UInt,
        wordCount: UInt,
        characterCount: UInt,
        headers: [String]? = nil,
        links: [[String]]? = nil,
        codeBlocks: [[String]]? = nil
    ) {
        self.content = content
        self.lineCount = lineCount
        self.wordCount = wordCount
        self.characterCount = characterCount
        self.headers = headers
        self.links = links
        self.codeBlocks = codeBlocks
    }

    /// JSON key names; the counts and `codeBlocks` use snake_case keys.
    private enum CodingKeys: String, CodingKey {
        case content
        case lineCount = "line_count"
        case wordCount = "word_count"
        case characterCount = "character_count"
        case headers
        case links
        case codeBlocks = "code_blocks"
    }
}
// MARK: - Internal FFI conversions for TextExtractionResult
internal extension TextExtractionResult {
    /// Builds a `TextExtractionResult` from a bridge reference.
    ///
    /// Content, the three counts and the headers are copied from the bridge.
    /// `links` and `codeBlocks` are always set to `nil` here and are never read
    /// from the bridge.
    ///
    /// Declared `throws`, although no call in the body is a throwing one.
    ///
    /// - Parameter source: Bridge-side result to read from.
    init(_ source: RustBridge.TextExtractionResultRef) throws {
        self.content = source.content().toString()
        self.lineCount = source.lineCount()
        self.wordCount = source.wordCount()
        self.characterCount = source.characterCount()
        self.headers = source.headers()?.map { entry in
            entry.as_str().toString()
        }
        // NOTE(review): both fields are dropped on this conversion path; confirm
        // whether the bridge exposes them and whether that loss is intended.
        self.links = nil
        self.codeBlocks = nil
    }

    /// Converts this value into its bridge representation by way of JSON.
    ///
    /// - Returns: The value produced by `RustBridge.textExtractionResultFromJson`.
    /// - Throws: Any error from `JSONEncoder.encode` or from the bridge parser.
    func intoRust() throws -> RustBridge.TextExtractionResult {
        let encoded = try JSONEncoder().encode(self)
        // Hand the bridge an empty object if the bytes cannot be read as UTF-8 text.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.textExtractionResultFromJson("{}")
        }
        return try RustBridge.textExtractionResultFromJson(payload)
    }
}
/// PowerPoint (PPTX) extraction result.
///
/// Contains extracted slide content, metadata, and embedded images/tables.
///
/// Direct alias for the bridge type `RustBridge.PptxExtractionResult`.
public typealias PptxExtractionResult = RustBridge.PptxExtractionResult
/// Email extraction result.
///
/// Complete representation of an extracted email message (.eml or .msg)
/// including headers, body content, and attachments.
///
/// Direct alias for the bridge type `RustBridge.EmailExtractionResult`.
public typealias EmailExtractionResult = RustBridge.EmailExtractionResult
/// Email attachment representation.
///
/// Contains metadata and optionally the content of an email attachment.
///
/// Direct alias for the bridge type `RustBridge.EmailAttachment`.
public typealias EmailAttachment = RustBridge.EmailAttachment
/// OCR extraction result.
///
/// Result of performing OCR on an image or scanned document,
/// including recognized text and detected tables.
///
/// Direct alias for the bridge type `RustBridge.OcrExtractionResult`.
public typealias OcrExtractionResult = RustBridge.OcrExtractionResult
/// Table detected via OCR.
///
/// Represents a table structure recognized during OCR processing.
public struct OcrTable: Codable, Sendable, Hashable {
    /// Table cells as a 2D vector (rows × columns)
    public let cells: [[String]]

    /// Markdown representation of the table
    public let markdown: String

    /// Page number where the table was found (1-indexed)
    public let pageNumber: UInt32

    /// Bounding box of the table in pixel coordinates (from OCR word positions).
    public let boundingBox: OcrTableBoundingBox?

    /// Creates an OCR table from its parts; `boundingBox` defaults to `nil`.
    public init(
        cells: [[String]],
        markdown: String,
        pageNumber: UInt32,
        boundingBox: OcrTableBoundingBox? = nil
    ) {
        self.cells = cells
        self.markdown = markdown
        self.pageNumber = pageNumber
        self.boundingBox = boundingBox
    }

    /// JSON key names; `pageNumber` and `boundingBox` use snake_case keys.
    private enum CodingKeys: String, CodingKey {
        case cells
        case markdown
        case pageNumber = "page_number"
        case boundingBox = "bounding_box"
    }
}
// MARK: - Internal FFI conversions for OcrTable
internal extension OcrTable {
    /// Builds an `OcrTable` from a bridge reference.
    ///
    /// The cells are read as JSON text and decoded into a two-dimensional array;
    /// the remaining fields are copied directly.
    ///
    /// - Parameter source: Bridge-side table to read from.
    /// - Throws: Any error from `JSONDecoder.decode` while parsing the cells, or from
    ///   converting the optional bounding box.
    init(_ source: RustBridge.OcrTableRef) throws {
        let cellsJson = source.cells().toString()
        let cellsData = cellsJson.data(using: .utf8) ?? Data("null".utf8)
        self.cells = try JSONDecoder().decode([[String]].self, from: cellsData)

        self.markdown = source.markdown().toString()
        self.pageNumber = source.pageNumber()

        // Only convert the bounding box when the bridge supplies one.
        if let bridgedBox = source.boundingBox() {
            self.boundingBox = try OcrTableBoundingBox(bridgedBox)
        } else {
            self.boundingBox = nil
        }
    }

    /// Converts this value into its bridge representation by way of JSON.
    ///
    /// - Returns: The value produced by `RustBridge.ocrTableFromJson`.
    /// - Throws: Any error from `JSONEncoder.encode` or from the bridge parser.
    func intoRust() throws -> RustBridge.OcrTable {
        let encoded = try JSONEncoder().encode(self)
        // Hand the bridge an empty object if the bytes cannot be read as UTF-8 text.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.ocrTableFromJson("{}")
        }
        return try RustBridge.ocrTableFromJson(payload)
    }
}
/// Bounding box for an OCR-detected table in pixel coordinates.
public struct OcrTableBoundingBox: Codable, Sendable, Hashable {
    /// Left x-coordinate (pixels)
    public let left: UInt32

    /// Top y-coordinate (pixels)
    public let top: UInt32

    /// Right x-coordinate (pixels)
    public let right: UInt32

    /// Bottom y-coordinate (pixels)
    public let bottom: UInt32

    /// Creates a pixel-space bounding box from its four edges.
    public init(
        left: UInt32,
        top: UInt32,
        right: UInt32,
        bottom: UInt32
    ) {
        self.left = left
        self.top = top
        self.right = right
        self.bottom = bottom
    }
}
// MARK: - Internal FFI conversions for OcrTableBoundingBox
internal extension OcrTableBoundingBox {
    /// Builds an `OcrTableBoundingBox` by copying the four edges out of a bridge reference.
    ///
    /// Declared `throws`, although no call in the body is a throwing one.
    ///
    /// - Parameter bridged: Bridge-side bounding box to read from.
    init(_ bridged: RustBridge.OcrTableBoundingBoxRef) throws {
        let leftEdge = bridged.left()
        let topEdge = bridged.top()
        let rightEdge = bridged.right()
        let bottomEdge = bridged.bottom()
        self.left = leftEdge
        self.top = topEdge
        self.right = rightEdge
        self.bottom = bottomEdge
    }

    /// Converts this value into its bridge representation.
    ///
    /// The edges are passed positionally in the order `left`, `top`, `right`, `bottom`.
    func intoRust() throws -> RustBridge.OcrTableBoundingBox {
        RustBridge.OcrTableBoundingBox(left, top, right, bottom)
    }
}
/// Image preprocessing configuration for OCR.
///
/// These settings control how images are preprocessed before OCR to improve
/// text recognition quality. Different preprocessing strategies work better
/// for different document types.
public struct ImagePreprocessingConfig: Codable, Sendable, Hashable {
    /// Target DPI for the image (300 is standard, 600 for small text).
    public let targetDpi: Int32

    /// Auto-detect and correct image rotation.
    public let autoRotate: Bool

    /// Correct skew (tilted images).
    public let deskew: Bool

    /// Remove noise from the image.
    public let denoise: Bool

    /// Enhance contrast for better text visibility.
    public let contrastEnhance: Bool

    /// Binarization method: "otsu", "sauvola", "adaptive".
    public let binarizationMethod: String

    /// Invert colors (white text on black → black on white).
    public let invertColors: Bool

    /// Creates a preprocessing configuration; every setting must be supplied.
    public init(
        targetDpi: Int32,
        autoRotate: Bool,
        deskew: Bool,
        denoise: Bool,
        contrastEnhance: Bool,
        binarizationMethod: String,
        invertColors: Bool
    ) {
        self.targetDpi = targetDpi
        self.autoRotate = autoRotate
        self.deskew = deskew
        self.denoise = denoise
        self.contrastEnhance = contrastEnhance
        self.binarizationMethod = binarizationMethod
        self.invertColors = invertColors
    }

    /// Maps each property to its JSON key (snake_case where the names differ).
    private enum CodingKeys: String, CodingKey {
        case targetDpi = "target_dpi"
        case autoRotate = "auto_rotate"
        case deskew
        case denoise
        case contrastEnhance = "contrast_enhance"
        case binarizationMethod = "binarization_method"
        case invertColors = "invert_colors"
    }

    /// Decodes a configuration, filling in a default for every absent or null key.
    ///
    /// Defaults: 300 DPI, auto-rotate on, deskew on, denoise off, contrast
    /// enhancement off, `"otsu"` binarization, colour inversion off.
    ///
    /// - Throws: Decoding errors for a missing keyed container or a value of the wrong type.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)

        // Reads one boolean switch, using `fallback` when the key is missing.
        func flag(_ key: CodingKeys, or fallback: Bool) throws -> Bool {
            return try values.decodeIfPresent(Bool.self, forKey: key) ?? fallback
        }

        self.targetDpi = try values.decodeIfPresent(Int32.self, forKey: .targetDpi) ?? 300
        self.autoRotate = try flag(.autoRotate, or: true)
        self.deskew = try flag(.deskew, or: true)
        self.denoise = try flag(.denoise, or: false)
        self.contrastEnhance = try flag(.contrastEnhance, or: false)
        self.binarizationMethod = try values.decodeIfPresent(String.self, forKey: .binarizationMethod) ?? "otsu"
        self.invertColors = try flag(.invertColors, or: false)
    }
}
// MARK: - Internal FFI conversions for ImagePreprocessingConfig
internal extension ImagePreprocessingConfig {
    /// Builds an `ImagePreprocessingConfig` by copying every field out of a bridge reference.
    ///
    /// Declared `throws`, although no call in the body is a throwing one.
    ///
    /// - Parameter source: Bridge-side configuration to read from.
    init(_ source: RustBridge.ImagePreprocessingConfigRef) throws {
        self.targetDpi = source.targetDpi()
        self.autoRotate = source.autoRotate()
        self.deskew = source.deskew()
        self.denoise = source.denoise()
        self.contrastEnhance = source.contrastEnhance()
        let method = source.binarizationMethod().toString()
        self.binarizationMethod = method
        self.invertColors = source.invertColors()
    }

    /// Converts this value into its bridge representation.
    ///
    /// Fields are passed positionally in declaration order; the binarization
    /// method is wrapped in a `RustString`.
    func intoRust() throws -> RustBridge.ImagePreprocessingConfig {
        let method = RustString(binarizationMethod)
        return RustBridge.ImagePreprocessingConfig(
            targetDpi,
            autoRotate,
            deskew,
            denoise,
            contrastEnhance,
            method,
            invertColors
        )
    }
}
/// Tesseract OCR configuration.
///
/// Provides fine-grained control over Tesseract OCR engine parameters.
/// Most users can use the defaults, but these settings allow optimization
/// for specific document types (invoices, handwriting, etc.).
public struct TesseractConfig: Codable, Sendable, Hashable {
    /// Language code (e.g., "eng", "deu", "fra")
    public let language: String

    /// Page Segmentation Mode (0-13).
    ///
    /// Common values:
    /// - 3: Fully automatic page segmentation (native default)
    /// - 6: Assume a single uniform block of text (WASM default; avoids layout-analysis hang)
    /// - 11: Sparse text with no particular order
    public let psm: Int32

    /// Output format ("text" or "markdown")
    public let outputFormat: String

    /// OCR Engine Mode (0-3).
    ///
    /// - 0: Legacy engine only
    /// - 1: Neural nets (LSTM) only (usually best)
    /// - 2: Legacy + LSTM
    /// - 3: Default (based on what's available)
    public let oem: Int32

    /// Minimum confidence threshold (0.0-100.0).
    ///
    /// Words with confidence below this threshold may be rejected or flagged.
    public let minConfidence: Double

    /// Image preprocessing configuration.
    ///
    /// Controls how images are preprocessed before OCR. Can significantly
    /// improve quality for scanned documents or low-quality images.
    public let preprocessing: ImagePreprocessingConfig?

    /// Enable automatic table detection and reconstruction
    public let enableTableDetection: Bool

    /// Minimum confidence threshold for table detection (0.0-1.0)
    public let tableMinConfidence: Double

    /// Column threshold for table detection (pixels)
    public let tableColumnThreshold: Int32

    /// Row threshold ratio for table detection (0.0-1.0)
    public let tableRowThresholdRatio: Double

    /// Enable OCR result caching
    public let useCache: Bool

    /// Use pre-adapted templates for character classification
    public let classifyUsePreAdaptedTemplates: Bool

    /// Enable N-gram language model
    public let languageModelNgramOn: Bool

    /// Don't reject good words during block-level processing
    public let tesseditDontBlkrejGoodWds: Bool

    /// Don't reject good words during row-level processing
    public let tesseditDontRowrejGoodWds: Bool

    /// Enable dictionary correction
    public let tesseditEnableDictCorrection: Bool

    /// Whitelist of allowed characters (empty = all allowed)
    public let tesseditCharWhitelist: String

    /// Blacklist of forbidden characters (empty = none forbidden)
    public let tesseditCharBlacklist: String

    /// Use primary language params model
    public let tesseditUsePrimaryParamsModel: Bool

    /// Variable-width space detection
    public let textordSpaceSizeIsVariable: Bool

    /// Use adaptive thresholding method
    public let thresholdingMethod: Bool

    /// Creates a Tesseract configuration.
    ///
    /// Only `preprocessing` has a default (`nil`); every other setting must be supplied.
    public init(
        language: String,
        psm: Int32,
        outputFormat: String,
        oem: Int32,
        minConfidence: Double,
        preprocessing: ImagePreprocessingConfig? = nil,
        enableTableDetection: Bool,
        tableMinConfidence: Double,
        tableColumnThreshold: Int32,
        tableRowThresholdRatio: Double,
        useCache: Bool,
        classifyUsePreAdaptedTemplates: Bool,
        languageModelNgramOn: Bool,
        tesseditDontBlkrejGoodWds: Bool,
        tesseditDontRowrejGoodWds: Bool,
        tesseditEnableDictCorrection: Bool,
        tesseditCharWhitelist: String,
        tesseditCharBlacklist: String,
        tesseditUsePrimaryParamsModel: Bool,
        textordSpaceSizeIsVariable: Bool,
        thresholdingMethod: Bool
    ) {
        self.language = language
        self.psm = psm
        self.outputFormat = outputFormat
        self.oem = oem
        self.minConfidence = minConfidence
        self.preprocessing = preprocessing
        self.enableTableDetection = enableTableDetection
        self.tableMinConfidence = tableMinConfidence
        self.tableColumnThreshold = tableColumnThreshold
        self.tableRowThresholdRatio = tableRowThresholdRatio
        self.useCache = useCache
        self.classifyUsePreAdaptedTemplates = classifyUsePreAdaptedTemplates
        self.languageModelNgramOn = languageModelNgramOn
        self.tesseditDontBlkrejGoodWds = tesseditDontBlkrejGoodWds
        self.tesseditDontRowrejGoodWds = tesseditDontRowrejGoodWds
        self.tesseditEnableDictCorrection = tesseditEnableDictCorrection
        self.tesseditCharWhitelist = tesseditCharWhitelist
        self.tesseditCharBlacklist = tesseditCharBlacklist
        self.tesseditUsePrimaryParamsModel = tesseditUsePrimaryParamsModel
        self.textordSpaceSizeIsVariable = textordSpaceSizeIsVariable
        self.thresholdingMethod = thresholdingMethod
    }

    /// Maps each property to its JSON key (snake_case where the names differ).
    private enum CodingKeys: String, CodingKey {
        case language
        case psm
        case outputFormat = "output_format"
        case oem
        case minConfidence = "min_confidence"
        case preprocessing
        case enableTableDetection = "enable_table_detection"
        case tableMinConfidence = "table_min_confidence"
        case tableColumnThreshold = "table_column_threshold"
        case tableRowThresholdRatio = "table_row_threshold_ratio"
        case useCache = "use_cache"
        case classifyUsePreAdaptedTemplates = "classify_use_pre_adapted_templates"
        case languageModelNgramOn = "language_model_ngram_on"
        case tesseditDontBlkrejGoodWds = "tessedit_dont_blkrej_good_wds"
        case tesseditDontRowrejGoodWds = "tessedit_dont_rowrej_good_wds"
        case tesseditEnableDictCorrection = "tessedit_enable_dict_correction"
        case tesseditCharWhitelist = "tessedit_char_whitelist"
        case tesseditCharBlacklist = "tessedit_char_blacklist"
        case tesseditUsePrimaryParamsModel = "tessedit_use_primary_params_model"
        case textordSpaceSizeIsVariable = "textord_space_size_is_variable"
        case thresholdingMethod = "thresholding_method"
    }

    /// Decodes a configuration, filling in a default for every absent or null key.
    ///
    /// The default applied to each key is the literal on the corresponding line below.
    ///
    /// - Throws: Decoding errors for a missing keyed container or a value of the wrong type.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)

        // Reads one boolean switch, using `fallback` when the key is missing.
        func flag(_ key: CodingKeys, or fallback: Bool) throws -> Bool {
            return try values.decodeIfPresent(Bool.self, forKey: key) ?? fallback
        }

        self.language = try values.decodeIfPresent(String.self, forKey: .language) ?? "eng"
        self.psm = try values.decodeIfPresent(Int32.self, forKey: .psm) ?? 3
        self.outputFormat = try values.decodeIfPresent(String.self, forKey: .outputFormat) ?? "markdown"
        self.oem = try values.decodeIfPresent(Int32.self, forKey: .oem) ?? 3
        self.minConfidence = try values.decodeIfPresent(Double.self, forKey: .minConfidence) ?? 0.0
        // An absent preprocessing section simply stays nil.
        self.preprocessing = try values.decodeIfPresent(ImagePreprocessingConfig.self, forKey: .preprocessing)
        self.enableTableDetection = try flag(.enableTableDetection, or: true)
        self.tableMinConfidence = try values.decodeIfPresent(Double.self, forKey: .tableMinConfidence) ?? 0.0
        self.tableColumnThreshold = try values.decodeIfPresent(Int32.self, forKey: .tableColumnThreshold) ?? 50
        self.tableRowThresholdRatio = try values.decodeIfPresent(Double.self, forKey: .tableRowThresholdRatio) ?? 0.5
        self.useCache = try flag(.useCache, or: true)
        self.classifyUsePreAdaptedTemplates = try flag(.classifyUsePreAdaptedTemplates, or: true)
        self.languageModelNgramOn = try flag(.languageModelNgramOn, or: false)
        self.tesseditDontBlkrejGoodWds = try flag(.tesseditDontBlkrejGoodWds, or: true)
        self.tesseditDontRowrejGoodWds = try flag(.tesseditDontRowrejGoodWds, or: true)
        self.tesseditEnableDictCorrection = try flag(.tesseditEnableDictCorrection, or: true)
        self.tesseditCharWhitelist = try values.decodeIfPresent(String.self, forKey: .tesseditCharWhitelist) ?? ""
        self.tesseditCharBlacklist = try values.decodeIfPresent(String.self, forKey: .tesseditCharBlacklist) ?? ""
        self.tesseditUsePrimaryParamsModel = try flag(.tesseditUsePrimaryParamsModel, or: true)
        self.textordSpaceSizeIsVariable = try flag(.textordSpaceSizeIsVariable, or: true)
        self.thresholdingMethod = try flag(.thresholdingMethod, or: false)
    }
}
// MARK: - Internal FFI conversions for TesseractConfig
internal extension TesseractConfig {
    /// Builds a `TesseractConfig` by copying every field out of a bridge reference.
    ///
    /// - Parameter source: Bridge-side configuration to read from.
    /// - Throws: Any error thrown while converting the optional preprocessing section.
    init(_ source: RustBridge.TesseractConfigRef) throws {
        self.language = source.language().toString()
        self.psm = source.psm()
        self.outputFormat = source.outputFormat().toString()
        self.oem = source.oem()
        self.minConfidence = source.minConfidence()
        // Only convert the preprocessing section when the bridge supplies one.
        if let bridgedPreprocessing = source.preprocessing() {
            self.preprocessing = try ImagePreprocessingConfig(bridgedPreprocessing)
        } else {
            self.preprocessing = nil
        }
        self.enableTableDetection = source.enableTableDetection()
        self.tableMinConfidence = source.tableMinConfidence()
        self.tableColumnThreshold = source.tableColumnThreshold()
        self.tableRowThresholdRatio = source.tableRowThresholdRatio()
        self.useCache = source.useCache()
        self.classifyUsePreAdaptedTemplates = source.classifyUsePreAdaptedTemplates()
        self.languageModelNgramOn = source.languageModelNgramOn()
        self.tesseditDontBlkrejGoodWds = source.tesseditDontBlkrejGoodWds()
        self.tesseditDontRowrejGoodWds = source.tesseditDontRowrejGoodWds()
        self.tesseditEnableDictCorrection = source.tesseditEnableDictCorrection()
        self.tesseditCharWhitelist = source.tesseditCharWhitelist().toString()
        self.tesseditCharBlacklist = source.tesseditCharBlacklist().toString()
        self.tesseditUsePrimaryParamsModel = source.tesseditUsePrimaryParamsModel()
        self.textordSpaceSizeIsVariable = source.textordSpaceSizeIsVariable()
        self.thresholdingMethod = source.thresholdingMethod()
    }

    /// Converts this value into its bridge representation by way of JSON.
    ///
    /// - Returns: The value produced by `RustBridge.tesseractConfigFromJson`.
    /// - Throws: Any error from `JSONEncoder.encode` or from the bridge parser.
    func intoRust() throws -> RustBridge.TesseractConfig {
        let encoded = try JSONEncoder().encode(self)
        // Hand the bridge an empty object if the bytes cannot be read as UTF-8 text.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.tesseractConfigFromJson("{}")
        }
        return try RustBridge.tesseractConfigFromJson(payload)
    }
}
/// Image preprocessing metadata.
///
/// Tracks the transformations applied to an image during OCR preprocessing,
/// including DPI normalization, resizing, and resampling.
public struct ImagePreprocessingMetadata: Codable, Sendable, Hashable {
    /// Original image dimensions (width, height) in pixels
    public let originalDimensions: [UInt]

    /// Original image DPI (horizontal, vertical)
    public let originalDpi: [Double]

    /// Target DPI from configuration
    public let targetDpi: Int32

    /// Scaling factor applied to the image
    public let scaleFactor: Double

    /// Whether DPI was auto-adjusted based on content
    public let autoAdjusted: Bool

    /// Final DPI after processing
    public let finalDpi: Int32

    /// New dimensions after resizing (if resized)
    public let newDimensions: [UInt]?

    /// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
    public let resampleMethod: String

    /// Whether dimensions were clamped to max_image_dimension
    public let dimensionClamped: Bool

    /// Calculated optimal DPI (if auto_adjust_dpi enabled)
    public let calculatedDpi: Int32?

    /// Whether resize was skipped (dimensions already optimal)
    public let skippedResize: Bool

    /// Error message if resize failed
    public let resizeError: String?

    /// Creates preprocessing metadata from its parts.
    ///
    /// `newDimensions`, `calculatedDpi` and `resizeError` default to `nil`.
    public init(
        originalDimensions: [UInt],
        originalDpi: [Double],
        targetDpi: Int32,
        scaleFactor: Double,
        autoAdjusted: Bool,
        finalDpi: Int32,
        newDimensions: [UInt]? = nil,
        resampleMethod: String,
        dimensionClamped: Bool,
        calculatedDpi: Int32? = nil,
        skippedResize: Bool,
        resizeError: String? = nil
    ) {
        self.originalDimensions = originalDimensions
        self.originalDpi = originalDpi
        self.targetDpi = targetDpi
        self.scaleFactor = scaleFactor
        self.autoAdjusted = autoAdjusted
        self.finalDpi = finalDpi
        self.newDimensions = newDimensions
        self.resampleMethod = resampleMethod
        self.dimensionClamped = dimensionClamped
        self.calculatedDpi = calculatedDpi
        self.skippedResize = skippedResize
        self.resizeError = resizeError
    }

    /// Maps each camelCase property to its snake_case JSON key.
    private enum CodingKeys: String, CodingKey {
        case originalDimensions = "original_dimensions"
        case originalDpi = "original_dpi"
        case targetDpi = "target_dpi"
        case scaleFactor = "scale_factor"
        case autoAdjusted = "auto_adjusted"
        case finalDpi = "final_dpi"
        case newDimensions = "new_dimensions"
        case resampleMethod = "resample_method"
        case dimensionClamped = "dimension_clamped"
        case calculatedDpi = "calculated_dpi"
        case skippedResize = "skipped_resize"
        case resizeError = "resize_error"
    }
}
// MARK: - Internal FFI conversions for ImagePreprocessingMetadata
internal extension ImagePreprocessingMetadata {
    /// Builds `ImagePreprocessingMetadata` by copying every field out of a bridge reference.
    ///
    /// Declared `throws`, although no call in the body is a throwing one.
    ///
    /// - Parameter source: Bridge-side metadata to read from.
    init(_ source: RustBridge.ImagePreprocessingMetadataRef) throws {
        self.originalDimensions = Array(source.originalDimensions())
        self.originalDpi = Array(source.originalDpi())
        self.targetDpi = source.targetDpi()
        self.scaleFactor = source.scaleFactor()
        self.autoAdjusted = source.autoAdjusted()
        self.finalDpi = source.finalDpi()
        // The resized dimensions are optional on the bridge side as well.
        self.newDimensions = source.newDimensions().map { resized in
            Array(resized)
        }
        self.resampleMethod = source.resampleMethod().toString()
        self.dimensionClamped = source.dimensionClamped()
        self.calculatedDpi = source.calculatedDpi()
        self.skippedResize = source.skippedResize()
        self.resizeError = source.resizeError()?.toString()
    }

    /// Converts this value into its bridge representation by way of JSON.
    ///
    /// - Returns: The value produced by `RustBridge.imagePreprocessingMetadataFromJson`.
    /// - Throws: Any error from `JSONEncoder.encode` or from the bridge parser.
    func intoRust() throws -> RustBridge.ImagePreprocessingMetadata {
        let encoded = try JSONEncoder().encode(self)
        // Hand the bridge an empty object if the bytes cannot be read as UTF-8 text.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.imagePreprocessingMetadataFromJson("{}")
        }
        return try RustBridge.imagePreprocessingMetadataFromJson(payload)
    }
}
/// Extraction result metadata.
///
/// Contains common fields applicable to all formats, format-specific metadata
/// via a discriminated union, and additional custom fields from postprocessors.
///
/// Direct alias for the bridge type `RustBridge.Metadata`.
public typealias Metadata = RustBridge.Metadata
/// Excel/spreadsheet format metadata.
///
/// Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
/// discriminant. Sheet count and sheet names are stored inside this struct.
public struct ExcelMetadata: Codable, Sendable, Hashable {
    /// Number of sheets in the workbook.
    public let sheetCount: UInt32?

    /// Names of all sheets in the workbook.
    public let sheetNames: [String]?

    /// Creates spreadsheet metadata; both fields default to `nil`.
    public init(
        sheetCount: UInt32? = nil,
        sheetNames: [String]? = nil
    ) {
        self.sheetCount = sheetCount
        self.sheetNames = sheetNames
    }

    /// Maps each camelCase property to its snake_case JSON key.
    private enum CodingKeys: String, CodingKey {
        case sheetCount = "sheet_count"
        case sheetNames = "sheet_names"
    }

    /// Decodes spreadsheet metadata; an absent or null key leaves its field `nil`.
    ///
    /// - Throws: Decoding errors for a missing keyed container or a value of the wrong type.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        let count = try values.decodeIfPresent(UInt32.self, forKey: .sheetCount)
        let names = try values.decodeIfPresent([String].self, forKey: .sheetNames)
        self.sheetCount = count
        self.sheetNames = names
    }
}
// MARK: - Internal FFI conversions for ExcelMetadata
internal extension ExcelMetadata {
    /// Builds `ExcelMetadata` by copying both fields out of a bridge reference.
    ///
    /// Declared `throws`, although no call in the body is a throwing one.
    ///
    /// - Parameter source: Bridge-side metadata to read from.
    init(_ source: RustBridge.ExcelMetadataRef) throws {
        self.sheetCount = source.sheetCount()
        // Each bridged sheet name is converted to a Swift `String` individually.
        self.sheetNames = source.sheetNames()?.map { entry in
            entry.as_str().toString()
        }
    }

    /// Converts this value into its bridge representation by way of JSON.
    ///
    /// - Returns: The value produced by `RustBridge.excelMetadataFromJson`.
    /// - Throws: Any error from `JSONEncoder.encode` or from the bridge parser.
    func intoRust() throws -> RustBridge.ExcelMetadata {
        let encoded = try JSONEncoder().encode(self)
        // Hand the bridge an empty object if the bytes cannot be read as UTF-8 text.
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.excelMetadataFromJson("{}")
        }
        return try RustBridge.excelMetadataFromJson(payload)
    }
}
/// Metadata extracted from `.eml` and `.msg` email files.
///
/// Covers sender and recipient information, the message ID, and the attachment list.
public struct EmailMetadata: Codable, Sendable, Hashable {
    /// Sender's email address
    public let fromEmail: String?
    /// Sender's display name
    public let fromName: String?
    /// Primary recipients
    public let toEmails: [String]
    /// CC recipients
    public let ccEmails: [String]
    /// BCC recipients
    public let bccEmails: [String]
    /// Message-ID header value
    public let messageId: String?
    /// List of attachment filenames
    public let attachments: [String]

    /// Memberwise initializer; the optional string fields default to `nil`.
    public init(
        fromEmail: String? = nil,
        fromName: String? = nil,
        toEmails: [String],
        ccEmails: [String],
        bccEmails: [String],
        messageId: String? = nil,
        attachments: [String]
    ) {
        self.fromEmail = fromEmail
        self.fromName = fromName
        self.toEmails = toEmails
        self.ccEmails = ccEmails
        self.bccEmails = bccEmails
        self.messageId = messageId
        self.attachments = attachments
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case fromEmail = "from_email"
        case fromName = "from_name"
        case toEmails = "to_emails"
        case ccEmails = "cc_emails"
        case bccEmails = "bcc_emails"
        case messageId = "message_id"
        case attachments
    }

    /// Decodes the value; absent or `null` keys become `nil` for the optional
    /// strings and an empty array for the list fields.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        fromEmail = try values.decodeIfPresent(String.self, forKey: .fromEmail)
        fromName = try values.decodeIfPresent(String.self, forKey: .fromName)
        toEmails = try values.decodeIfPresent([String].self, forKey: .toEmails) ?? []
        ccEmails = try values.decodeIfPresent([String].self, forKey: .ccEmails) ?? []
        bccEmails = try values.decodeIfPresent([String].self, forKey: .bccEmails) ?? []
        messageId = try values.decodeIfPresent(String.self, forKey: .messageId)
        attachments = try values.decodeIfPresent([String].self, forKey: .attachments) ?? []
    }
}
// MARK: - Internal FFI conversions for EmailMetadata
internal extension EmailMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.EmailMetadataRef) throws {
        self.fromEmail = rb.fromEmail()?.toString()
        self.fromName = rb.fromName()?.toString()
        self.toEmails = rb.toEmails().map { address in address.as_str().toString() }
        self.ccEmails = rb.ccEmails().map { address in address.as_str().toString() }
        self.bccEmails = rb.bccEmails().map { address in address.as_str().toString() }
        self.messageId = rb.messageId()?.toString()
        self.attachments = rb.attachments().map { name in name.as_str().toString() }
    }

    /// Builds the bridge value directly through its constructor, copying each
    /// string list into a fresh `RustVec<RustString>` first.
    func intoRust() throws -> RustBridge.EmailMetadata {
        let toVec = RustVec<RustString>()
        self.toEmails.forEach { toVec.push(value: RustString($0)) }

        let ccVec = RustVec<RustString>()
        self.ccEmails.forEach { ccVec.push(value: RustString($0)) }

        let bccVec = RustVec<RustString>()
        self.bccEmails.forEach { bccVec.push(value: RustString($0)) }

        let attachmentVec = RustVec<RustString>()
        self.attachments.forEach { attachmentVec.push(value: RustString($0)) }

        return RustBridge.EmailMetadata(
            self.fromEmail.map(RustString.init),
            self.fromName.map(RustString.init),
            toVec,
            ccVec,
            bccVec,
            self.messageId.map(RustString.init),
            attachmentVec
        )
    }
}
/// Metadata for archive files (ZIP/TAR/7Z).
///
/// Extracted from compressed archives; holds the file list and size information.
public struct ArchiveMetadata: Codable, Sendable, Hashable {
    /// Archive format ("ZIP", "TAR", "7Z", etc.)
    public let format: String
    /// Total number of files in the archive
    public let fileCount: UInt32
    /// List of file paths within the archive
    public let fileList: [String]
    /// Total uncompressed size in bytes
    public let totalSize: UInt64
    /// Compressed size in bytes (if available)
    public let compressedSize: UInt64?

    /// Memberwise initializer; `compressedSize` defaults to `nil`.
    public init(
        format: String,
        fileCount: UInt32,
        fileList: [String],
        totalSize: UInt64,
        compressedSize: UInt64? = nil
    ) {
        self.format = format
        self.fileCount = fileCount
        self.fileList = fileList
        self.totalSize = totalSize
        self.compressedSize = compressedSize
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case format
        case fileCount = "file_count"
        case fileList = "file_list"
        case totalSize = "total_size"
        case compressedSize = "compressed_size"
    }

    /// Decodes the value; absent or `null` keys fall back to an empty string,
    /// zero, an empty array, or `nil` depending on the field type.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        format = try values.decodeIfPresent(String.self, forKey: .format) ?? ""
        fileCount = try values.decodeIfPresent(UInt32.self, forKey: .fileCount) ?? 0
        fileList = try values.decodeIfPresent([String].self, forKey: .fileList) ?? []
        totalSize = try values.decodeIfPresent(UInt64.self, forKey: .totalSize) ?? 0
        compressedSize = try values.decodeIfPresent(UInt64.self, forKey: .compressedSize)
    }
}
// MARK: - Internal FFI conversions for ArchiveMetadata
internal extension ArchiveMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.ArchiveMetadataRef) throws {
        self.format = rb.format().toString()
        self.fileCount = rb.fileCount()
        self.fileList = rb.fileList().map { path in path.as_str().toString() }
        self.totalSize = rb.totalSize()
        self.compressedSize = rb.compressedSize()
    }

    /// Builds the bridge value through its constructor, copying `fileList`
    /// into a fresh `RustVec<RustString>` first.
    func intoRust() throws -> RustBridge.ArchiveMetadata {
        let pathVec = RustVec<RustString>()
        self.fileList.forEach { pathVec.push(value: RustString($0)) }
        return RustBridge.ArchiveMetadata(
            RustString(self.format),
            self.fileCount,
            pathVec,
            self.totalSize,
            self.compressedSize
        )
    }
}
/// Image metadata extracted from image files.
///
/// Includes dimensions, format, and EXIF data.
///
/// Declared as a direct alias of `RustBridge.ImageMetadata`.
public typealias ImageMetadata = RustBridge.ImageMetadata
/// Metadata gathered while parsing an XML document.
///
/// Holds statistics about the structure of the XML document.
public struct XmlMetadata: Codable, Sendable, Hashable {
    /// Total number of XML elements processed
    public let elementCount: UInt32
    /// List of unique element tag names (sorted)
    public let uniqueElements: [String]

    /// Memberwise initializer.
    public init(
        elementCount: UInt32,
        uniqueElements: [String]
    ) {
        self.elementCount = elementCount
        self.uniqueElements = uniqueElements
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case elementCount = "element_count"
        case uniqueElements = "unique_elements"
    }

    /// Decodes the value; absent or `null` keys fall back to `0` and `[]`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        elementCount = try values.decodeIfPresent(UInt32.self, forKey: .elementCount) ?? 0
        uniqueElements = try values.decodeIfPresent([String].self, forKey: .uniqueElements) ?? []
    }
}
// MARK: - Internal FFI conversions for XmlMetadata
internal extension XmlMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.XmlMetadataRef) throws {
        self.elementCount = rb.elementCount()
        self.uniqueElements = rb.uniqueElements().map { tag in tag.as_str().toString() }
    }

    /// Builds the bridge value through its constructor, copying
    /// `uniqueElements` into a fresh `RustVec<RustString>` first.
    func intoRust() throws -> RustBridge.XmlMetadata {
        let tagVec = RustVec<RustString>()
        self.uniqueElements.forEach { tagVec.push(value: RustString($0)) }
        return RustBridge.XmlMetadata(self.elementCount, tagVec)
    }
}
/// Metadata for plain text and Markdown files.
///
/// Includes word counts and, for Markdown, structural elements such as
/// headers and links.
public struct TextMetadata: Codable, Sendable, Hashable {
    /// Number of lines in the document
    public let lineCount: UInt32
    /// Number of words
    public let wordCount: UInt32
    /// Number of characters
    public let characterCount: UInt32
    /// Markdown headers (headings text only, for Markdown files)
    public let headers: [String]?
    /// Markdown links as (text, url) tuples (for Markdown files)
    public let links: [[String]]?
    /// Code blocks as (language, code) tuples (for Markdown files)
    public let codeBlocks: [[String]]?

    /// Memberwise initializer; the Markdown-specific fields default to `nil`.
    public init(
        lineCount: UInt32,
        wordCount: UInt32,
        characterCount: UInt32,
        headers: [String]? = nil,
        links: [[String]]? = nil,
        codeBlocks: [[String]]? = nil
    ) {
        self.lineCount = lineCount
        self.wordCount = wordCount
        self.characterCount = characterCount
        self.headers = headers
        self.links = links
        self.codeBlocks = codeBlocks
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case lineCount = "line_count"
        case wordCount = "word_count"
        case characterCount = "character_count"
        case headers
        case links
        case codeBlocks = "code_blocks"
    }

    /// Decodes the value; absent or `null` counters become `0` and absent or
    /// `null` optional lists become `nil`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        lineCount = try values.decodeIfPresent(UInt32.self, forKey: .lineCount) ?? 0
        wordCount = try values.decodeIfPresent(UInt32.self, forKey: .wordCount) ?? 0
        characterCount = try values.decodeIfPresent(UInt32.self, forKey: .characterCount) ?? 0
        headers = try values.decodeIfPresent([String].self, forKey: .headers)
        links = try values.decodeIfPresent([[String]].self, forKey: .links)
        codeBlocks = try values.decodeIfPresent([[String]].self, forKey: .codeBlocks)
    }
}
// MARK: - Internal FFI conversions for TextMetadata
internal extension TextMetadata {
    /// Copies the counters and headers exposed by a bridge reference into a Swift value.
    ///
    /// NOTE(review): `links` and `codeBlocks` are not read from `rb`; both are
    /// always set to `nil` by this conversion.
    init(_ rb: RustBridge.TextMetadataRef) throws {
        self.lineCount = rb.lineCount()
        self.wordCount = rb.wordCount()
        self.characterCount = rb.characterCount()
        self.headers = rb.headers()?.map { heading in heading.as_str().toString() }
        self.links = nil
        self.codeBlocks = nil
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.textMetadataFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.TextMetadata {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.textMetadataFromJson(payload)
    }
}
/// Metadata for a single header/heading element.
public struct HeaderMetadata: Codable, Sendable, Hashable {
    /// Header level: 1 (h1) through 6 (h6)
    public let level: UInt8
    /// Normalized text content of the header
    public let text: String
    /// HTML id attribute if present
    public let id: String?
    /// Document tree depth at the header element
    public let depth: UInt32
    /// Byte offset in original HTML document
    public let htmlOffset: UInt32

    /// Memberwise initializer; `id` defaults to `nil`.
    public init(
        level: UInt8,
        text: String,
        id: String? = nil,
        depth: UInt32,
        htmlOffset: UInt32
    ) {
        self.level = level
        self.text = text
        self.id = id
        self.depth = depth
        self.htmlOffset = htmlOffset
    }

    /// Only `htmlOffset` needs an explicit key; the other cases use their own names.
    private enum CodingKeys: String, CodingKey {
        case level
        case text
        case id
        case depth
        case htmlOffset = "html_offset"
    }
}
// MARK: - Internal FFI conversions for HeaderMetadata
internal extension HeaderMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.HeaderMetadataRef) throws {
        self.level = rb.level()
        self.text = rb.text().toString()
        self.id = rb.id()?.toString()
        self.depth = rb.depth()
        self.htmlOffset = rb.htmlOffset()
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.headerMetadataFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.HeaderMetadata {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.headerMetadataFromJson(payload)
    }
}
/// Metadata for a single link element.
public struct LinkMetadata: Codable, Sendable, Hashable {
    /// The href URL value
    public let href: String
    /// Link text content (normalized)
    public let text: String
    /// Optional title attribute
    public let title: String?
    /// Link type classification
    public let linkType: LinkType
    /// Rel attribute values
    public let rel: [String]
    /// Additional attributes as key-value pairs
    public let attributes: [[String]]

    /// Memberwise initializer; `title` defaults to `nil`.
    public init(
        href: String,
        text: String,
        title: String? = nil,
        linkType: LinkType,
        rel: [String],
        attributes: [[String]]
    ) {
        self.href = href
        self.text = text
        self.title = title
        self.linkType = linkType
        self.rel = rel
        self.attributes = attributes
    }

    /// Only `linkType` needs an explicit key; the other cases use their own names.
    private enum CodingKeys: String, CodingKey {
        case href
        case text
        case title
        case linkType = "link_type"
        case rel
        case attributes
    }
}
// MARK: - Internal FFI conversions for LinkMetadata
internal extension LinkMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    ///
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports a link
    ///   type string that has no matching `LinkType` case.
    init(_ rb: RustBridge.LinkMetadataRef) throws {
        self.href = rb.href().toString()
        self.text = rb.text().toString()
        self.title = rb.title()?.toString()

        // Surface an unknown discriminant as a thrown error rather than
        // terminating the process; this initializer is already `throws`.
        let rawLinkType = rb.linkType().toString()
        guard let resolvedLinkType = LinkType(rawValue: rawLinkType) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(codingPath: [], debugDescription: "Unknown LinkType: \(rawLinkType)")
            )
        }
        self.linkType = resolvedLinkType

        self.rel = rb.rel().map { value in value.as_str().toString() }

        // Attributes are not read from `rb` in this conversion. The earlier code
        // decoded the JSON literal `null` as `[[String]]`, which always throws,
        // so every conversion failed. An empty list keeps the value usable.
        // NOTE(review): whether `LinkMetadataRef` exposes the attributes is not
        // visible from here; wire them through if an accessor exists.
        self.attributes = []
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.linkMetadataFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.LinkMetadata {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.linkMetadataFromJson(json)
    }
}
/// Metadata for a single image element.
public struct ImageMetadataType: Codable, Sendable, Hashable {
    /// Image source (URL, data URI, or SVG content)
    public let src: String
    /// Alternative text from alt attribute
    public let alt: String?
    /// Title attribute
    public let title: String?
    /// Image dimensions as (width, height) if available
    public let dimensions: [UInt32]?
    /// Image type classification
    public let imageType: ImageType
    /// Additional attributes as key-value pairs
    public let attributes: [[String]]

    /// Memberwise initializer; `alt`, `title` and `dimensions` default to `nil`.
    public init(
        src: String,
        alt: String? = nil,
        title: String? = nil,
        dimensions: [UInt32]? = nil,
        imageType: ImageType,
        attributes: [[String]]
    ) {
        self.src = src
        self.alt = alt
        self.title = title
        self.dimensions = dimensions
        self.imageType = imageType
        self.attributes = attributes
    }

    /// Only `imageType` needs an explicit key; the other cases use their own names.
    private enum CodingKeys: String, CodingKey {
        case src
        case alt
        case title
        case dimensions
        case imageType = "image_type"
        case attributes
    }
}
// MARK: - Internal FFI conversions for ImageMetadataType
internal extension ImageMetadataType {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    ///
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports an image
    ///   type string that has no matching `ImageType` case.
    init(_ rb: RustBridge.ImageMetadataTypeRef) throws {
        self.src = rb.src().toString()
        self.alt = rb.alt()?.toString()
        self.title = rb.title()?.toString()
        self.dimensions = rb.dimensions().map { Array($0) }

        // Surface an unknown discriminant as a thrown error rather than
        // terminating the process; this initializer is already `throws`.
        let rawImageType = rb.imageType().toString()
        guard let resolvedImageType = ImageType(rawValue: rawImageType) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(codingPath: [], debugDescription: "Unknown ImageType: \(rawImageType)")
            )
        }
        self.imageType = resolvedImageType

        // Attributes are not read from `rb` in this conversion. The earlier code
        // decoded the JSON literal `null` as `[[String]]`, which always throws,
        // so every conversion failed. An empty list keeps the value usable.
        // NOTE(review): whether `ImageMetadataTypeRef` exposes the attributes is
        // not visible from here; wire them through if an accessor exists.
        self.attributes = []
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.imageMetadataTypeFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.ImageMetadataType {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.imageMetadataTypeFromJson(json)
    }
}
/// A block of structured data (Schema.org, microdata, RDFa).
public struct StructuredData: Codable, Sendable, Hashable {
    /// Type of structured data
    public let dataType: StructuredDataType
    /// Raw JSON string representation
    public let rawJson: String
    /// Schema type if detectable (e.g., "Article", "Event", "Product")
    public let schemaType: String?

    /// Memberwise initializer; `schemaType` defaults to `nil`.
    public init(
        dataType: StructuredDataType,
        rawJson: String,
        schemaType: String? = nil
    ) {
        self.dataType = dataType
        self.rawJson = rawJson
        self.schemaType = schemaType
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case dataType = "data_type"
        case rawJson = "raw_json"
        case schemaType = "schema_type"
    }
}
// MARK: - Internal FFI conversions for StructuredData
internal extension StructuredData {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    ///
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports a data
    ///   type string that has no matching `StructuredDataType` case.
    init(_ rb: RustBridge.StructuredDataRef) throws {
        // Surface an unknown discriminant as a thrown error rather than
        // terminating the process; this initializer is already `throws`.
        let rawDataType = rb.dataType().toString()
        guard let resolvedDataType = StructuredDataType(rawValue: rawDataType) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(codingPath: [], debugDescription: "Unknown StructuredDataType: \(rawDataType)")
            )
        }
        self.dataType = resolvedDataType
        self.rawJson = rb.rawJson().toString()
        self.schemaType = rb.schemaType()?.toString()
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.structuredDataFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.StructuredData {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.structuredDataFromJson(json)
    }
}
/// HTML metadata extracted from HTML documents.
///
/// Includes document-level metadata, Open Graph data, Twitter Card metadata,
/// and extracted structural elements (headers, links, images, structured data).
///
/// Declared as a direct alias of `RustBridge.HtmlMetadata`.
public typealias HtmlMetadata = RustBridge.HtmlMetadata
/// Metadata about an OCR processing run.
///
/// Records the OCR configuration that was used and information about the results.
public struct OcrMetadata: Codable, Sendable, Hashable {
    /// OCR language code(s) used
    public let language: String
    /// Tesseract Page Segmentation Mode (PSM)
    public let psm: Int32
    /// Output format (e.g., "text", "hocr")
    public let outputFormat: String
    /// Number of tables detected
    public let tableCount: UInt32
    /// Value of the `table_rows` key; `nil` when absent.
    public let tableRows: UInt32?
    /// Value of the `table_cols` key; `nil` when absent.
    public let tableCols: UInt32?

    /// Memberwise initializer; `tableRows` and `tableCols` default to `nil`.
    public init(
        language: String,
        psm: Int32,
        outputFormat: String,
        tableCount: UInt32,
        tableRows: UInt32? = nil,
        tableCols: UInt32? = nil
    ) {
        self.language = language
        self.psm = psm
        self.outputFormat = outputFormat
        self.tableCount = tableCount
        self.tableRows = tableRows
        self.tableCols = tableCols
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case language
        case psm
        case outputFormat = "output_format"
        case tableCount = "table_count"
        case tableRows = "table_rows"
        case tableCols = "table_cols"
    }

    /// Decodes the value; absent or `null` keys fall back to an empty string,
    /// zero, or `nil` depending on the field type.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        language = try values.decodeIfPresent(String.self, forKey: .language) ?? ""
        psm = try values.decodeIfPresent(Int32.self, forKey: .psm) ?? 0
        outputFormat = try values.decodeIfPresent(String.self, forKey: .outputFormat) ?? ""
        tableCount = try values.decodeIfPresent(UInt32.self, forKey: .tableCount) ?? 0
        tableRows = try values.decodeIfPresent(UInt32.self, forKey: .tableRows)
        tableCols = try values.decodeIfPresent(UInt32.self, forKey: .tableCols)
    }
}
// MARK: - Internal FFI conversions for OcrMetadata
internal extension OcrMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.OcrMetadataRef) throws {
        self.language = rb.language().toString()
        self.psm = rb.psm()
        self.outputFormat = rb.outputFormat().toString()
        self.tableCount = rb.tableCount()
        self.tableRows = rb.tableRows()
        self.tableCols = rb.tableCols()
    }

    /// Builds the bridge value directly through its constructor.
    func intoRust() throws -> RustBridge.OcrMetadata {
        return RustBridge.OcrMetadata(
            RustString(self.language),
            self.psm,
            RustString(self.outputFormat),
            self.tableCount,
            self.tableRows,
            self.tableCols
        )
    }
}
/// Metadata describing an error (for batch operations).
public struct ErrorMetadata: Codable, Sendable, Hashable {
    /// Value of the `error_type` key.
    public let errorType: String
    /// Value of the `message` key.
    public let message: String

    /// Memberwise initializer.
    public init(
        errorType: String,
        message: String
    ) {
        self.errorType = errorType
        self.message = message
    }

    /// Only `errorType` needs an explicit key; `message` uses its own name.
    private enum CodingKeys: String, CodingKey {
        case errorType = "error_type"
        case message
    }
}
// MARK: - Internal FFI conversions for ErrorMetadata
internal extension ErrorMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.ErrorMetadataRef) throws {
        self.errorType = rb.errorType().toString()
        self.message = rb.message().toString()
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.errorMetadataFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.ErrorMetadata {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.errorMetadataFromJson(payload)
    }
}
/// Metadata for PowerPoint presentations.
///
/// Extracted from PPTX files; holds slide counts and presentation details.
public struct PptxMetadata: Codable, Sendable, Hashable {
    /// Total number of slides in the presentation
    public let slideCount: UInt32
    /// Names of slides (if available)
    public let slideNames: [String]
    /// Number of embedded images
    public let imageCount: UInt32?
    /// Number of tables
    public let tableCount: UInt32?

    /// Memberwise initializer; `imageCount` and `tableCount` default to `nil`.
    public init(
        slideCount: UInt32,
        slideNames: [String],
        imageCount: UInt32? = nil,
        tableCount: UInt32? = nil
    ) {
        self.slideCount = slideCount
        self.slideNames = slideNames
        self.imageCount = imageCount
        self.tableCount = tableCount
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case slideCount = "slide_count"
        case slideNames = "slide_names"
        case imageCount = "image_count"
        case tableCount = "table_count"
    }

    /// Decodes the value; absent or `null` keys fall back to `0`, `[]`, or `nil`
    /// depending on the field type.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        slideCount = try values.decodeIfPresent(UInt32.self, forKey: .slideCount) ?? 0
        slideNames = try values.decodeIfPresent([String].self, forKey: .slideNames) ?? []
        imageCount = try values.decodeIfPresent(UInt32.self, forKey: .imageCount)
        tableCount = try values.decodeIfPresent(UInt32.self, forKey: .tableCount)
    }
}
// MARK: - Internal FFI conversions for PptxMetadata
internal extension PptxMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.PptxMetadataRef) throws {
        self.slideCount = rb.slideCount()
        self.slideNames = rb.slideNames().map { name in name.as_str().toString() }
        self.imageCount = rb.imageCount()
        self.tableCount = rb.tableCount()
    }

    /// Builds the bridge value through its constructor, copying `slideNames`
    /// into a fresh `RustVec<RustString>` first.
    func intoRust() throws -> RustBridge.PptxMetadata {
        let nameVec = RustVec<RustString>()
        self.slideNames.forEach { nameVec.push(value: RustString($0)) }
        return RustBridge.PptxMetadata(self.slideCount, nameVec, self.imageCount, self.tableCount)
    }
}
/// Word document metadata.
///
/// Extracted from DOCX files using shared Office Open XML metadata extraction.
/// Integrates with `office_metadata` module for core/app/custom properties.
///
/// Declared as a direct alias of `RustBridge.DocxMetadata`.
public typealias DocxMetadata = RustBridge.DocxMetadata
/// Metadata for CSV/TSV files.
public struct CsvMetadata: Codable, Sendable, Hashable {
    /// Value of the `row_count` key.
    public let rowCount: UInt32
    /// Value of the `column_count` key.
    public let columnCount: UInt32
    /// Value of the `delimiter` key; `nil` when absent.
    public let delimiter: String?
    /// Value of the `has_header` key.
    public let hasHeader: Bool
    /// Value of the `column_types` key; `nil` when absent.
    public let columnTypes: [String]?

    /// Memberwise initializer; `delimiter` and `columnTypes` default to `nil`.
    public init(
        rowCount: UInt32,
        columnCount: UInt32,
        delimiter: String? = nil,
        hasHeader: Bool,
        columnTypes: [String]? = nil
    ) {
        self.rowCount = rowCount
        self.columnCount = columnCount
        self.delimiter = delimiter
        self.hasHeader = hasHeader
        self.columnTypes = columnTypes
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case rowCount = "row_count"
        case columnCount = "column_count"
        case delimiter
        case hasHeader = "has_header"
        case columnTypes = "column_types"
    }

    /// Decodes the value; absent or `null` keys fall back to `0`, `false`, or
    /// `nil` depending on the field type.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        rowCount = try values.decodeIfPresent(UInt32.self, forKey: .rowCount) ?? 0
        columnCount = try values.decodeIfPresent(UInt32.self, forKey: .columnCount) ?? 0
        delimiter = try values.decodeIfPresent(String.self, forKey: .delimiter)
        hasHeader = try values.decodeIfPresent(Bool.self, forKey: .hasHeader) ?? false
        columnTypes = try values.decodeIfPresent([String].self, forKey: .columnTypes)
    }
}
// MARK: - Internal FFI conversions for CsvMetadata
internal extension CsvMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.CsvMetadataRef) throws {
        self.rowCount = rb.rowCount()
        self.columnCount = rb.columnCount()
        self.delimiter = rb.delimiter()?.toString()
        self.hasHeader = rb.hasHeader()
        self.columnTypes = rb.columnTypes()?.map { kind in kind.as_str().toString() }
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.csvMetadataFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.CsvMetadata {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.csvMetadataFromJson(payload)
    }
}
/// BibTeX bibliography metadata.
///
/// Declared as a direct alias of `RustBridge.BibtexMetadata`.
public typealias BibtexMetadata = RustBridge.BibtexMetadata
/// Metadata for citation files (RIS, PubMed, EndNote).
public struct CitationMetadata: Codable, Sendable, Hashable {
    /// Value of the `citation_count` key.
    public let citationCount: UInt
    /// Value of the `format` key; `nil` when absent.
    public let format: String?
    /// Value of the `authors` key.
    public let authors: [String]
    /// Value of the `year_range` key; `nil` when absent.
    public let yearRange: YearRange?
    /// Value of the `dois` key.
    public let dois: [String]
    /// Value of the `keywords` key.
    public let keywords: [String]

    /// Memberwise initializer; `format` and `yearRange` default to `nil`.
    public init(
        citationCount: UInt,
        format: String? = nil,
        authors: [String],
        yearRange: YearRange? = nil,
        dois: [String],
        keywords: [String]
    ) {
        self.citationCount = citationCount
        self.format = format
        self.authors = authors
        self.yearRange = yearRange
        self.dois = dois
        self.keywords = keywords
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case citationCount = "citation_count"
        case format
        case authors
        case yearRange = "year_range"
        case dois
        case keywords
    }

    /// Decodes the value; absent or `null` keys fall back to `0`, `[]`, or `nil`
    /// depending on the field type.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        citationCount = try values.decodeIfPresent(UInt.self, forKey: .citationCount) ?? 0
        format = try values.decodeIfPresent(String.self, forKey: .format)
        authors = try values.decodeIfPresent([String].self, forKey: .authors) ?? []
        yearRange = try values.decodeIfPresent(YearRange.self, forKey: .yearRange)
        dois = try values.decodeIfPresent([String].self, forKey: .dois) ?? []
        keywords = try values.decodeIfPresent([String].self, forKey: .keywords) ?? []
    }
}
// MARK: - Internal FFI conversions for CitationMetadata
internal extension CitationMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value,
    /// converting the nested year range through `YearRange.init(_:)` when present.
    init(_ rb: RustBridge.CitationMetadataRef) throws {
        self.citationCount = rb.citationCount()
        self.format = rb.format()?.toString()
        self.authors = rb.authors().map { author in author.as_str().toString() }
        if let bridgedRange = rb.yearRange() {
            self.yearRange = try YearRange(bridgedRange)
        } else {
            self.yearRange = nil
        }
        self.dois = rb.dois().map { doi in doi.as_str().toString() }
        self.keywords = rb.keywords().map { keyword in keyword.as_str().toString() }
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.citationMetadataFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.CitationMetadata {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.citationMetadataFromJson(payload)
    }
}
/// A range of years used in bibliographic metadata.
///
/// Coding uses the synthesized conformance, so the JSON keys are the property
/// names themselves.
public struct YearRange: Codable, Sendable, Hashable {
    /// Value of the `min` key; `nil` when absent.
    public let min: UInt32?
    /// Value of the `max` key; `nil` when absent.
    public let max: UInt32?
    /// Value of the `years` key.
    public let years: [UInt32]

    /// Memberwise initializer; `min` and `max` default to `nil`.
    public init(
        min: UInt32? = nil,
        max: UInt32? = nil,
        years: [UInt32]
    ) {
        self.min = min
        self.max = max
        self.years = years
    }
}
// MARK: - Internal FFI conversions for YearRange
internal extension YearRange {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.YearRangeRef) throws {
        self.min = rb.min()
        self.max = rb.max()
        self.years = Array(rb.years())
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.yearRangeFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.YearRange {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.yearRangeFromJson(payload)
    }
}
/// Metadata for FictionBook (FB2) files.
public struct FictionBookMetadata: Codable, Sendable, Hashable {
    /// Value of the `genres` key.
    public let genres: [String]
    /// Value of the `sequences` key.
    public let sequences: [String]
    /// Value of the `annotation` key; `nil` when absent.
    public let annotation: String?

    /// Memberwise initializer; `annotation` defaults to `nil`.
    public init(
        genres: [String],
        sequences: [String],
        annotation: String? = nil
    ) {
        self.genres = genres
        self.sequences = sequences
        self.annotation = annotation
    }

    /// Every key matches its property name.
    private enum CodingKeys: String, CodingKey {
        case genres
        case sequences
        case annotation
    }

    /// Decodes the value; absent or `null` lists become `[]` and an absent or
    /// `null` annotation becomes `nil`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        genres = try values.decodeIfPresent([String].self, forKey: .genres) ?? []
        sequences = try values.decodeIfPresent([String].self, forKey: .sequences) ?? []
        annotation = try values.decodeIfPresent(String.self, forKey: .annotation)
    }
}
// MARK: - Internal FFI conversions for FictionBookMetadata
internal extension FictionBookMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.FictionBookMetadataRef) throws {
        self.genres = rb.genres().map { genre in genre.as_str().toString() }
        self.sequences = rb.sequences().map { sequence in sequence.as_str().toString() }
        self.annotation = rb.annotation()?.toString()
    }

    /// Builds the bridge value through its constructor, copying both string
    /// lists into fresh `RustVec<RustString>` values first.
    func intoRust() throws -> RustBridge.FictionBookMetadata {
        let genreVec = RustVec<RustString>()
        self.genres.forEach { genreVec.push(value: RustString($0)) }

        let sequenceVec = RustVec<RustString>()
        self.sequences.forEach { sequenceVec.push(value: RustString($0)) }

        return RustBridge.FictionBookMetadata(
            genreVec,
            sequenceVec,
            self.annotation.map(RustString.init)
        )
    }
}
/// Metadata for dBASE (DBF) files.
public struct DbfMetadata: Codable, Sendable, Hashable {
    /// Value of the `record_count` key.
    public let recordCount: UInt
    /// Value of the `field_count` key.
    public let fieldCount: UInt
    /// Value of the `fields` key.
    public let fields: [DbfFieldInfo]

    /// Memberwise initializer.
    public init(
        recordCount: UInt,
        fieldCount: UInt,
        fields: [DbfFieldInfo]
    ) {
        self.recordCount = recordCount
        self.fieldCount = fieldCount
        self.fields = fields
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case recordCount = "record_count"
        case fieldCount = "field_count"
        case fields
    }

    /// Decodes the value; absent or `null` keys fall back to `0` and `[]`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        recordCount = try values.decodeIfPresent(UInt.self, forKey: .recordCount) ?? 0
        fieldCount = try values.decodeIfPresent(UInt.self, forKey: .fieldCount) ?? 0
        fields = try values.decodeIfPresent([DbfFieldInfo].self, forKey: .fields) ?? []
    }
}
// MARK: - Internal FFI conversions for DbfMetadata
internal extension DbfMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value,
    /// converting each field entry through `DbfFieldInfo.init(_:)`.
    init(_ rb: RustBridge.DbfMetadataRef) throws {
        self.recordCount = rb.recordCount()
        self.fieldCount = rb.fieldCount()
        self.fields = try rb.fields().map { entry in try DbfFieldInfo(entry) }
    }

    /// Builds the bridge value through its constructor, converting every field
    /// with `DbfFieldInfo.intoRust()` and collecting the results in a `RustVec`.
    func intoRust() throws -> RustBridge.DbfMetadata {
        let fieldVec = RustVec<RustBridge.DbfFieldInfo>()
        for field in self.fields {
            let bridgedField = try field.intoRust()
            fieldVec.push(value: bridgedField)
        }
        return RustBridge.DbfMetadata(self.recordCount, self.fieldCount, fieldVec)
    }
}
/// Information about a single dBASE field.
public struct DbfFieldInfo: Codable, Sendable, Hashable {
    /// Value of the `name` key.
    public let name: String
    /// Value of the `field_type` key.
    public let fieldType: String

    /// Memberwise initializer.
    public init(
        name: String,
        fieldType: String
    ) {
        self.name = name
        self.fieldType = fieldType
    }

    /// Only `fieldType` needs an explicit key; `name` uses its own name.
    private enum CodingKeys: String, CodingKey {
        case name
        case fieldType = "field_type"
    }
}
// MARK: - Internal FFI conversions for DbfFieldInfo
internal extension DbfFieldInfo {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.DbfFieldInfoRef) throws {
        self.name = rb.name().toString()
        self.fieldType = rb.fieldType().toString()
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.dbfFieldInfoFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.DbfFieldInfo {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.dbfFieldInfoFromJson(payload)
    }
}
/// JATS (Journal Article Tag Suite) metadata.
///
/// Declared as a direct alias of `RustBridge.JatsMetadata`.
public typealias JatsMetadata = RustBridge.JatsMetadata
/// A JATS contributor together with an optional role.
///
/// Coding uses the synthesized conformance, so the JSON keys are the property
/// names themselves.
public struct ContributorRole: Codable, Sendable, Hashable {
    /// Value of the `name` key.
    public let name: String
    /// Value of the `role` key; `nil` when absent.
    public let role: String?

    /// Memberwise initializer; `role` defaults to `nil`.
    public init(
        name: String,
        role: String? = nil
    ) {
        self.name = name
        self.role = role
    }
}
// MARK: - Internal FFI conversions for ContributorRole
internal extension ContributorRole {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.ContributorRoleRef) throws {
        self.name = rb.name().toString()
        self.role = rb.role()?.toString()
    }

    /// Converts to the bridge type by encoding `self` as JSON and passing the
    /// string to `RustBridge.contributorRoleFromJson`.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `{}` is sent instead.
    func intoRust() throws -> RustBridge.ContributorRole {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.contributorRoleFromJson(payload)
    }
}
/// Metadata for EPUB files (Dublin Core extensions).
public struct EpubMetadata: Codable, Sendable, Hashable {
    /// Value of the `coverage` key; `nil` when absent.
    public let coverage: String?
    /// Value of the `dc_format` key; `nil` when absent.
    public let dcFormat: String?
    /// Value of the `relation` key; `nil` when absent.
    public let relation: String?
    /// Value of the `source` key; `nil` when absent.
    public let source: String?
    /// Value of the `dc_type` key; `nil` when absent.
    public let dcType: String?
    /// Value of the `cover_image` key; `nil` when absent.
    public let coverImage: String?

    /// Memberwise initializer; every field defaults to `nil`.
    public init(
        coverage: String? = nil,
        dcFormat: String? = nil,
        relation: String? = nil,
        source: String? = nil,
        dcType: String? = nil,
        coverImage: String? = nil
    ) {
        self.coverage = coverage
        self.dcFormat = dcFormat
        self.relation = relation
        self.source = source
        self.dcType = dcType
        self.coverImage = coverImage
    }

    /// Maps the Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case coverage
        case dcFormat = "dc_format"
        case relation
        case source
        case dcType = "dc_type"
        case coverImage = "cover_image"
    }

    /// Decodes the value; keys that are absent or `null` become `nil`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        coverage = try values.decodeIfPresent(String.self, forKey: .coverage)
        dcFormat = try values.decodeIfPresent(String.self, forKey: .dcFormat)
        relation = try values.decodeIfPresent(String.self, forKey: .relation)
        source = try values.decodeIfPresent(String.self, forKey: .source)
        dcType = try values.decodeIfPresent(String.self, forKey: .dcType)
        coverImage = try values.decodeIfPresent(String.self, forKey: .coverImage)
    }
}
// MARK: - Internal FFI conversions for EpubMetadata
internal extension EpubMetadata {
    /// Copies the fields exposed by a bridge reference into a Swift value.
    init(_ rb: RustBridge.EpubMetadataRef) throws {
        self.coverage = rb.coverage()?.toString()
        self.dcFormat = rb.dcFormat()?.toString()
        self.relation = rb.relation()?.toString()
        self.source = rb.source()?.toString()
        self.dcType = rb.dcType()?.toString()
        self.coverImage = rb.coverImage()?.toString()
    }

    /// Builds the bridge value directly through its constructor, wrapping each
    /// present string in a `RustString`.
    func intoRust() throws -> RustBridge.EpubMetadata {
        return RustBridge.EpubMetadata(
            self.coverage.map(RustString.init),
            self.dcFormat.map(RustString.init),
            self.relation.map(RustString.init),
            self.source.map(RustString.init),
            self.dcType.map(RustString.init),
            self.coverImage.map(RustString.init)
        )
    }
}
/// Metadata for Outlook PST archives.
public struct PstMetadata: Codable, Sendable, Hashable {
    /// Value of the `message_count` key.
    public let messageCount: UInt

    /// Memberwise initializer.
    public init(messageCount: UInt) {
        self.messageCount = messageCount
    }

    /// Maps the Swift property name to its snake_case JSON key.
    private enum CodingKeys: String, CodingKey {
        case messageCount = "message_count"
    }

    /// Decodes the value; an absent or `null` `message_count` becomes `0`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        messageCount = try values.decodeIfPresent(UInt.self, forKey: .messageCount) ?? 0
    }
}
// MARK: - Internal FFI conversions for PstMetadata
internal extension PstMetadata {
    /// Builds a `PstMetadata` from the message count exposed by a bridged reference.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    init(_ ref: RustBridge.PstMetadataRef) throws {
        self.messageCount = ref.messageCount()
    }

    /// Converts this value into its bridged representation.
    func intoRust() throws -> RustBridge.PstMetadata {
        RustBridge.PstMetadata(messageCount)
    }
}
/// Confidence scores for an OCR element.
///
/// Keeps detection confidence (is there text at this location?) apart from
/// recognition confidence (is the recognised text correct?).
public struct OcrConfidence: Codable, Sendable, Hashable {
    /// Detection confidence: how sure the OCR engine is that text exists here.
    ///
    /// PaddleOCR reports this as `box_score`; Tesseract has no direct equivalent.
    /// Range: 0.0 to 1.0, or `nil` when not available.
    public let detection: Double?

    /// Recognition confidence: how sure the engine is about the text content.
    ///
    /// Range: 0.0 to 1.0.
    public let recognition: Double

    /// Creates a confidence pair; `detection` may be omitted.
    public init(detection: Double? = nil, recognition: Double) {
        self.detection = detection
        self.recognition = recognition
    }

    /// JSON keys for the two scores.
    private enum CodingKeys: String, CodingKey {
        case detection = "detection"
        case recognition = "recognition"
    }

    /// Decodes the scores; a missing or null `recognition` becomes `0`, a missing `detection` stays `nil`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        self.detection = try values.decodeIfPresent(Double.self, forKey: .detection)
        self.recognition = try values.decodeIfPresent(Double.self, forKey: .recognition) ?? 0
    }
}
// MARK: - Internal FFI conversions for OcrConfidence
internal extension OcrConfidence {
    /// Builds an `OcrConfidence` from the two scores exposed by a bridged reference.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    init(_ ref: RustBridge.OcrConfidenceRef) throws {
        self.detection = ref.detection()
        self.recognition = ref.recognition()
    }

    /// Converts this value into its bridged representation.
    func intoRust() throws -> RustBridge.OcrConfidence {
        RustBridge.OcrConfidence(detection, recognition)
    }
}
/// Rotation information for an OCR element.
public struct OcrRotation: Codable, Sendable, Hashable {
    /// Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR).
    public let angleDegrees: Double

    /// Confidence score for the rotation detection, when one is available.
    public let confidence: Double?

    /// Creates a rotation record; `confidence` may be omitted.
    public init(angleDegrees: Double, confidence: Double? = nil) {
        self.angleDegrees = angleDegrees
        self.confidence = confidence
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case angleDegrees = "angle_degrees"
        case confidence = "confidence"
    }
}
// MARK: - Internal FFI conversions for OcrRotation
internal extension OcrRotation {
    /// Builds an `OcrRotation` from the angle and confidence exposed by a bridged reference.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    init(_ ref: RustBridge.OcrRotationRef) throws {
        self.angleDegrees = ref.angleDegrees()
        self.confidence = ref.confidence()
    }

    /// Converts this value into its bridged representation.
    func intoRust() throws -> RustBridge.OcrRotation {
        RustBridge.OcrRotation(angleDegrees, confidence)
    }
}
/// A unified OCR element representing detected text with full metadata.
///
/// This is the primary type for structured OCR output, preserving all information
/// from both Tesseract and PaddleOCR backends.
///
/// Unlike the neighbouring value types, this is a direct alias for the bridged
/// `RustBridge.OcrElement` type rather than a separate Swift struct.
public typealias OcrElement = RustBridge.OcrElement
/// Configuration for OCR element extraction.
///
/// Controls how OCR elements are extracted and filtered.
public struct OcrElementConfig: Codable, Sendable, Hashable {
    /// Whether to include OCR elements in the extraction result.
    ///
    /// When true, the `ocr_elements` field in `ExtractionResult` will be populated.
    public let includeElements: Bool

    /// Minimum hierarchical level to include.
    ///
    /// Elements below this level (e.g., words when min_level is Line) will be excluded.
    public let minLevel: OcrElementLevel

    /// Minimum recognition confidence threshold (0.0-1.0).
    ///
    /// Elements with confidence below this threshold will be filtered out.
    public let minConfidence: Double

    /// Whether to build hierarchical relationships between elements.
    ///
    /// When true, `parent_id` fields will be populated based on spatial containment.
    /// Only meaningful for Tesseract output.
    public let buildHierarchy: Bool

    /// Creates a configuration with every option given explicitly.
    public init(includeElements: Bool, minLevel: OcrElementLevel, minConfidence: Double, buildHierarchy: Bool) {
        self.includeElements = includeElements
        self.minLevel = minLevel
        self.minConfidence = minConfidence
        self.buildHierarchy = buildHierarchy
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case includeElements = "include_elements"
        case minLevel = "min_level"
        case minConfidence = "min_confidence"
        case buildHierarchy = "build_hierarchy"
    }

    /// Decodes the configuration.
    ///
    /// `min_level` is required. Missing or null values for the other keys fall back to
    /// `false` (both flags) and `0` (`min_confidence`).
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        self.includeElements = try values.decodeIfPresent(Bool.self, forKey: .includeElements) ?? false
        self.minLevel = try values.decode(OcrElementLevel.self, forKey: .minLevel)
        self.minConfidence = try values.decodeIfPresent(Double.self, forKey: .minConfidence) ?? 0
        self.buildHierarchy = try values.decodeIfPresent(Bool.self, forKey: .buildHierarchy) ?? false
    }
}
// MARK: - Internal FFI conversions for OcrElementConfig
internal extension OcrElementConfig {
    /// Builds an `OcrElementConfig` from a bridged reference.
    ///
    /// - Parameter rb: Bridged Rust-side value to read from.
    /// - Throws: `DecodingError.dataCorrupted` when the bridged level name is not a known
    ///   `OcrElementLevel` raw value. This used to terminate the process via `fatalError`.
    init(_ rb: RustBridge.OcrElementConfigRef) throws {
        self.includeElements = rb.includeElements()
        // Read the bridged name once so the lookup and the error message use the same value.
        let levelName = rb.minLevel().toString()
        guard let level = OcrElementLevel(rawValue: levelName) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(codingPath: [], debugDescription: "Unknown OcrElementLevel: \(levelName)")
            )
        }
        self.minLevel = level
        self.minConfidence = rb.minConfidence()
        self.buildHierarchy = rb.buildHierarchy()
    }

    /// Converts this value into its bridged representation.
    ///
    /// - Throws: Whatever `minLevel.intoRust()` throws.
    func intoRust() throws -> RustBridge.OcrElementConfig {
        return RustBridge.OcrElementConfig(self.includeElements, try self.minLevel.intoRust(), self.minConfidence, self.buildHierarchy)
    }
}
/// Unified page structure for documents.
///
/// Covers different page types (PDF pages, PPTX slides, Excel sheets)
/// and carries character offset boundaries for chunk-to-page mapping.
public struct PageStructure: Codable, Sendable, Hashable {
    /// Total number of pages/slides/sheets.
    public let totalCount: UInt32

    /// Type of paginated unit.
    public let unitType: PageUnitType

    /// Character offset boundaries for each page.
    ///
    /// Maps character ranges in the extracted content to page numbers.
    /// Used for chunk page range calculation.
    public let boundaries: [PageBoundary]?

    /// Detailed per-page metadata (optional, only when needed).
    public let pages: [PageInfo]?

    /// Creates a page structure; `boundaries` and `pages` may be omitted.
    public init(
        totalCount: UInt32,
        unitType: PageUnitType,
        boundaries: [PageBoundary]? = nil,
        pages: [PageInfo]? = nil
    ) {
        self.totalCount = totalCount
        self.unitType = unitType
        self.boundaries = boundaries
        self.pages = pages
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case totalCount = "total_count"
        case unitType = "unit_type"
        case boundaries = "boundaries"
        case pages = "pages"
    }
}
// MARK: - Internal FFI conversions for PageStructure
internal extension PageStructure {
    /// Builds a `PageStructure` from a bridged reference.
    ///
    /// - Parameter rb: Bridged Rust-side value to read from.
    /// - Throws: `DecodingError.dataCorrupted` when the bridged unit type is not a known
    ///   `PageUnitType` raw value (this used to terminate the process via `fatalError`),
    ///   plus anything thrown while converting the boundaries or pages.
    init(_ rb: RustBridge.PageStructureRef) throws {
        self.totalCount = rb.totalCount()
        // Read the bridged name once so the lookup and the error message use the same value.
        let unitName = rb.unitType().toString()
        guard let unit = PageUnitType(rawValue: unitName) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(codingPath: [], debugDescription: "Unknown PageUnitType: \(unitName)")
            )
        }
        self.unitType = unit
        self.boundaries = try rb.boundaries()?.map { try PageBoundary($0) }
        self.pages = try rb.pages()?.map { try PageInfo($0) }
    }

    /// Converts this value into its bridged representation by round-tripping through JSON.
    ///
    /// The value is encoded with `JSONEncoder` and the text is handed to
    /// `RustBridge.pageStructureFromJson`. If the encoded bytes cannot be read as UTF-8,
    /// the literal `"{}"` is sent instead.
    func intoRust() throws -> RustBridge.PageStructure {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.pageStructureFromJson(json)
    }
}
/// Byte offset boundary for a page.
///
/// Records where one page's content starts and ends in the main content string,
/// so byte positions can be mapped to page numbers. Offsets are guaranteed to be
/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
public struct PageBoundary: Codable, Sendable, Hashable {
    /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive).
    public let byteStart: UInt

    /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive).
    public let byteEnd: UInt

    /// Page number (1-indexed).
    public let pageNumber: UInt32

    /// Creates a boundary for the given byte range and page number.
    public init(byteStart: UInt, byteEnd: UInt, pageNumber: UInt32) {
        self.byteStart = byteStart
        self.byteEnd = byteEnd
        self.pageNumber = pageNumber
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case byteStart = "byte_start"
        case byteEnd = "byte_end"
        case pageNumber = "page_number"
    }
}
// MARK: - Internal FFI conversions for PageBoundary
internal extension PageBoundary {
    /// Builds a `PageBoundary` from the offsets and page number exposed by a bridged reference.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    init(_ ref: RustBridge.PageBoundaryRef) throws {
        self.byteStart = ref.byteStart()
        self.byteEnd = ref.byteEnd()
        self.pageNumber = ref.pageNumber()
    }

    /// Converts this value into its bridged representation.
    func intoRust() throws -> RustBridge.PageBoundary {
        RustBridge.PageBoundary(byteStart, byteEnd, pageNumber)
    }
}
/// Metadata for an individual page/slide/sheet.
///
/// Captures per-page information including dimensions, content counts,
/// and visibility state (for presentations).
public struct PageInfo: Codable, Sendable, Hashable {
    /// Page number (1-indexed).
    public let number: UInt32

    /// Page title (usually for presentations).
    public let title: String?

    /// Dimensions in points (PDF) or pixels (images): (width, height).
    public let dimensions: [Double]?

    /// Number of images on this page.
    public let imageCount: UInt32?

    /// Number of tables on this page.
    public let tableCount: UInt32?

    /// Whether this page is hidden (e.g., in presentations).
    public let hidden: Bool?

    /// Whether this page is blank (no meaningful text, no images, no tables).
    ///
    /// A page is considered blank if it has fewer than 3 non-whitespace characters
    /// and contains no tables or images. Useful for filtering out empty pages
    /// in scanned documents or PDFs with blank separator pages.
    public let isBlank: Bool?

    /// Whether this page contains non-trivial vector graphics (paths, shapes, curves).
    ///
    /// Indicates the presence of vector-drawn content such as charts, diagrams,
    /// or geometric shapes (e.g., from Adobe InDesign, LaTeX TikZ). These are
    /// invisible to `ExtractionResult.images` since they are not embedded as raster
    /// XObjects. Set to `true` when path count exceeds a heuristic threshold,
    /// signaling that downstream consumers may want to rasterize the page to
    /// capture this content.
    ///
    /// Only populated for PDFs.
    /// NOTE(review): the upstream description says `None` for other document types, but
    /// this binding exposes a non-optional `Bool` - confirm what non-PDF pages carry.
    public let hasVectorGraphics: Bool

    /// Creates page metadata; only `number` and `hasVectorGraphics` are required.
    public init(
        number: UInt32,
        title: String? = nil,
        dimensions: [Double]? = nil,
        imageCount: UInt32? = nil,
        tableCount: UInt32? = nil,
        hidden: Bool? = nil,
        isBlank: Bool? = nil,
        hasVectorGraphics: Bool
    ) {
        self.number = number
        self.title = title
        self.dimensions = dimensions
        self.imageCount = imageCount
        self.tableCount = tableCount
        self.hidden = hidden
        self.isBlank = isBlank
        self.hasVectorGraphics = hasVectorGraphics
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case number = "number"
        case title = "title"
        case dimensions = "dimensions"
        case imageCount = "image_count"
        case tableCount = "table_count"
        case hidden = "hidden"
        case isBlank = "is_blank"
        case hasVectorGraphics = "has_vector_graphics"
    }
}
// MARK: - Internal FFI conversions for PageInfo
internal extension PageInfo {
    /// Builds a `PageInfo` by copying every field out of a bridged reference.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    init(_ ref: RustBridge.PageInfoRef) throws {
        self.number = ref.number()
        self.title = ref.title()?.toString()
        // Copy the bridged collection, when present, into a native Swift array.
        self.dimensions = ref.dimensions().map { Array($0) }
        self.imageCount = ref.imageCount()
        self.tableCount = ref.tableCount()
        self.hidden = ref.hidden()
        self.isBlank = ref.isBlank()
        self.hasVectorGraphics = ref.hasVectorGraphics()
    }

    /// Converts this value into its bridged representation by round-tripping through JSON.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `"{}"` is sent instead.
    func intoRust() throws -> RustBridge.PageInfo {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.pageInfoFromJson(payload)
    }
}
/// Content for a single page/slide.
///
/// When page extraction is enabled, documents are split into per-page content
/// with associated tables and images mapped to each page.
///
/// # Performance
///
/// The notes below use Rust type syntax carried over from the upstream documentation:
/// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
/// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
/// - Exact JSON compatibility is maintained via custom Serialize/Deserialize
///
/// This reduces memory overhead for documents with shared tables/images
/// by avoiding redundant copies during serialization.
public struct PageContent: Codable, Sendable, Hashable {
    /// Page number (1-indexed).
    public let pageNumber: UInt32

    /// Text content for this page.
    public let content: String

    /// Tables found on this page (uses Arc for memory efficiency).
    ///
    /// Serializes as Vec<Table> for JSON compatibility while maintaining
    /// Arc semantics in-memory for zero-copy sharing.
    public let tables: [Table]

    /// Indices into `ExtractionResult.images` for images found on this page.
    ///
    /// Each value is a zero-based index into the top-level `images` collection.
    /// Only populated when `extract_images = true` in the extraction config.
    public let imageIndices: [UInt32]

    /// Hierarchy information for the page (when hierarchy extraction is enabled).
    ///
    /// Contains text hierarchy levels (H1-H6) extracted from the page content.
    public let hierarchy: PageHierarchy?

    /// Whether this page is blank (no meaningful text content).
    ///
    /// Determined during extraction based on text content analysis.
    /// A page is blank if it has fewer than 3 non-whitespace characters
    /// and contains no tables or images.
    public let isBlank: Bool?

    /// Layout detection regions for this page (when layout detection is enabled).
    ///
    /// Contains detected layout regions with class, confidence, bounding box,
    /// and area fraction. Only populated when layout detection is configured.
    public let layoutRegions: [LayoutRegion]?

    /// Speaker notes for this slide (PPTX only).
    ///
    /// Contains the text from the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
    /// Only populated when the source is a PPTX file and notes are present.
    public let speakerNotes: String?

    /// Section name this slide belongs to (PPTX only).
    ///
    /// PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
    /// `ppt/presentation.xml`). Only populated when the source is a PPTX file and
    /// the slide belongs to a named section.
    public let sectionName: String?

    /// Sheet name for this page (XLSX/ODS only).
    ///
    /// Each spreadsheet sheet maps to one `PageContent` entry. This field carries the
    /// sheet's display name as it appears in the workbook. `nil` for all non-spreadsheet
    /// formats and for sheets with an empty name.
    public let sheetName: String?

    /// Creates page content; everything after `imageIndices` is optional and defaults to `nil`.
    public init(
        pageNumber: UInt32,
        content: String,
        tables: [Table],
        imageIndices: [UInt32],
        hierarchy: PageHierarchy? = nil,
        isBlank: Bool? = nil,
        layoutRegions: [LayoutRegion]? = nil,
        speakerNotes: String? = nil,
        sectionName: String? = nil,
        sheetName: String? = nil
    ) {
        self.pageNumber = pageNumber
        self.content = content
        self.tables = tables
        self.imageIndices = imageIndices
        self.hierarchy = hierarchy
        self.isBlank = isBlank
        self.layoutRegions = layoutRegions
        self.speakerNotes = speakerNotes
        self.sectionName = sectionName
        self.sheetName = sheetName
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case pageNumber = "page_number"
        case content = "content"
        case tables = "tables"
        case imageIndices = "image_indices"
        case hierarchy = "hierarchy"
        case isBlank = "is_blank"
        case layoutRegions = "layout_regions"
        case speakerNotes = "speaker_notes"
        case sectionName = "section_name"
        case sheetName = "sheet_name"
    }
}
// MARK: - Internal FFI conversions for PageContent
internal extension PageContent {
    /// Builds a `PageContent` by copying every field out of a bridged reference.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    /// - Throws: Anything thrown while converting the tables, hierarchy, or layout regions.
    init(_ ref: RustBridge.PageContentRef) throws {
        self.pageNumber = ref.pageNumber()
        self.content = ref.content().toString()
        self.tables = try ref.tables().map { try Table($0) }
        self.imageIndices = Array(ref.imageIndices())
        self.hierarchy = try ref.hierarchy().map { try PageHierarchy($0) }
        self.isBlank = ref.isBlank()
        self.layoutRegions = try ref.layoutRegions()?.map { try LayoutRegion($0) }
        self.speakerNotes = ref.speakerNotes()?.toString()
        self.sectionName = ref.sectionName()?.toString()
        self.sheetName = ref.sheetName()?.toString()
    }

    /// Converts this value into its bridged representation by round-tripping through JSON.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `"{}"` is sent instead.
    func intoRust() throws -> RustBridge.PageContent {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.pageContentFromJson(payload)
    }
}
/// A detected layout region on a page.
///
/// When layout detection is enabled, each page may have layout regions
/// identifying different content types (text, pictures, tables, etc.)
/// with confidence scores and spatial positions.
public struct LayoutRegion: Codable, Sendable, Hashable {
    /// Layout class name (e.g. "picture", "table", "text", "section_header").
    public let className: String

    /// Confidence score from the layout detection model (0.0 to 1.0).
    public let confidence: Double

    /// Bounding box in document coordinate space.
    public let boundingBox: BoundingBox

    /// Fraction of the page area covered by this region (0.0 to 1.0).
    public let areaFraction: Double

    /// Creates a layout region with every field given explicitly.
    public init(className: String, confidence: Double, boundingBox: BoundingBox, areaFraction: Double) {
        self.className = className
        self.confidence = confidence
        self.boundingBox = boundingBox
        self.areaFraction = areaFraction
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case className = "class_name"
        case confidence = "confidence"
        case boundingBox = "bounding_box"
        case areaFraction = "area_fraction"
    }

    /// Decodes the region.
    ///
    /// `bounding_box` is required. Missing or null values for the other keys fall back to
    /// an empty string (`class_name`) and `0` (`confidence`, `area_fraction`).
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        self.className = try values.decodeIfPresent(String.self, forKey: .className) ?? ""
        self.confidence = try values.decodeIfPresent(Double.self, forKey: .confidence) ?? 0
        self.boundingBox = try values.decode(BoundingBox.self, forKey: .boundingBox)
        self.areaFraction = try values.decodeIfPresent(Double.self, forKey: .areaFraction) ?? 0
    }
}
// MARK: - Internal FFI conversions for LayoutRegion
internal extension LayoutRegion {
    /// Builds a `LayoutRegion` by copying every field out of a bridged reference.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    /// - Throws: Anything thrown while converting the bounding box.
    init(_ ref: RustBridge.LayoutRegionRef) throws {
        self.className = ref.className().toString()
        self.confidence = ref.confidence()
        self.boundingBox = try BoundingBox(ref.boundingBox())
        self.areaFraction = ref.areaFraction()
    }

    /// Converts this value into its bridged representation.
    ///
    /// - Throws: Whatever `boundingBox.intoRust()` throws.
    func intoRust() throws -> RustBridge.LayoutRegion {
        RustBridge.LayoutRegion(
            RustString(className),
            confidence,
            try boundingBox.intoRust(),
            areaFraction
        )
    }
}
/// Page hierarchy structure containing heading levels and block information.
///
/// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
/// blocks with heading levels (H1-H6) for semantic document structure.
public struct PageHierarchy: Codable, Sendable, Hashable {
    /// Number of hierarchy blocks on this page.
    public let blockCount: UInt32

    /// Hierarchical blocks with heading levels.
    public let blocks: [HierarchicalBlock]

    /// Creates a hierarchy from a block count and the blocks themselves.
    public init(blockCount: UInt32, blocks: [HierarchicalBlock]) {
        self.blockCount = blockCount
        self.blocks = blocks
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case blockCount = "block_count"
        case blocks = "blocks"
    }
}
// MARK: - Internal FFI conversions for PageHierarchy
internal extension PageHierarchy {
    /// Builds a `PageHierarchy` from a bridged reference, converting each block in turn.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    /// - Throws: Anything thrown while converting a block.
    init(_ ref: RustBridge.PageHierarchyRef) throws {
        self.blockCount = ref.blockCount()
        self.blocks = try ref.blocks().map { try HierarchicalBlock($0) }
    }

    /// Converts this value into its bridged representation by round-tripping through JSON.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `"{}"` is sent instead.
    func intoRust() throws -> RustBridge.PageHierarchy {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.pageHierarchyFromJson(payload)
    }
}
/// A text block with hierarchy level assignment.
///
/// Represents a block of text with semantic heading information extracted from
/// font size clustering and hierarchical analysis.
public struct HierarchicalBlock: Codable, Sendable, Hashable {
    /// The text content of this block.
    public let text: String

    /// The font size of the text in this block.
    public let fontSize: Float

    /// The hierarchy level of this block (H1-H6 or Body).
    ///
    /// Levels correspond to HTML heading tags:
    /// - "h1": Top-level heading
    /// - "h2": Secondary heading
    /// - "h3": Tertiary heading
    /// - "h4": Quaternary heading
    /// - "h5": Quinary heading
    /// - "h6": Senary heading
    /// - "body": Body text (no heading level)
    public let level: String

    /// Bounding box information for the block.
    ///
    /// Contains coordinates as (left, top, right, bottom) in PDF units.
    public let bbox: [Float]?

    /// Creates a block; `bbox` may be omitted.
    public init(text: String, fontSize: Float, level: String, bbox: [Float]? = nil) {
        self.text = text
        self.fontSize = fontSize
        self.level = level
        self.bbox = bbox
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case text = "text"
        case fontSize = "font_size"
        case level = "level"
        case bbox = "bbox"
    }
}
// MARK: - Internal FFI conversions for HierarchicalBlock
internal extension HierarchicalBlock {
    /// Builds a `HierarchicalBlock` by copying every field out of a bridged reference.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    init(_ ref: RustBridge.HierarchicalBlockRef) throws {
        self.text = ref.text().toString()
        self.fontSize = ref.fontSize()
        self.level = ref.level().toString()
        // Copy the bridged collection, when present, into a native Swift array.
        self.bbox = ref.bbox().map { Array($0) }
    }

    /// Converts this value into its bridged representation by round-tripping through JSON.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `"{}"` is sent instead.
    func intoRust() throws -> RustBridge.HierarchicalBlock {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.hierarchicalBlockFromJson(payload)
    }
}
/// A single changed cell within a table.
///
/// Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
/// reference it unconditionally, without requiring the `diff` Cargo feature.
/// `crate::diff` re-exports this type verbatim.
public struct CellChange: Codable, Sendable, Hashable {
    /// Zero-based row index.
    public let row: UInt

    /// Zero-based column index.
    public let col: UInt

    /// Value before the change.
    public let from: String

    /// Value after the change.
    public let to: String

    /// Creates a change record for the cell at (`row`, `col`).
    public init(row: UInt, col: UInt, from: String, to: String) {
        self.row = row
        self.col = col
        self.from = from
        self.to = to
    }
}
// MARK: - Internal FFI conversions for CellChange
internal extension CellChange {
    /// Builds a `CellChange` by copying every field out of a bridged reference.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    init(_ ref: RustBridge.CellChangeRef) throws {
        self.row = ref.row()
        self.col = ref.col()
        self.from = ref.from().toString()
        self.to = ref.to().toString()
    }

    /// Converts this value into its bridged representation by round-tripping through JSON.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `"{}"` is sent instead.
    func intoRust() throws -> RustBridge.CellChange {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.cellChangeFromJson(payload)
    }
}
/// A single tracked change embedded in a document.
///
/// Populated by per-format extractors that understand change-tracking metadata
/// (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
/// extractor defaults to `ExtractionResult.revisions = None` until a
/// format-specific implementation is added.
public struct DocumentRevision: Codable, Sendable, Hashable {
    /// Format-specific revision identifier.
    ///
    /// For DOCX this is the `w:id` attribute value on the change element
    /// (e.g. `"42"`). When the attribute is absent a synthetic fallback is
    /// generated (`"docx-ins-0"`, `"docx-del-3"`, …).
    public let revisionId: String

    /// Display name of the author who made this change, when available.
    public let author: String?

    /// ISO-8601 timestamp of the change, when available.
    ///
    /// Stored as a plain string so this type remains FFI-friendly and
    /// unconditionally available without the `chrono` optional dep.
    /// DOCX populates this from the `w:date` attribute (e.g.
    /// `"2024-03-15T10:30:00Z"`).
    public let timestamp: String?

    /// Semantic kind of this revision.
    public let kind: RevisionKind

    /// Best-effort document location for this revision.
    ///
    /// Resolution is format-dependent and may be `nil` when the location
    /// cannot be determined (e.g. changes inside table cells before
    /// table-cell anchor support is added).
    public let anchor: RevisionAnchor?

    /// The content changes that make up this revision.
    public let delta: RevisionDelta

    /// Creates a revision; `author`, `timestamp`, and `anchor` may be omitted.
    public init(
        revisionId: String,
        author: String? = nil,
        timestamp: String? = nil,
        kind: RevisionKind,
        anchor: RevisionAnchor? = nil,
        delta: RevisionDelta
    ) {
        self.revisionId = revisionId
        self.author = author
        self.timestamp = timestamp
        self.kind = kind
        self.anchor = anchor
        self.delta = delta
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case revisionId = "revision_id"
        case author = "author"
        case timestamp = "timestamp"
        case kind = "kind"
        case anchor = "anchor"
        case delta = "delta"
    }
}
// MARK: - Internal FFI conversions for DocumentRevision
internal extension DocumentRevision {
    /// Builds a `DocumentRevision` from a bridged reference.
    ///
    /// The anchor arrives as JSON text and is decoded with `JSONDecoder`; a missing
    /// anchor is decoded from the literal `null`, which yields `nil`.
    ///
    /// - Parameter rb: Bridged Rust-side value to read from.
    /// - Throws: `DecodingError.dataCorrupted` when the bridged kind is not a known
    ///   `RevisionKind` raw value (this used to terminate the process via `fatalError`),
    ///   plus anything thrown while decoding the anchor or converting the delta.
    init(_ rb: RustBridge.DocumentRevisionRef) throws {
        self.revisionId = rb.revisionId().toString()
        self.author = rb.author()?.toString()
        self.timestamp = rb.timestamp()?.toString()
        // Read the bridged name once so the lookup and the error message use the same value.
        let kindName = rb.kind().toString()
        guard let kind = RevisionKind(rawValue: kindName) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(codingPath: [], debugDescription: "Unknown RevisionKind: \(kindName)")
            )
        }
        self.kind = kind
        let anchorJSON = rb.anchor()?.toString() ?? "null"
        let anchorData = anchorJSON.data(using: .utf8) ?? Data("null".utf8)
        self.anchor = try JSONDecoder().decode(RevisionAnchor?.self, from: anchorData)
        self.delta = try RevisionDelta(rb.delta())
    }

    /// Converts this value into its bridged representation by round-tripping through JSON.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `"{}"` is sent instead.
    func intoRust() throws -> RustBridge.DocumentRevision {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.documentRevisionFromJson(json)
    }
}
/// The content changes that make up a single revision.
///
/// For insertions and deletions the `content` field carries the added/removed
/// lines as `DiffLine::Added` / `DiffLine::Removed` entries. For format
/// changes, `content` is empty — the property diff is left as a TODO for a
/// later enrichment pass.
public struct RevisionDelta: Codable, Sendable, Hashable {
    /// Line-level content changes for this revision.
    public let content: [DiffLine]

    /// Cell-level table changes for this revision.
    public let tableChanges: [CellChange]

    /// Creates a delta from its line-level and cell-level changes.
    public init(content: [DiffLine], tableChanges: [CellChange]) {
        self.content = content
        self.tableChanges = tableChanges
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case content = "content"
        case tableChanges = "table_changes"
    }

    /// Decodes the delta, treating a missing or null list as empty.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        self.content = try values.decodeIfPresent([DiffLine].self, forKey: .content) ?? []
        self.tableChanges = try values.decodeIfPresent([CellChange].self, forKey: .tableChanges) ?? []
    }
}
// MARK: - Internal FFI conversions for RevisionDelta
internal extension RevisionDelta {
    /// Builds a `RevisionDelta` from a bridged reference.
    ///
    /// Each content entry arrives as a JSON string and is decoded into a `DiffLine`;
    /// table changes are converted element by element.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    /// - Throws: Anything thrown while decoding a line or converting a cell change.
    init(_ ref: RustBridge.RevisionDeltaRef) throws {
        self.content = try ref.content().map { (entry: RustStringRef) -> DiffLine in
            // Text that cannot be turned into UTF-8 bytes becomes empty data, which the decoder rejects.
            let bytes = entry.as_str().toString().data(using: .utf8) ?? Data()
            return try JSONDecoder().decode(DiffLine.self, from: bytes)
        }
        self.tableChanges = try ref.tableChanges().map { try CellChange($0) }
    }

    /// Converts this value into its bridged representation.
    ///
    /// Both lists are copied element by element into freshly created `RustVec`s.
    ///
    /// - Throws: Whatever an element's `intoRust()` throws.
    func intoRust() throws -> RustBridge.RevisionDelta {
        let bridgedLines = RustVec<RustBridge.DiffLine>()
        for line in content {
            bridgedLines.push(value: try line.intoRust())
        }
        let bridgedCells = RustVec<RustBridge.CellChange>()
        for change in tableChanges {
            bridgedCells.push(value: try change.intoRust())
        }
        return RustBridge.RevisionDelta(bridgedLines, bridgedCells)
    }
}
/// Extracted table structure.
///
/// Represents a table detected and extracted from a document (PDF, image, etc.).
/// Tables are converted to both structured cell data and Markdown format.
public struct Table: Codable, Sendable, Hashable {
    /// Table cells as a 2D array (rows × columns).
    public let cells: [[String]]

    /// Markdown representation of the table.
    public let markdown: String

    /// Page number where the table was found (1-indexed).
    public let pageNumber: UInt32

    /// Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
    /// Only populated for PDF-extracted tables when position data is available.
    public let boundingBox: BoundingBox?

    /// Creates a table; `boundingBox` may be omitted.
    public init(cells: [[String]], markdown: String, pageNumber: UInt32, boundingBox: BoundingBox? = nil) {
        self.cells = cells
        self.markdown = markdown
        self.pageNumber = pageNumber
        self.boundingBox = boundingBox
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case cells = "cells"
        case markdown = "markdown"
        case pageNumber = "page_number"
        case boundingBox = "bounding_box"
    }

    /// Decodes the table.
    ///
    /// Missing or null values fall back to an empty cell grid, an empty Markdown string,
    /// page number `0`, and no bounding box.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        self.cells = try values.decodeIfPresent([[String]].self, forKey: .cells) ?? []
        self.markdown = try values.decodeIfPresent(String.self, forKey: .markdown) ?? ""
        self.pageNumber = try values.decodeIfPresent(UInt32.self, forKey: .pageNumber) ?? 0
        self.boundingBox = try values.decodeIfPresent(BoundingBox.self, forKey: .boundingBox)
    }
}
// MARK: - Internal FFI conversions for Table
internal extension Table {
    /// Builds a `Table` from a bridged reference.
    ///
    /// The cell grid arrives as JSON text and is decoded with `JSONDecoder`.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    /// - Throws: Anything thrown while decoding the cells or converting the bounding box.
    init(_ ref: RustBridge.TableRef) throws {
        let cellsJSON = ref.cells().toString()
        // If the text cannot be turned into UTF-8 bytes, decode the literal `null` instead.
        let cellsData = cellsJSON.data(using: .utf8) ?? Data("null".utf8)
        self.cells = try JSONDecoder().decode([[String]].self, from: cellsData)
        self.markdown = ref.markdown().toString()
        self.pageNumber = ref.pageNumber()
        self.boundingBox = try ref.boundingBox().map { try BoundingBox($0) }
    }

    /// Converts this value into its bridged representation by round-tripping through JSON.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `"{}"` is sent instead.
    func intoRust() throws -> RustBridge.Table {
        let encoded = try JSONEncoder().encode(self)
        let payload = String(data: encoded, encoding: .utf8) ?? "{}"
        return try RustBridge.tableFromJson(payload)
    }
}
/// Individual table cell with content and optional styling.
///
/// Future extension point for rich table support with cell-level metadata.
public struct TableCell: Codable, Sendable, Hashable {
    /// Cell content as text.
    public let content: String

    /// Row span (number of rows this cell spans).
    public let rowSpan: UInt32

    /// Column span (number of columns this cell spans).
    public let colSpan: UInt32

    /// Whether this is a header cell.
    public let isHeader: Bool

    /// Creates a cell with every field given explicitly.
    public init(content: String, rowSpan: UInt32, colSpan: UInt32, isHeader: Bool) {
        self.content = content
        self.rowSpan = rowSpan
        self.colSpan = colSpan
        self.isHeader = isHeader
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case content = "content"
        case rowSpan = "row_span"
        case colSpan = "col_span"
        case isHeader = "is_header"
    }

    /// Decodes the cell.
    ///
    /// Missing or null values fall back to an empty string, spans of `0`, and `false`.
    public init(from decoder: any Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        self.content = try values.decodeIfPresent(String.self, forKey: .content) ?? ""
        self.rowSpan = try values.decodeIfPresent(UInt32.self, forKey: .rowSpan) ?? 0
        self.colSpan = try values.decodeIfPresent(UInt32.self, forKey: .colSpan) ?? 0
        self.isHeader = try values.decodeIfPresent(Bool.self, forKey: .isHeader) ?? false
    }
}
// MARK: - Internal FFI conversions for TableCell
internal extension TableCell {
    /// Builds a `TableCell` by copying every field out of a bridged reference.
    ///
    /// - Parameter ref: Bridged Rust-side value to read from.
    init(_ ref: RustBridge.TableCellRef) throws {
        self.content = ref.content().toString()
        self.rowSpan = ref.rowSpan()
        self.colSpan = ref.colSpan()
        self.isHeader = ref.isHeader()
    }

    /// Converts this value into its bridged representation.
    func intoRust() throws -> RustBridge.TableCell {
        RustBridge.TableCell(
            RustString(content),
            rowSpan,
            colSpan,
            isHeader
        )
    }
}
/// A URI extracted from a document.
///
/// Represents any link, reference, or resource pointer found during extraction.
/// The `kind` field classifies the URI semantically, while `label` carries
/// optional human-readable display text.
public struct ExtractedUri: Codable, Sendable, Hashable {
    /// The URL or path string.
    public let url: String

    /// Optional display text / label for the link.
    public let label: String?

    /// Optional page number where the URI was found (1-indexed).
    public let page: UInt32?

    /// Semantic classification of the URI.
    public let kind: UriKind

    /// Creates a URI record; `label` and `page` may be omitted.
    public init(
        url: String,
        label: String? = nil,
        page: UInt32? = nil,
        kind: UriKind
    ) {
        self.url = url
        self.label = label
        self.page = page
        self.kind = kind
    }
}
// MARK: - Internal FFI conversions for ExtractedUri
internal extension ExtractedUri {
    /// Builds an `ExtractedUri` from a bridged reference.
    ///
    /// - Parameter rb: Bridged Rust-side value to read from.
    /// - Throws: `DecodingError.dataCorrupted` when the bridged kind is not a known
    ///   `UriKind` raw value. This used to terminate the process via `fatalError`.
    init(_ rb: RustBridge.ExtractedUriRef) throws {
        self.url = rb.url().toString()
        self.label = rb.label()?.toString()
        self.page = rb.page()
        // Read the bridged name once so the lookup and the error message use the same value.
        let kindName = rb.kind().toString()
        guard let kind = UriKind(rawValue: kindName) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(codingPath: [], debugDescription: "Unknown UriKind: \(kindName)")
            )
        }
        self.kind = kind
    }

    /// Converts this value into its bridged representation by round-tripping through JSON.
    ///
    /// If the encoded bytes cannot be read as UTF-8, the literal `"{}"` is sent instead.
    func intoRust() throws -> RustBridge.ExtractedUri {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.extractedUriFromJson(json)
    }
}
/// MIME type detection response.
public struct DetectResponse: Codable, Sendable, Hashable {
    /// Detected MIME type.
    public let mimeType: String

    /// Original filename (if provided).
    public let filename: String?

    /// Creates a response; `filename` may be omitted.
    public init(mimeType: String, filename: String? = nil) {
        self.mimeType = mimeType
        self.filename = filename
    }

    /// Maps Swift property names onto their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case mimeType = "mime_type"
        case filename = "filename"
    }
}
// MARK: - Internal FFI conversions for DetectResponse
internal extension DetectResponse {
    /// Builds the Swift value from a borrowed bridge reference.
    init(_ rb: RustBridge.DetectResponseRef) throws {
        self.init(
            mimeType: rb.mimeType().toString(),
            filename: rb.filename()?.toString()
        )
    }

    /// JSON-encodes this value and hands the text to
    /// `RustBridge.detectResponseFromJson`.
    func intoRust() throws -> RustBridge.DetectResponse {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            // Same fallback text the bridge receives when UTF-8 conversion fails.
            return try RustBridge.detectResponseFromJson("{}")
        }
        return try RustBridge.detectResponseFromJson(payload)
    }
}
/// Options controlling how two `ExtractionResult` values are compared.
///
/// The memberwise initializer and the decoder apply the same defaults:
/// metadata and embedded-children comparison enabled, no content truncation.
public struct DiffOptions: Codable, Sendable, Hashable {
    /// Include metadata changes in the diff. Default: `true`.
    public let includeMetadata: Bool
    /// Include embedded-children changes in the diff. Default: `true`.
    public let includeEmbedded: Bool
    /// Truncate content to this many characters before diffing.
    ///
    /// Useful for very large documents where only the first N characters matter.
    /// `nil` means no truncation.
    public let maxContentChars: UInt?

    /// Creates diff options.
    ///
    /// - Parameters:
    ///   - includeMetadata: Whether metadata changes are reported. Defaults to `true`.
    ///   - includeEmbedded: Whether embedded-children changes are reported. Defaults to `true`.
    ///   - maxContentChars: Optional truncation length; `nil` disables truncation.
    public init(includeMetadata: Bool = true, includeEmbedded: Bool = true, maxContentChars: UInt? = nil) {
        self.includeMetadata = includeMetadata
        self.includeEmbedded = includeEmbedded
        self.maxContentChars = maxContentChars
    }

    /// Maps Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case includeMetadata = "include_metadata"
        case includeEmbedded = "include_embedded"
        case maxContentChars = "max_content_chars"
    }

    /// Decodes the options, substituting the documented default for any
    /// key that is absent or `null`.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.includeMetadata = try container.decodeIfPresent(Bool.self, forKey: .includeMetadata) ?? true
        self.includeEmbedded = try container.decodeIfPresent(Bool.self, forKey: .includeEmbedded) ?? true
        self.maxContentChars = try container.decodeIfPresent(UInt.self, forKey: .maxContentChars)
    }
}
// MARK: - Internal FFI conversions for DiffOptions
internal extension DiffOptions {
    /// Builds the Swift value from a borrowed bridge reference.
    init(_ rb: RustBridge.DiffOptionsRef) throws {
        self.init(
            includeMetadata: rb.includeMetadata(),
            includeEmbedded: rb.includeEmbedded(),
            maxContentChars: rb.maxContentChars()
        )
    }

    /// Constructs the bridge value directly from the three fields.
    func intoRust() throws -> RustBridge.DiffOptions {
        RustBridge.DiffOptions(includeMetadata, includeEmbedded, maxContentChars)
    }
}
/// The complete diff between two `ExtractionResult` values.
///
/// Declared as a direct alias of the bridge type `RustBridge.ExtractionDiff`.
public typealias ExtractionDiff = RustBridge.ExtractionDiff
/// One contiguous hunk of a unified diff.
public struct DiffHunk: Codable, Sendable, Hashable {
    /// Line number in the old content where the hunk starts (0-indexed).
    public let fromLine: UInt
    /// How many lines of the old content the hunk covers.
    public let fromCount: UInt
    /// Line number in the new content where the hunk starts (0-indexed).
    public let toLine: UInt
    /// How many lines of the new content the hunk covers.
    public let toCount: UInt
    /// The lines that make up the hunk.
    public let lines: [DiffLine]

    /// Creates a hunk from its positions, counts, and lines.
    public init(
        fromLine: UInt,
        fromCount: UInt,
        toLine: UInt,
        toCount: UInt,
        lines: [DiffLine]
    ) {
        self.fromLine = fromLine
        self.fromCount = fromCount
        self.toLine = toLine
        self.toCount = toCount
        self.lines = lines
    }

    /// Maps Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case fromLine = "from_line"
        case fromCount = "from_count"
        case toLine = "to_line"
        case toCount = "to_count"
        case lines
    }
}
// MARK: - Internal FFI conversions for DiffHunk
internal extension DiffHunk {
    /// Builds the Swift value from a borrowed bridge reference.
    ///
    /// Each entry of `rb.lines()` is a bridge string holding JSON text, which is
    /// decoded into a `DiffLine`.
    init(_ rb: RustBridge.DiffHunkRef) throws {
        self.fromLine = rb.fromLine()
        self.fromCount = rb.fromCount()
        self.toLine = rb.toLine()
        self.toCount = rb.toCount()
        self.lines = try rb.lines().map { (entry: RustStringRef) -> DiffLine in
            let text = entry.as_str().toString()
            // An unconvertible string becomes empty data, which the decoder rejects.
            let payload = text.data(using: .utf8) ?? Data()
            return try JSONDecoder().decode(DiffLine.self, from: payload)
        }
    }

    /// JSON-encodes this value and hands the text to
    /// `RustBridge.diffHunkFromJson`.
    func intoRust() throws -> RustBridge.DiffHunk {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.diffHunkFromJson("{}")
        }
        return try RustBridge.diffHunkFromJson(payload)
    }
}
/// Cell-level changes between two tables that share the same index.
public struct TableDiff: Codable, Sendable, Hashable {
    /// Zero-based index of the table in both `a.tables` and `b.tables`.
    public let fromIndex: UInt
    /// Zero-based index in `b.tables` (equal to `from_index` for same-dimension tables).
    public let toIndex: UInt
    /// The individual cell changes inside the table.
    public let cellChanges: [CellChange]

    /// Creates a table diff from both indices and the list of cell changes.
    public init(
        fromIndex: UInt,
        toIndex: UInt,
        cellChanges: [CellChange]
    ) {
        self.fromIndex = fromIndex
        self.toIndex = toIndex
        self.cellChanges = cellChanges
    }

    /// Maps Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case fromIndex = "from_index"
        case toIndex = "to_index"
        case cellChanges = "cell_changes"
    }
}
// MARK: - Internal FFI conversions for TableDiff
internal extension TableDiff {
    /// Builds the Swift value from a borrowed bridge reference, converting each
    /// bridged cell change through `CellChange`'s own throwing initializer.
    init(_ rb: RustBridge.TableDiffRef) throws {
        try self.init(
            fromIndex: rb.fromIndex(),
            toIndex: rb.toIndex(),
            cellChanges: rb.cellChanges().map { bridged in try CellChange(bridged) }
        )
    }

    /// JSON-encodes this value and hands the text to
    /// `RustBridge.tableDiffFromJson`.
    func intoRust() throws -> RustBridge.TableDiff {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.tableDiffFromJson("{}")
        }
        return try RustBridge.tableDiffFromJson(payload)
    }
}
/// Changes to embedded archive children between two results.
///
/// Declared as a direct alias of the bridge type `RustBridge.EmbeddedChanges`.
public typealias EmbeddedChanges = RustBridge.EmbeddedChanges
/// Diff for a single embedded archive entry that appears in both results.
///
/// Declared as a direct alias of the bridge type `RustBridge.EmbeddedDiff`.
public typealias EmbeddedDiff = RustBridge.EmbeddedDiff
/// A preset configuration for a common RAG use case.
///
/// Each preset bundles a chunk size, an overlap, and an embedding model into a
/// configuration tuned for one scenario.
///
/// All string fields are owned `String` values for FFI compatibility —
/// instances are safe to clone and pass across language boundaries.
public struct EmbeddingPreset: Codable, Sendable, Hashable {
    public let name: String
    public let chunkSize: UInt
    public let overlap: UInt
    /// HuggingFace repository name for the model.
    public let modelRepo: String
    /// Pooling strategy: "cls" or "mean".
    public let pooling: String
    /// Path to the ONNX model file within the repo.
    public let modelFile: String
    public let dimensions: UInt
    public let description: String

    /// Creates a preset; every field is required.
    public init(
        name: String,
        chunkSize: UInt,
        overlap: UInt,
        modelRepo: String,
        pooling: String,
        modelFile: String,
        dimensions: UInt,
        description: String
    ) {
        self.name = name
        self.chunkSize = chunkSize
        self.overlap = overlap
        self.modelRepo = modelRepo
        self.pooling = pooling
        self.modelFile = modelFile
        self.dimensions = dimensions
        self.description = description
    }

    /// Maps Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case name
        case chunkSize = "chunk_size"
        case overlap
        case modelRepo = "model_repo"
        case pooling
        case modelFile = "model_file"
        case dimensions
        case description
    }
}
// MARK: - Internal FFI conversions for EmbeddingPreset
internal extension EmbeddingPreset {
    /// Builds the Swift value from a borrowed bridge reference; string fields
    /// are converted from bridge strings to Swift `String`.
    init(_ rb: RustBridge.EmbeddingPresetRef) throws {
        self.init(
            name: rb.name().toString(),
            chunkSize: rb.chunkSize(),
            overlap: rb.overlap(),
            modelRepo: rb.modelRepo().toString(),
            pooling: rb.pooling().toString(),
            modelFile: rb.modelFile().toString(),
            dimensions: rb.dimensions(),
            description: rb.description().toString()
        )
    }

    /// JSON-encodes this value and hands the text to
    /// `RustBridge.embeddingPresetFromJson`.
    func intoRust() throws -> RustBridge.EmbeddingPreset {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.embeddingPresetFromJson("{}")
        }
        return try RustBridge.embeddingPresetFromJson(payload)
    }
}
/// YAKE-specific parameters.
public struct YakeParams: Codable, Sendable, Hashable {
    /// Window size for co-occurrence analysis (default: 2).
    ///
    /// Controls the context window for computing co-occurrence statistics.
    public let windowSize: UInt

    /// Creates YAKE parameters.
    ///
    /// - Parameter windowSize: Co-occurrence window size. Defaults to `2`, the
    ///   same value the decoder uses when the key is missing.
    public init(windowSize: UInt = 2) {
        self.windowSize = windowSize
    }

    /// Maps the Swift property name to its snake_case JSON key.
    private enum CodingKeys: String, CodingKey {
        case windowSize = "window_size"
    }

    /// Decodes the parameters, using `2` when `window_size` is absent or `null`.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.windowSize = try container.decodeIfPresent(UInt.self, forKey: .windowSize) ?? 2
    }
}
// MARK: - Internal FFI conversions for YakeParams
internal extension YakeParams {
    /// Builds the Swift value from a borrowed bridge reference.
    init(_ rb: RustBridge.YakeParamsRef) throws {
        self.init(windowSize: rb.windowSize())
    }

    /// Constructs the bridge value directly from the window size.
    func intoRust() throws -> RustBridge.YakeParams {
        RustBridge.YakeParams(windowSize)
    }
}
/// RAKE-specific parameters.
public struct RakeParams: Codable, Sendable, Hashable {
    /// Minimum word length to consider (default: 1).
    public let minWordLength: UInt
    /// Maximum words in a keyword phrase (default: 3).
    public let maxWordsPerPhrase: UInt

    /// Creates RAKE parameters.
    ///
    /// - Parameters:
    ///   - minWordLength: Minimum word length to consider. Defaults to `1`.
    ///   - maxWordsPerPhrase: Maximum words in a phrase. Defaults to `3`.
    ///
    /// The defaults match the values the decoder substitutes for missing keys.
    public init(minWordLength: UInt = 1, maxWordsPerPhrase: UInt = 3) {
        self.minWordLength = minWordLength
        self.maxWordsPerPhrase = maxWordsPerPhrase
    }

    /// Maps Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case minWordLength = "min_word_length"
        case maxWordsPerPhrase = "max_words_per_phrase"
    }

    /// Decodes the parameters, using `1` and `3` for keys that are absent or `null`.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.minWordLength = try container.decodeIfPresent(UInt.self, forKey: .minWordLength) ?? 1
        self.maxWordsPerPhrase = try container.decodeIfPresent(UInt.self, forKey: .maxWordsPerPhrase) ?? 3
    }
}
// MARK: - Internal FFI conversions for RakeParams
internal extension RakeParams {
    /// Builds the Swift value from a borrowed bridge reference.
    init(_ rb: RustBridge.RakeParamsRef) throws {
        self.init(
            minWordLength: rb.minWordLength(),
            maxWordsPerPhrase: rb.maxWordsPerPhrase()
        )
    }

    /// Constructs the bridge value directly from the two fields.
    func intoRust() throws -> RustBridge.RakeParams {
        RustBridge.RakeParams(minWordLength, maxWordsPerPhrase)
    }
}
/// Keyword extraction configuration.
public struct KeywordConfig: Codable, Sendable, Hashable {
    /// Algorithm to use for extraction.
    public let algorithm: KeywordAlgorithm
    /// Maximum number of keywords to extract (default: 10).
    public let maxKeywords: UInt
    /// Minimum score threshold (0.0-1.0, default: 0.0).
    ///
    /// Keywords with scores below this threshold are filtered out.
    /// Note: Score ranges differ between algorithms.
    public let minScore: Float
    /// N-gram range for keyword extraction (min, max).
    ///
    /// (1, 1) = unigrams only
    /// (1, 2) = unigrams and bigrams
    /// (1, 3) = unigrams, bigrams, and trigrams (default)
    public let ngramRange: [UInt]
    /// Language code for stopword filtering (e.g., "en", "de", "fr").
    ///
    /// If `nil`, no stopword filtering is applied.
    public let language: String?
    /// YAKE-specific tuning parameters.
    public let yakeParams: YakeParams?
    /// RAKE-specific tuning parameters.
    public let rakeParams: RakeParams?

    /// Creates a keyword extraction configuration.
    ///
    /// - Parameters:
    ///   - algorithm: Extraction algorithm (required).
    ///   - maxKeywords: Maximum number of keywords. Defaults to `10`.
    ///   - minScore: Minimum score threshold. Defaults to `0.0`.
    ///   - ngramRange: `[min, max]` n-gram sizes. Defaults to `[1, 3]`.
    ///   - language: Stopword language code, or `nil` for no filtering.
    ///   - yakeParams: Optional YAKE tuning parameters.
    ///   - rakeParams: Optional RAKE tuning parameters.
    public init(algorithm: KeywordAlgorithm, maxKeywords: UInt = 10, minScore: Float = 0.0, ngramRange: [UInt] = [1, 3], language: String? = nil, yakeParams: YakeParams? = nil, rakeParams: RakeParams? = nil) {
        self.algorithm = algorithm
        self.maxKeywords = maxKeywords
        self.minScore = minScore
        self.ngramRange = ngramRange
        self.language = language
        self.yakeParams = yakeParams
        self.rakeParams = rakeParams
    }

    /// Maps Swift property names to their snake_case JSON keys.
    private enum CodingKeys: String, CodingKey {
        case algorithm = "algorithm"
        case maxKeywords = "max_keywords"
        case minScore = "min_score"
        case ngramRange = "ngram_range"
        case language = "language"
        case yakeParams = "yake_params"
        case rakeParams = "rake_params"
    }

    /// Decodes the configuration. `algorithm` is required; every other key
    /// falls back to its documented default when absent or `null`.
    public init(from decoder: any Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        self.algorithm = try container.decode(KeywordAlgorithm.self, forKey: .algorithm)
        self.maxKeywords = try container.decodeIfPresent(UInt.self, forKey: .maxKeywords) ?? 10
        self.minScore = try container.decodeIfPresent(Float.self, forKey: .minScore) ?? 0.0
        // The documented default is the (1, 3) pair. An empty array is not a
        // valid (min, max) range, so it must not be used as the fallback.
        self.ngramRange = try container.decodeIfPresent([UInt].self, forKey: .ngramRange) ?? [1, 3]
        self.language = try container.decodeIfPresent(String.self, forKey: .language)
        self.yakeParams = try container.decodeIfPresent(YakeParams.self, forKey: .yakeParams)
        self.rakeParams = try container.decodeIfPresent(RakeParams.self, forKey: .rakeParams)
    }
}
// MARK: - Internal FFI conversions for KeywordConfig
internal extension KeywordConfig {
    /// Copies the fields of a borrowed bridge reference into a Swift value.
    ///
    /// - Parameter rb: Borrowed bridge reference to read from.
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports an
    ///   algorithm string that matches no `KeywordAlgorithm` raw value (raised
    ///   as an error rather than terminating the process), or any error thrown
    ///   while converting the nested YAKE/RAKE parameters.
    init(_ rb: RustBridge.KeywordConfigRef) throws {
        let algorithmName = rb.algorithm().toString()
        guard let algorithm = KeywordAlgorithm(rawValue: algorithmName) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(
                    codingPath: [],
                    debugDescription: "Unknown KeywordAlgorithm: \(algorithmName)"
                )
            )
        }
        self.algorithm = algorithm
        self.maxKeywords = rb.maxKeywords()
        self.minScore = rb.minScore()
        self.ngramRange = Array(rb.ngramRange())
        self.language = rb.language()?.toString()
        self.yakeParams = try rb.yakeParams().map { try YakeParams($0) }
        self.rakeParams = try rb.rakeParams().map { try RakeParams($0) }
    }

    /// Converts this value into its bridge counterpart by JSON-encoding it and
    /// passing the text to `RustBridge.keywordConfigFromJson`.
    ///
    /// - Throws: Any error from `JSONEncoder` or from the bridge parser.
    func intoRust() throws -> RustBridge.KeywordConfig {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.keywordConfigFromJson(json)
    }
}
/// A keyword produced by extraction, together with its metadata.
public struct Keyword: Codable, Sendable, Hashable {
    /// The keyword text.
    public let text: String
    /// Relevance score (higher is better; the range depends on the algorithm).
    public let score: Float
    /// The algorithm that extracted this keyword.
    public let algorithm: KeywordAlgorithm
    /// Character offsets where the keyword appears in the text, when available.
    public let positions: [UInt]?

    /// Creates a keyword; `positions` defaults to `nil`.
    public init(
        text: String,
        score: Float,
        algorithm: KeywordAlgorithm,
        positions: [UInt]? = nil
    ) {
        self.text = text
        self.score = score
        self.algorithm = algorithm
        self.positions = positions
    }
}
// MARK: - Internal FFI conversions for Keyword
internal extension Keyword {
    /// Copies the fields of a borrowed bridge reference into a Swift value.
    ///
    /// - Parameter rb: Borrowed bridge reference to read from.
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports an
    ///   algorithm string that matches no `KeywordAlgorithm` raw value. The
    ///   initializer is already throwing, so the mismatch is reported to the
    ///   caller rather than terminating the process.
    init(_ rb: RustBridge.KeywordRef) throws {
        self.text = rb.text().toString()
        self.score = rb.score()

        let algorithmName = rb.algorithm().toString()
        guard let algorithm = KeywordAlgorithm(rawValue: algorithmName) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(
                    codingPath: [],
                    debugDescription: "Unknown KeywordAlgorithm: \(algorithmName)"
                )
            )
        }
        self.algorithm = algorithm
        self.positions = rb.positions().map { Array($0) }
    }

    /// Converts this value into its bridge counterpart by JSON-encoding it and
    /// passing the text to `RustBridge.keywordFromJson`.
    ///
    /// - Throws: Any error from `JSONEncoder` or from the bridge parser.
    func intoRust() throws -> RustBridge.Keyword {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.keywordFromJson(json)
    }
}
/// Configuration for the PaddleOCR backend.
///
/// Configures PaddleOCR text detection and recognition with multi-language support.
/// Uses a builder pattern for convenient configuration.
///
/// Declared as a direct alias of the bridge type `RustBridge.PaddleOcrConfig`.
///
/// # Examples
///
/// The examples below show the Rust API this type mirrors.
///
/// ```no_run
/// use kreuzberg::PaddleOcrConfig;
///
/// // Create with default English configuration
/// let config = PaddleOcrConfig::new("en");
///
/// // Create with custom cache directory
/// let config = PaddleOcrConfig::new("ch")
/// .with_cache_dir("/path/to/cache".into());
///
/// // Enable table detection
/// let config = PaddleOcrConfig::new("en")
/// .with_table_detection(true);
/// ```
public typealias PaddleOcrConfig = RustBridge.PaddleOcrConfig
/// Combined paths to all models needed for OCR (backward compatibility).
///
/// Declared as a direct alias of the bridge type `RustBridge.ModelPaths`.
public typealias ModelPaths = RustBridge.ModelPaths
/// Result of document orientation detection.
public struct OrientationResult: Codable, Sendable, Hashable {
    /// Detected orientation in degrees (0, 90, 180, or 270).
    public let degrees: UInt32
    /// Confidence score (0.0-1.0).
    public let confidence: Float

    /// Creates a result from an orientation and its confidence.
    public init(
        degrees: UInt32,
        confidence: Float
    ) {
        self.degrees = degrees
        self.confidence = confidence
    }
}
// MARK: - Internal FFI conversions for OrientationResult
internal extension OrientationResult {
    /// Builds the Swift value from a borrowed bridge reference.
    init(_ rb: RustBridge.OrientationResultRef) throws {
        self.init(degrees: rb.degrees(), confidence: rb.confidence())
    }

    /// Constructs the bridge value directly from the two fields.
    func intoRust() throws -> RustBridge.OrientationResult {
        RustBridge.OrientationResult(degrees, confidence)
    }
}
/// Bounding box in original image coordinates: (x1, y1) is the top-left
/// corner and (x2, y2) is the bottom-right corner.
public struct BBox: Codable, Sendable, Hashable {
    public let x1: Float
    public let y1: Float
    public let x2: Float
    public let y2: Float

    /// Creates a box from its two corner coordinates.
    public init(
        x1: Float,
        y1: Float,
        x2: Float,
        y2: Float
    ) {
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
    }
}
// MARK: - Internal FFI conversions for BBox
internal extension BBox {
    /// Builds the Swift value from a borrowed bridge reference.
    init(_ rb: RustBridge.BBoxRef) throws {
        self.init(x1: rb.x1(), y1: rb.y1(), x2: rb.x2(), y2: rb.y2())
    }

    /// Constructs the bridge value directly from the four coordinates.
    func intoRust() throws -> RustBridge.BBox {
        RustBridge.BBox(x1, y1, x2, y2)
    }
}
/// One layout detection: a class, its confidence, and its bounding box.
public struct LayoutDetection: Codable, Sendable, Hashable {
    public let className: LayoutClass
    public let confidence: Float
    public let bbox: BBox

    /// Creates a detection from its class, confidence, and bounding box.
    public init(
        className: LayoutClass,
        confidence: Float,
        bbox: BBox
    ) {
        self.className = className
        self.confidence = confidence
        self.bbox = bbox
    }

    /// Maps Swift property names to their JSON keys.
    private enum CodingKeys: String, CodingKey {
        case className = "class_name"
        case confidence
        case bbox
    }
}
// MARK: - Internal FFI conversions for LayoutDetection
internal extension LayoutDetection {
    /// Copies the fields of a borrowed bridge reference into a Swift value.
    ///
    /// - Parameter rb: Borrowed bridge reference to read from.
    /// - Throws: `DecodingError.dataCorrupted` when the bridge reports a class
    ///   name that matches no `LayoutClass` raw value (raised as an error
    ///   rather than terminating the process), or any error thrown while
    ///   converting the bounding box.
    init(_ rb: RustBridge.LayoutDetectionRef) throws {
        let rawClass = rb.className().toString()
        guard let className = LayoutClass(rawValue: rawClass) else {
            throw DecodingError.dataCorrupted(
                DecodingError.Context(
                    codingPath: [],
                    debugDescription: "Unknown LayoutClass: \(rawClass)"
                )
            )
        }
        self.className = className
        self.confidence = rb.confidence()
        self.bbox = try BBox(rb.bbox())
    }

    /// Converts this value into its bridge counterpart by JSON-encoding it and
    /// passing the text to `RustBridge.layoutDetectionFromJson`.
    ///
    /// - Throws: Any error from `JSONEncoder` or from the bridge parser.
    func intoRust() throws -> RustBridge.LayoutDetection {
        let data = try JSONEncoder().encode(self)
        let json = String(data: data, encoding: .utf8) ?? "{}"
        return try RustBridge.layoutDetectionFromJson(json)
    }
}
/// Pre-computed table markdown for one table detection region.
///
/// Produced by the TATR-based table structure recognizer and surfaced as part of
/// layout-aware OCR results. The struct lives under `layout-types` (pure Rust)
/// so that consumers who do not enable `layout-detection` (ORT) can still
/// reference the type in their own code.
public struct RecognizedTable: Codable, Sendable, Hashable {
    /// The detection bounding box this table corresponds to (used for matching).
    public let detectionBbox: BBox
    /// Table cells as a 2D array (rows × columns).
    public let cells: [[String]]
    /// The table rendered as markdown.
    public let markdown: String

    /// Creates a recognized table from its bounding box, cells, and markdown.
    public init(
        detectionBbox: BBox,
        cells: [[String]],
        markdown: String
    ) {
        self.detectionBbox = detectionBbox
        self.cells = cells
        self.markdown = markdown
    }

    /// Maps Swift property names to their JSON keys.
    private enum CodingKeys: String, CodingKey {
        case detectionBbox = "detection_bbox"
        case cells
        case markdown
    }
}
// MARK: - Internal FFI conversions for RecognizedTable
internal extension RecognizedTable {
    /// Builds the Swift value from a borrowed bridge reference.
    ///
    /// The cell grid arrives as a string of JSON text and is decoded into
    /// `[[String]]`.
    init(_ rb: RustBridge.RecognizedTableRef) throws {
        self.detectionBbox = try BBox(rb.detectionBbox())

        let cellsText = rb.cells().toString()
        // If the text cannot be converted, decode the literal `null` instead.
        let cellsData = cellsText.data(using: .utf8) ?? Data("null".utf8)
        self.cells = try JSONDecoder().decode([[String]].self, from: cellsData)

        self.markdown = rb.markdown().toString()
    }

    /// JSON-encodes this value and hands the text to
    /// `RustBridge.recognizedTableFromJson`.
    func intoRust() throws -> RustBridge.RecognizedTable {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.recognizedTableFromJson("{}")
        }
        return try RustBridge.recognizedTableFromJson(payload)
    }
}
/// Page-level detection result: every detection on the page plus the page's
/// dimensions.
public struct DetectionResult: Codable, Sendable, Hashable {
    public let pageWidth: UInt32
    public let pageHeight: UInt32
    public let detections: [LayoutDetection]

    /// Creates a result from the page dimensions and its detections.
    public init(
        pageWidth: UInt32,
        pageHeight: UInt32,
        detections: [LayoutDetection]
    ) {
        self.pageWidth = pageWidth
        self.pageHeight = pageHeight
        self.detections = detections
    }

    /// Maps Swift property names to their JSON keys.
    private enum CodingKeys: String, CodingKey {
        case pageWidth = "page_width"
        case pageHeight = "page_height"
        case detections
    }
}
// MARK: - Internal FFI conversions for DetectionResult
internal extension DetectionResult {
    /// Builds the Swift value from a borrowed bridge reference, converting each
    /// bridged detection through `LayoutDetection`'s own throwing initializer.
    init(_ rb: RustBridge.DetectionResultRef) throws {
        try self.init(
            pageWidth: rb.pageWidth(),
            pageHeight: rb.pageHeight(),
            detections: rb.detections().map { bridged in try LayoutDetection(bridged) }
        )
    }

    /// JSON-encodes this value and hands the text to
    /// `RustBridge.detectionResultFromJson`.
    func intoRust() throws -> RustBridge.DetectionResult {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.detectionResultFromJson("{}")
        }
        return try RustBridge.detectionResultFromJson(payload)
    }
}
/// Embedded file descriptor extracted from the PDF name tree.
///
/// Declared as a direct alias of the bridge type `RustBridge.EmbeddedFile`.
public typealias EmbeddedFile = RustBridge.EmbeddedFile
/// Metadata that only applies to PDF documents.
///
/// Holds the PDF-specific fields that are not part of the common `Metadata`
/// structure. Shared fields such as title, authors, keywords, and dates live
/// at the `Metadata` level.
public struct PdfMetadata: Codable, Sendable, Hashable {
    /// PDF version (e.g., "1.7", "2.0").
    public let pdfVersion: String?
    /// PDF producer (the application that created the PDF).
    public let producer: String?
    /// Whether the PDF is encrypted/password-protected.
    public let isEncrypted: Bool?
    /// First page width in points (1/72 inch).
    public let width: Int64?
    /// First page height in points (1/72 inch).
    public let height: Int64?
    /// Total number of pages in the PDF document.
    public let pageCount: UInt32?

    /// Creates PDF metadata; every field is optional and defaults to `nil`.
    public init(
        pdfVersion: String? = nil,
        producer: String? = nil,
        isEncrypted: Bool? = nil,
        width: Int64? = nil,
        height: Int64? = nil,
        pageCount: UInt32? = nil
    ) {
        self.pdfVersion = pdfVersion
        self.producer = producer
        self.isEncrypted = isEncrypted
        self.width = width
        self.height = height
        self.pageCount = pageCount
    }

    /// Maps Swift property names to their JSON keys.
    private enum CodingKeys: String, CodingKey {
        case pdfVersion = "pdf_version"
        case producer
        case isEncrypted = "is_encrypted"
        case width
        case height
        case pageCount = "page_count"
    }

    /// Decodes the metadata; any key that is absent or `null` yields `nil`.
    public init(from decoder: any Decoder) throws {
        let fields = try decoder.container(keyedBy: CodingKeys.self)
        self.pdfVersion = try fields.decodeIfPresent(String.self, forKey: .pdfVersion)
        self.producer = try fields.decodeIfPresent(String.self, forKey: .producer)
        self.isEncrypted = try fields.decodeIfPresent(Bool.self, forKey: .isEncrypted)
        self.width = try fields.decodeIfPresent(Int64.self, forKey: .width)
        self.height = try fields.decodeIfPresent(Int64.self, forKey: .height)
        self.pageCount = try fields.decodeIfPresent(UInt32.self, forKey: .pageCount)
    }
}
// MARK: - Internal FFI conversions for PdfMetadata
internal extension PdfMetadata {
    /// Builds the Swift value from a borrowed bridge reference; optional bridge
    /// strings are converted to optional Swift `String` values.
    init(_ rb: RustBridge.PdfMetadataRef) throws {
        self.init(
            pdfVersion: rb.pdfVersion()?.toString(),
            producer: rb.producer()?.toString(),
            isEncrypted: rb.isEncrypted(),
            width: rb.width(),
            height: rb.height(),
            pageCount: rb.pageCount()
        )
    }

    /// Constructs the bridge value directly, wrapping the optional strings in
    /// `RustString` when present.
    func intoRust() throws -> RustBridge.PdfMetadata {
        let bridgedVersion = self.pdfVersion.map(RustString.init)
        let bridgedProducer = self.producer.map(RustString.init)
        return RustBridge.PdfMetadata(
            bridgedVersion,
            bridgedProducer,
            self.isEncrypted,
            self.width,
            self.height,
            self.pageCount
        )
    }
}
/// ONNX Runtime execution provider type.
///
/// Selects the hardware backend used for model inference. `auto` (the
/// default) picks the best available provider for the platform.
public enum ExecutionProviderType: String, Codable, Sendable, Hashable {
    /// Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere.
    case auto

    /// CPU execution provider (always available).
    case cpu

    /// Apple CoreML (macOS/iOS Neural Engine + GPU).
    case coreMl = "coreml"

    /// NVIDIA CUDA GPU acceleration.
    case cuda

    /// NVIDIA TensorRT (optimized CUDA inference).
    case tensorRt = "tensorrt"
}
extension ExecutionProviderType {
    /// JSON-encodes this case and hands the text to
    /// `RustBridge.executionProviderTypeFromJson`.
    func intoRust() throws -> RustBridge.ExecutionProviderType {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            // Same fallback text the bridge receives when UTF-8 conversion fails.
            return try RustBridge.executionProviderTypeFromJson("null")
        }
        return try RustBridge.executionProviderTypeFromJson(payload)
    }
}
/// Output format for extraction results.
///
/// Determines the format of the `content` field in `ExtractionResult`.
/// `markdown`, `djot`, and `html` produce output in that format; `plain`
/// returns the raw extracted text; `structured` returns JSON with full OCR
/// element data, including bounding boxes and confidence scores.
//
// NOTE(review): `Codable` conformance is compiler-synthesized here, so each
// case is encoded nested under its own name (e.g. `{"custom":{"field0":…}}`).
// Confirm this is the shape `RustBridge.outputFormatFromJson` expects.
public enum OutputFormat: Codable, Sendable, Hashable {
    /// Plain text content only (default).
    case plain

    /// Markdown format.
    case markdown

    /// Djot markup format.
    case djot

    /// HTML format.
    case html

    /// JSON tree format with heading-driven sections.
    case json

    /// Structured JSON format with full OCR element metadata.
    case structured

    /// Custom renderer registered via the RendererRegistry.
    /// The string is the renderer name (e.g., "docx", "latex").
    case custom(field0: String)
}
extension OutputFormat {
    /// JSON-encodes this case and hands the text to
    /// `RustBridge.outputFormatFromJson`.
    func intoRust() throws -> RustBridge.OutputFormat {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.outputFormatFromJson("null")
        }
        return try RustBridge.outputFormatFromJson(payload)
    }
}
/// Selects one of the built-in HTML themes.
public enum HtmlTheme: String, Codable, Sendable, Hashable {
    /// Sensible defaults: system font stack, neutral colours, readable line
    /// measure. All CSS custom properties (`--kb-*`) are defined, so user CSS
    /// can override individual values.
    case `default`

    /// GitHub Markdown-inspired palette and spacing.
    case gitHub = "github"

    /// Dark background, light text.
    case dark

    /// Minimal light theme with generous whitespace.
    case light

    /// No built-in stylesheet emitted. CSS custom properties are still defined
    /// on `:root`, so user stylesheets can reference `var(--kb-*)` tokens.
    case unstyled
}
extension HtmlTheme {
    /// JSON-encodes this case and hands the text to
    /// `RustBridge.htmlThemeFromJson`.
    func intoRust() throws -> RustBridge.HtmlTheme {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.htmlThemeFromJson("null")
        }
        return try RustBridge.htmlThemeFromJson(payload)
    }
}
/// Selects the table structure recognition model.
///
/// Controls which model detects table cells inside layout-detected table
/// regions. The wire format is snake_case in all serializers (JSON, TOML,
/// YAML).
public enum TableModel: String, Codable, Sendable, Hashable {
    /// TATR (Table Transformer) -- default, 30MB, DETR-based row/column detection.
    case tatr

    /// SLANeXT wired variant -- 365MB, optimized for bordered tables.
    case slanetWired = "slanet_wired"

    /// SLANeXT wireless variant -- 365MB, optimized for borderless tables.
    case slanetWireless = "slanet_wireless"

    /// SLANet-plus -- 7.78MB, lightweight general-purpose.
    case slanetPlus = "slanet_plus"

    /// Classifier-routed SLANeXT: auto-select wired/wireless per table.
    /// Uses PP-LCNet classifier (6.78MB) + both SLANeXT variants (730MB total).
    case slanetAuto = "slanet_auto"

    /// Disable table structure model inference entirely; use heuristic path only.
    case disabled
}
extension TableModel {
    /// JSON-encodes this case and hands the text to
    /// `RustBridge.tableModelFromJson`.
    func intoRust() throws -> RustBridge.TableModel {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.tableModelFromJson("null")
        }
        return try RustBridge.tableModelFromJson(payload)
    }
}
/// The kind of text chunker to use.
///
/// # Variants
///
/// * `text` - Generic text splitter; splits on whitespace and punctuation.
/// * `markdown` - Markdown-aware splitter; preserves formatting and structure.
/// * `yaml` - YAML-aware splitter; creates one chunk per top-level key.
/// * `semantic` - Topic-aware chunker. With an `EmbeddingConfig`, it splits at
///   embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
///   lower = more splits). Without an embedding, it falls back to a
///   structural-boundary heuristic (ALL-CAPS headers, numbered sections,
///   blank-line paragraphs) and merges groups into chunks capped at
///   `max_characters` (default 1000). `topic_threshold` has no effect in the
///   fallback path. For best results, pair it with an embedding model.
public enum ChunkerType: String, Codable, Sendable, Hashable {
    case text, markdown, yaml, semantic
}
extension ChunkerType {
    /// JSON-encodes this case and hands the text to
    /// `RustBridge.chunkerTypeFromJson`.
    func intoRust() throws -> RustBridge.ChunkerType {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.chunkerTypeFromJson("null")
        }
        return try RustBridge.chunkerTypeFromJson(payload)
    }
}
/// How chunk size is measured.
///
/// Defaults to `characters` (Unicode character count). With token-based
/// sizing, chunks are sized by token count according to the given tokenizer.
///
/// Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
/// available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
/// (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
///
/// The JSON form is an object with a `type` discriminator (`"characters"` or
/// `"tokenizer"`); the tokenizer variant adds `model` and, when set, `cache_dir`.
public enum ChunkSizing: Codable, Sendable, Hashable {
    /// Size measured in Unicode characters (default).
    case characters
    /// Size measured in tokens from a HuggingFace tokenizer.
    case tokenizer(model: String, cacheDir: URL?)

    /// Keys of the tagged JSON object.
    private enum CodingKeys: String, CodingKey {
        case type
        case cacheDir = "cache_dir"
        case model
    }

    /// Decodes the variant named by the `type` key.
    ///
    /// - Throws: `DecodingError.dataCorrupted` for an unrecognized `type`, or
    ///   any decoding error for a missing or mistyped field.
    public init(from decoder: Decoder) throws {
        let fields = try decoder.container(keyedBy: CodingKeys.self)
        let tag = try fields.decode(String.self, forKey: .type)
        switch tag {
        case "characters":
            self = .characters
        case "tokenizer":
            let model = try fields.decode(String.self, forKey: .model)
            let cacheDir = try fields.decodeIfPresent(URL.self, forKey: .cacheDir)
            self = .tokenizer(model: model, cacheDir: cacheDir)
        default:
            throw DecodingError.dataCorruptedError(
                forKey: .type,
                in: fields,
                debugDescription: "Unknown ChunkSizing type: \(tag)"
            )
        }
    }

    /// Encodes the `type` discriminator followed by the variant's fields;
    /// `cache_dir` is omitted when it is `nil`.
    public func encode(to encoder: Encoder) throws {
        var fields = encoder.container(keyedBy: CodingKeys.self)
        switch self {
        case .characters:
            try fields.encode("characters", forKey: .type)
        case let .tokenizer(model, cacheDir):
            try fields.encode("tokenizer", forKey: .type)
            try fields.encode(model, forKey: .model)
            try fields.encodeIfPresent(cacheDir, forKey: .cacheDir)
        }
    }
}
extension ChunkSizing {
    /// JSON-encodes this value and hands the text to
    /// `RustBridge.chunkSizingFromJson`.
    func intoRust() throws -> RustBridge.ChunkSizing {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.chunkSizingFromJson("null")
        }
        return try RustBridge.chunkSizingFromJson(payload)
    }
}
/// Embedding model types supported by Kreuzberg.
///
/// The JSON form is an object with a `type` discriminator (`"preset"`,
/// `"custom"`, `"llm"`, or `"plugin"`) plus the fields of that variant.
public enum EmbeddingModelType: Codable, Sendable, Hashable {
    /// Use a preset model configuration (recommended).
    case preset(name: String)
    /// Use a custom ONNX model from HuggingFace.
    case custom(modelId: String, dimensions: UInt)
    /// Provider-hosted embedding model via liter-llm.
    ///
    /// Uses the model specified in the nested `LlmConfig` (e.g.,
    /// `"openai/text-embedding-3-small"`).
    case llm(llm: LlmConfig)
    /// In-process embedding backend registered via the plugin system.
    ///
    /// The caller registers an `EmbeddingBackend` once (e.g. a wrapper around an
    /// already-loaded `llama-cpp-python`, `sentence-transformers`, or tuned ONNX
    /// model), then references it by name in config. Kreuzberg calls back into
    /// the registered backend during chunking and standalone embed requests —
    /// no HuggingFace download, no ONNX Runtime requirement, no HTTP sidecar.
    ///
    /// When this variant is selected, only the following `EmbeddingConfig` fields
    /// apply: `normalize` (post-call L2 normalization) and `max_embed_duration_secs`
    /// (dispatcher timeout). Model-loading fields (`batch_size`, `cache_dir`,
    /// `show_download_progress`, `acceleration`) are ignored — the host owns the
    /// model lifecycle.
    ///
    /// Semantic chunking falls back to `ChunkingConfig::max_characters` when this
    /// variant is used, since there is no preset to look a chunk-size ceiling up
    /// against — size your context window via `max_characters` directly.
    ///
    /// See `register_embedding_backend`.
    case plugin(name: String)

    /// Keys of the tagged JSON object.
    private enum CodingKeys: String, CodingKey {
        case type
        case dimensions
        case llm
        case modelId = "model_id"
        case name
    }

    /// Decodes the variant named by the `type` key.
    ///
    /// - Throws: `DecodingError.dataCorrupted` for an unrecognized `type`, or
    ///   any decoding error for a missing or mistyped field.
    public init(from decoder: Decoder) throws {
        let fields = try decoder.container(keyedBy: CodingKeys.self)
        let tag = try fields.decode(String.self, forKey: .type)
        switch tag {
        case "preset":
            let name = try fields.decode(String.self, forKey: .name)
            self = .preset(name: name)
        case "custom":
            let modelId = try fields.decode(String.self, forKey: .modelId)
            let dimensions = try fields.decode(UInt.self, forKey: .dimensions)
            self = .custom(modelId: modelId, dimensions: dimensions)
        case "llm":
            let config = try fields.decode(LlmConfig.self, forKey: .llm)
            self = .llm(llm: config)
        case "plugin":
            let name = try fields.decode(String.self, forKey: .name)
            self = .plugin(name: name)
        default:
            throw DecodingError.dataCorruptedError(
                forKey: .type,
                in: fields,
                debugDescription: "Unknown EmbeddingModelType type: \(tag)"
            )
        }
    }

    /// Encodes the `type` discriminator followed by the variant's fields.
    public func encode(to encoder: Encoder) throws {
        var fields = encoder.container(keyedBy: CodingKeys.self)
        switch self {
        case let .preset(name):
            try fields.encode("preset", forKey: .type)
            try fields.encode(name, forKey: .name)
        case let .custom(modelId, dimensions):
            try fields.encode("custom", forKey: .type)
            try fields.encode(modelId, forKey: .modelId)
            try fields.encode(dimensions, forKey: .dimensions)
        case let .llm(config):
            try fields.encode("llm", forKey: .type)
            try fields.encode(config, forKey: .llm)
        case let .plugin(name):
            try fields.encode("plugin", forKey: .type)
            try fields.encode(name, forKey: .name)
        }
    }
}
extension EmbeddingModelType {
    /// JSON-encodes this value and hands the text to
    /// `RustBridge.embeddingModelTypeFromJson`.
    func intoRust() throws -> RustBridge.EmbeddingModelType {
        let encoded = try JSONEncoder().encode(self)
        guard let payload = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.embeddingModelTypeFromJson("null")
        }
        return try RustBridge.embeddingModelTypeFromJson(payload)
    }
}
/// Selects how extracted code is rendered into the `content` field of
/// `ExtractionResult`.
///
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum CodeContentMode: String, Codable, Sendable, Hashable {
    /// TSLP semantic chunks become the content (default).
    case chunks = "chunks"
    /// The raw source code becomes the content.
    case raw = "raw"
    /// Function/class headings and docstrings are emitted, without code bodies.
    case structure = "structure"
}
extension CodeContentMode {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.codeContentModeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.CodeContentMode {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.codeContentModeFromJson(payload ?? "null")
    }
}
/// Type of list detection.
///
/// This is an alias for `RustBridge.ListType`: the bridge type is exposed
/// directly under this module's name instead of being mirrored by a Swift enum.
public typealias ListType = RustBridge.ListType
/// OCR backend types.
///
/// Each case encodes to the capitalised string given as its raw value
/// (for example `.easyOcr` is encoded as `"EasyOCR"`).
public enum OcrBackendType: String, Codable, Sendable, Hashable {
/// Tesseract OCR (native Rust binding). Encoded as `"Tesseract"`.
case tesseract = "Tesseract"
/// EasyOCR (Python-based, via FFI). Encoded as `"EasyOCR"`.
case easyOcr = "EasyOCR"
/// PaddleOCR (Python-based, via FFI). Encoded as `"PaddleOCR"`.
case paddleOcr = "PaddleOCR"
/// Custom/third-party OCR backend. Encoded as `"Custom"`.
case custom = "Custom"
}
extension OcrBackendType {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.ocrBackendTypeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.OcrBackendType {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.ocrBackendTypeFromJson(payload ?? "null")
    }
}
/// Processing stages for post-processors.
///
/// Post-processors are executed in stage order (Early -> Middle -> Late).
/// Use stages to control the order of post-processing operations.
///
/// Each case encodes to the capitalised string given as its raw value.
public enum ProcessingStage: String, Codable, Sendable, Hashable {
/// Early stage - foundational processing.
///
/// Use for:
/// - Language detection
/// - Character encoding normalization
/// - Entity extraction (NER)
/// - Text quality scoring
case early = "Early"
/// Middle stage - content transformation.
///
/// Use for:
/// - Keyword extraction
/// - Token reduction
/// - Text summarization
/// - Semantic analysis
case middle = "Middle"
/// Late stage - final enrichment.
///
/// Use for:
/// - Custom user hooks
/// - Analytics/logging
/// - Final validation
/// - Output formatting
case late = "Late"
}
extension ProcessingStage {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.processingStageFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.ProcessingStage {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.processingStageFromJson(payload ?? "null")
    }
}
/// Reduction level selector with five settings, declared from `off` to `maximum`.
///
/// Each case encodes to the capitalised string given as its raw value.
///
/// NOTE(review): presumably this controls how aggressively token reduction trims
/// text; the effect of each level is not visible here - confirm against the
/// Rust definition.
public enum ReductionLevel: String, Codable, Sendable, Hashable {
/// Encoded as `"Off"`.
case off = "Off"
/// Encoded as `"Light"`.
case light = "Light"
/// Encoded as `"Moderate"`.
case moderate = "Moderate"
/// Encoded as `"Aggressive"`.
case aggressive = "Aggressive"
/// Encoded as `"Maximum"`.
case maximum = "Maximum"
}
extension ReductionLevel {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.reductionLevelFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.ReductionLevel {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.reductionLevelFromJson(payload ?? "null")
    }
}
/// Classifies a PDF annotation.
///
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum PdfAnnotationType: String, Codable, Sendable, Hashable {
    /// Sticky note / text annotation.
    case text = "text"
    /// Highlighted text region.
    case highlight = "highlight"
    /// Hyperlink annotation.
    case link = "link"
    /// Rubber stamp annotation.
    case stamp = "stamp"
    /// Underline text markup.
    case underline = "underline"
    /// Strikeout text markup.
    case strikeOut = "strike_out"
    /// Any annotation type not covered by the other cases.
    case other = "other"
}
extension PdfAnnotationType {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.pdfAnnotationTypeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.PdfAnnotationType {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.pdfAnnotationTypeFromJson(payload ?? "null")
    }
}
/// Kinds of block-level element in Djot.
///
/// Raw values are written out for every case so the snake_case strings used
/// when encoding stay visible next to the case names.
public enum BlockType: String, Codable, Sendable, Hashable {
    case paragraph = "paragraph"
    case heading = "heading"
    case blockquote = "blockquote"
    case codeBlock = "code_block"
    case listItem = "list_item"
    case orderedList = "ordered_list"
    case bulletList = "bullet_list"
    case taskList = "task_list"
    case definitionList = "definition_list"
    case definitionTerm = "definition_term"
    case definitionDescription = "definition_description"
    case div = "div"
    case section = "section"
    case thematicBreak = "thematic_break"
    case rawBlock = "raw_block"
    case mathDisplay = "math_display"
}
extension BlockType {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.blockTypeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.BlockType {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.blockTypeFromJson(payload ?? "null")
    }
}
/// Kinds of inline element in Djot.
///
/// Raw values are written out for every case so the strings used when
/// encoding stay visible next to the case names.
public enum InlineType: String, Codable, Sendable, Hashable {
    case text = "text"
    case strong = "strong"
    case emphasis = "emphasis"
    case highlight = "highlight"
    // `subscript` is a Swift keyword, hence the backticks around the case name.
    case `subscript` = "subscript"
    case superscript = "superscript"
    case insert = "insert"
    case delete = "delete"
    case code = "code"
    case link = "link"
    case image = "image"
    case span = "span"
    case math = "math"
    case rawInline = "raw_inline"
    case footnoteRef = "footnote_ref"
    case symbol = "symbol"
}
extension InlineType {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.inlineTypeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.InlineType {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.inlineTypeFromJson(payload ?? "null")
    }
}
/// Semantic kind of a relationship between two document elements.
///
/// Raw values are written out for every case so the snake_case strings used
/// when encoding stay visible next to the case names.
public enum RelationshipKind: String, Codable, Sendable, Hashable {
    /// Footnote marker -> footnote definition.
    case footnoteReference = "footnote_reference"
    /// Citation marker -> bibliography entry.
    case citationReference = "citation_reference"
    /// Internal anchor link (`#id`) -> target heading/element.
    case internalLink = "internal_link"
    /// Caption paragraph -> the figure/table it describes.
    case caption = "caption"
    /// Label -> labeled element (HTML `<label for>`, LaTeX `\label{}`).
    case label = "label"
    /// TOC entry -> target section.
    case tocEntry = "toc_entry"
    /// Cross-reference (LaTeX `\ref{}`, DOCX cross-reference field).
    case crossReference = "cross_reference"
}
extension RelationshipKind {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.relationshipKindFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.RelationshipKind {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.relationshipKindFromJson(payload ?? "null")
    }
}
/// Layer a document node belongs to.
///
/// Replaces separate body/furniture arrays with per-node granularity.
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum ContentLayer: String, Codable, Sendable, Hashable {
    /// Main document body content.
    case body = "body"
    /// Page/section header (running header).
    case header = "header"
    /// Page/section footer (running footer).
    case footer = "footer"
    /// Footnote content.
    case footnote = "footnote"
}
extension ContentLayer {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.contentLayerFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.ContentLayer {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.contentLayerFromJson(payload ?? "null")
    }
}
/// Tagged union describing the content of a document node; each variant
/// carries only the data specific to its type.
///
/// The JSON form is a single object whose `node_type` field names the variant
/// and whose remaining fields hold that variant's payload. The Rust side uses
/// `#[serde(tag = "node_type")]` to avoid a "type" keyword collision in
/// Go/Java/TypeScript bindings.
public enum NodeContent: Codable, Sendable, Hashable {
    /// Document title.
    case title(text: String)
    /// Section heading with level (1-6).
    case heading(level: UInt8, text: String)
    /// Body text paragraph.
    case paragraph(text: String)
    /// List container - children are `ListItem` nodes.
    case list(ordered: Bool)
    /// Individual list item.
    case listItem(text: String)
    /// Table with structured cell grid.
    case table(grid: TableGrid)
    /// Image reference.
    case image(description: String?, imageIndex: UInt32?, src: String?)
    /// Code block.
    case code(text: String, language: String?)
    /// Block quote container; children carry the quoted content.
    case quote
    /// Mathematical formula / equation.
    case formula(text: String)
    /// Footnote reference content.
    case footnote(text: String)
    /// Logical grouping container (section, key-value area).
    ///
    /// `heading_level` + `heading_text` capture the section heading directly
    /// rather than relying on a first-child positional convention.
    case group(label: String?, headingLevel: UInt8?, headingText: String?)
    /// Page break marker.
    case pageBreak
    /// Presentation slide container - children are the slide's content nodes.
    case slide(number: UInt32, title: String?)
    /// Definition list container - children are `DefinitionItem` nodes.
    case definitionList
    /// Individual definition list entry with term and definition.
    case definitionItem(term: String, definition: String)
    /// Citation or bibliographic reference.
    case citation(key: String, text: String)
    /// Admonition / callout container (note, warning, tip, etc.).
    ///
    /// Children carry the admonition body content.
    case admonition(kind: String, title: String?)
    /// Raw block preserved verbatim from the source format.
    ///
    /// Used for content that cannot be mapped to a semantic node type
    /// (e.g. JSX in MDX, raw LaTeX in markdown, embedded HTML).
    case rawBlock(format: String, content: String)
    /// Structured metadata block (email headers, YAML frontmatter, etc.).
    case metadataBlock(entries: [[String]])

    /// JSON field names: the `node_type` discriminator plus every payload field.
    private enum CodingKeys: String, CodingKey {
        case nodeType = "node_type"
        case content
        case definition
        case description
        case entries
        case format
        case grid
        case headingLevel = "heading_level"
        case headingText = "heading_text"
        case imageIndex = "image_index"
        case key
        case kind
        case label
        case language
        case level
        case number
        case ordered
        case src
        case term
        case text
        case title
    }

    /// Decodes a node by reading `node_type` and then the fields of the variant it names.
    ///
    /// Optional payload fields are read with `decodeIfPresent`.
    ///
    /// - Throws: `DecodingError` when `node_type` or a required field cannot be
    ///   decoded, or when `node_type` names no known variant.
    public init(from decoder: Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        let tag = try values.decode(String.self, forKey: .nodeType)
        switch tag {
        case "title":
            let text = try values.decode(String.self, forKey: .text)
            self = .title(text: text)
        case "heading":
            let level = try values.decode(UInt8.self, forKey: .level)
            let text = try values.decode(String.self, forKey: .text)
            self = .heading(level: level, text: text)
        case "paragraph":
            let text = try values.decode(String.self, forKey: .text)
            self = .paragraph(text: text)
        case "list":
            let ordered = try values.decode(Bool.self, forKey: .ordered)
            self = .list(ordered: ordered)
        case "list_item":
            let text = try values.decode(String.self, forKey: .text)
            self = .listItem(text: text)
        case "table":
            let grid = try values.decode(TableGrid.self, forKey: .grid)
            self = .table(grid: grid)
        case "image":
            let description = try values.decodeIfPresent(String.self, forKey: .description)
            let imageIndex = try values.decodeIfPresent(UInt32.self, forKey: .imageIndex)
            let src = try values.decodeIfPresent(String.self, forKey: .src)
            self = .image(description: description, imageIndex: imageIndex, src: src)
        case "code":
            let text = try values.decode(String.self, forKey: .text)
            let language = try values.decodeIfPresent(String.self, forKey: .language)
            self = .code(text: text, language: language)
        case "quote":
            self = .quote
        case "formula":
            let text = try values.decode(String.self, forKey: .text)
            self = .formula(text: text)
        case "footnote":
            let text = try values.decode(String.self, forKey: .text)
            self = .footnote(text: text)
        case "group":
            let label = try values.decodeIfPresent(String.self, forKey: .label)
            let headingLevel = try values.decodeIfPresent(UInt8.self, forKey: .headingLevel)
            let headingText = try values.decodeIfPresent(String.self, forKey: .headingText)
            self = .group(label: label, headingLevel: headingLevel, headingText: headingText)
        case "page_break":
            self = .pageBreak
        case "slide":
            let number = try values.decode(UInt32.self, forKey: .number)
            let title = try values.decodeIfPresent(String.self, forKey: .title)
            self = .slide(number: number, title: title)
        case "definition_list":
            self = .definitionList
        case "definition_item":
            let term = try values.decode(String.self, forKey: .term)
            let definition = try values.decode(String.self, forKey: .definition)
            self = .definitionItem(term: term, definition: definition)
        case "citation":
            let key = try values.decode(String.self, forKey: .key)
            let text = try values.decode(String.self, forKey: .text)
            self = .citation(key: key, text: text)
        case "admonition":
            let kind = try values.decode(String.self, forKey: .kind)
            let title = try values.decodeIfPresent(String.self, forKey: .title)
            self = .admonition(kind: kind, title: title)
        case "raw_block":
            let format = try values.decode(String.self, forKey: .format)
            let content = try values.decode(String.self, forKey: .content)
            self = .rawBlock(format: format, content: content)
        case "metadata_block":
            let entries = try values.decode([[String]].self, forKey: .entries)
            self = .metadataBlock(entries: entries)
        default:
            throw DecodingError.dataCorruptedError(
                forKey: .nodeType,
                in: values,
                debugDescription: "Unknown NodeContent type: \(tag)"
            )
        }
    }

    /// Encodes the node as a keyed object: `node_type` first, then the variant's fields.
    ///
    /// Optional payload fields are written with `encodeIfPresent`, so `nil`
    /// values are left out of the output.
    ///
    /// - Throws: Any error raised by the encoder's keyed container.
    public func encode(to encoder: Encoder) throws {
        var values = encoder.container(keyedBy: CodingKeys.self)
        switch self {
        case let .title(text):
            try values.encode("title", forKey: .nodeType)
            try values.encode(text, forKey: .text)
        case let .heading(level, text):
            try values.encode("heading", forKey: .nodeType)
            try values.encode(level, forKey: .level)
            try values.encode(text, forKey: .text)
        case let .paragraph(text):
            try values.encode("paragraph", forKey: .nodeType)
            try values.encode(text, forKey: .text)
        case let .list(ordered):
            try values.encode("list", forKey: .nodeType)
            try values.encode(ordered, forKey: .ordered)
        case let .listItem(text):
            try values.encode("list_item", forKey: .nodeType)
            try values.encode(text, forKey: .text)
        case let .table(grid):
            try values.encode("table", forKey: .nodeType)
            try values.encode(grid, forKey: .grid)
        case let .image(description, imageIndex, src):
            try values.encode("image", forKey: .nodeType)
            try values.encodeIfPresent(description, forKey: .description)
            try values.encodeIfPresent(imageIndex, forKey: .imageIndex)
            try values.encodeIfPresent(src, forKey: .src)
        case let .code(text, language):
            try values.encode("code", forKey: .nodeType)
            try values.encode(text, forKey: .text)
            try values.encodeIfPresent(language, forKey: .language)
        case .quote:
            try values.encode("quote", forKey: .nodeType)
        case let .formula(text):
            try values.encode("formula", forKey: .nodeType)
            try values.encode(text, forKey: .text)
        case let .footnote(text):
            try values.encode("footnote", forKey: .nodeType)
            try values.encode(text, forKey: .text)
        case let .group(label, headingLevel, headingText):
            try values.encode("group", forKey: .nodeType)
            try values.encodeIfPresent(label, forKey: .label)
            try values.encodeIfPresent(headingLevel, forKey: .headingLevel)
            try values.encodeIfPresent(headingText, forKey: .headingText)
        case .pageBreak:
            try values.encode("page_break", forKey: .nodeType)
        case let .slide(number, title):
            try values.encode("slide", forKey: .nodeType)
            try values.encode(number, forKey: .number)
            try values.encodeIfPresent(title, forKey: .title)
        case .definitionList:
            try values.encode("definition_list", forKey: .nodeType)
        case let .definitionItem(term, definition):
            try values.encode("definition_item", forKey: .nodeType)
            try values.encode(term, forKey: .term)
            try values.encode(definition, forKey: .definition)
        case let .citation(key, text):
            try values.encode("citation", forKey: .nodeType)
            try values.encode(key, forKey: .key)
            try values.encode(text, forKey: .text)
        case let .admonition(kind, title):
            try values.encode("admonition", forKey: .nodeType)
            try values.encode(kind, forKey: .kind)
            try values.encodeIfPresent(title, forKey: .title)
        case let .rawBlock(format, content):
            try values.encode("raw_block", forKey: .nodeType)
            try values.encode(format, forKey: .format)
            try values.encode(content, forKey: .content)
        case let .metadataBlock(entries):
            try values.encode("metadata_block", forKey: .nodeType)
            try values.encode(entries, forKey: .entries)
        }
    }
}
extension NodeContent {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.nodeContentFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.NodeContent {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.nodeContentFromJson(payload ?? "null")
    }
}
/// Kinds of inline text annotation.
///
/// The JSON form is a single object whose `annotation_type` field names the
/// variant; variants with a payload add their fields alongside it.
public enum AnnotationKind: Codable, Sendable, Hashable {
    case bold
    case italic
    case underline
    case strikethrough
    case code
    case `subscript`
    case superscript
    case link(url: String, title: String?)
    /// Highlighted text (PDF highlights, HTML `<mark>`).
    case highlight
    /// Text color (CSS-compatible value, e.g. "#ff0000", "red").
    case color(value: String)
    /// Font size with units (e.g. "12pt", "1.2em", "16px").
    case fontSize(value: String)
    /// Extensible annotation for format-specific styling.
    case custom(name: String, value: String?)

    /// JSON field names: the `annotation_type` discriminator plus payload fields.
    private enum CodingKeys: String, CodingKey {
        case annotationType = "annotation_type"
        case name
        case title
        case url
        case value
    }

    /// Decodes an annotation by reading `annotation_type` and then the fields of
    /// the variant it names.
    ///
    /// - Throws: `DecodingError` when `annotation_type` or a required field cannot
    ///   be decoded, or when `annotation_type` names no known variant.
    public init(from decoder: Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        let tag = try values.decode(String.self, forKey: .annotationType)
        switch tag {
        case "bold":
            self = .bold
        case "italic":
            self = .italic
        case "underline":
            self = .underline
        case "strikethrough":
            self = .strikethrough
        case "code":
            self = .code
        case "subscript":
            self = .`subscript`
        case "superscript":
            self = .superscript
        case "link":
            let url = try values.decode(String.self, forKey: .url)
            let title = try values.decodeIfPresent(String.self, forKey: .title)
            self = .link(url: url, title: title)
        case "highlight":
            self = .highlight
        case "color":
            let value = try values.decode(String.self, forKey: .value)
            self = .color(value: value)
        case "font_size":
            let value = try values.decode(String.self, forKey: .value)
            self = .fontSize(value: value)
        case "custom":
            let name = try values.decode(String.self, forKey: .name)
            let value = try values.decodeIfPresent(String.self, forKey: .value)
            self = .custom(name: name, value: value)
        default:
            throw DecodingError.dataCorruptedError(
                forKey: .annotationType,
                in: values,
                debugDescription: "Unknown AnnotationKind type: \(tag)"
            )
        }
    }

    /// Encodes the annotation as a keyed object: `annotation_type` first, then
    /// any payload fields. Optional fields that are `nil` are left out.
    ///
    /// - Throws: Any error raised by the encoder's keyed container.
    public func encode(to encoder: Encoder) throws {
        var values = encoder.container(keyedBy: CodingKeys.self)
        switch self {
        case .bold:
            try values.encode("bold", forKey: .annotationType)
        case .italic:
            try values.encode("italic", forKey: .annotationType)
        case .underline:
            try values.encode("underline", forKey: .annotationType)
        case .strikethrough:
            try values.encode("strikethrough", forKey: .annotationType)
        case .code:
            try values.encode("code", forKey: .annotationType)
        case .`subscript`:
            try values.encode("subscript", forKey: .annotationType)
        case .superscript:
            try values.encode("superscript", forKey: .annotationType)
        case let .link(url, title):
            try values.encode("link", forKey: .annotationType)
            try values.encode(url, forKey: .url)
            try values.encodeIfPresent(title, forKey: .title)
        case .highlight:
            try values.encode("highlight", forKey: .annotationType)
        case let .color(value):
            try values.encode("color", forKey: .annotationType)
            try values.encode(value, forKey: .value)
        case let .fontSize(value):
            try values.encode("font_size", forKey: .annotationType)
            try values.encode(value, forKey: .value)
        case let .custom(name, value):
            try values.encode("custom", forKey: .annotationType)
            try values.encode(name, forKey: .name)
            try values.encodeIfPresent(value, forKey: .value)
        }
    }
}
extension AnnotationKind {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.annotationKindFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.AnnotationKind {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.annotationKindFromJson(payload ?? "null")
    }
}
/// Describes how the extracted text was produced.
///
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum ExtractionMethod: String, Codable, Sendable, Hashable {
    /// Encoded as `"native"`.
    case native = "native"
    /// Encoded as `"ocr"`.
    case ocr = "ocr"
    /// Encoded as `"mixed"`.
    case mixed = "mixed"
}
extension ExtractionMethod {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.extractionMethodFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.ExtractionMethod {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.extractionMethodFromJson(payload ?? "null")
    }
}
/// Semantic structural classification of a text chunk.
///
/// Assigned by the heuristic classifier in `chunking::classifier`, which
/// defaults to `Unknown` when no rule matches. The set of cases is designed to
/// be extended in future versions without breaking changes.
///
/// Raw values are written out for every case so the snake_case strings used
/// when encoding stay visible next to the case names.
public enum ChunkType: String, Codable, Sendable, Hashable {
    /// Section heading or document title.
    case heading = "heading"
    /// Party list: names, addresses, and signatories.
    case partyList = "party_list"
    /// Definition clause ("X means", "X shall mean").
    case definitions = "definitions"
    /// Operative clause containing legal/contractual action verbs.
    case operativeClause = "operative_clause"
    /// Signature block with signatures, names, and dates.
    case signatureBlock = "signature_block"
    /// Schedule, annex, appendix, or exhibit section.
    case schedule = "schedule"
    /// Table-like content with aligned columns or repeated patterns.
    case tableLike = "table_like"
    /// Mathematical formula or equation.
    case formula = "formula"
    /// Code block or preformatted content.
    case codeBlock = "code_block"
    /// Embedded or referenced image content.
    case image = "image"
    /// Organizational chart or hierarchy diagram.
    case orgChart = "org_chart"
    /// Diagram, figure, or visual illustration.
    case diagram = "diagram"
    /// Unclassified or mixed content.
    case unknown = "unknown"
}
extension ChunkType {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.chunkTypeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.ChunkType {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.chunkTypeFromJson(payload ?? "null")
    }
}
/// Heuristic classification of what an image likely depicts.
///
/// Raw values are written out for every case so the snake_case strings used
/// when encoding stay visible next to the case names.
public enum ImageKind: String, Codable, Sendable, Hashable {
    /// Photographic image (natural scene, photograph).
    case photograph = "photograph"
    /// Technical or schematic diagram.
    case diagram = "diagram"
    /// Chart, graph, or plot.
    case chart = "chart"
    /// Freehand or technical drawing.
    case drawing = "drawing"
    /// Text-heavy image (scanned text, document).
    case textBlock = "text_block"
    /// Decorative element or border.
    case decoration = "decoration"
    /// Logo or brand mark.
    case logo = "logo"
    /// Small icon.
    case icon = "icon"
    /// Fragment of a larger tiled image (tile of a technical drawing).
    case tileFragment = "tile_fragment"
    /// Mask or transparency map.
    case mask = "mask"
    /// Full-page render produced during OCR preprocessing; used as a citation thumbnail.
    case pageRaster = "page_raster"
    /// Could not be classified with reasonable confidence.
    case unknown = "unknown"
}
extension ImageKind {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.imageKindFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.ImageKind {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.imageKindFromJson(payload ?? "null")
    }
}
/// Result-shape selection for extraction results.
///
/// Distinct from `OutputFormat` (which controls rendering - Plain, Markdown,
/// HTML, etc.). `ResultFormat` controls the *shape* of the result: a unified
/// content blob vs. an element-based decomposition.
///
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum ResultFormat: String, Codable, Sendable, Hashable {
    /// Unified format with all content in the `content` field.
    case unified = "unified"
    /// Element-based format with semantic element extraction.
    case elementBased = "element_based"
}
extension ResultFormat {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.resultFormatFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.ResultFormat {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.resultFormatFromJson(payload ?? "null")
    }
}
/// Semantic element type classification.
///
/// Categorizes text content into semantic units for downstream processing.
/// Supports the element types commonly found in Unstructured documents.
///
/// Raw values are written out for every case so the snake_case strings used
/// when encoding stay visible next to the case names.
public enum ElementType: String, Codable, Sendable, Hashable {
    /// Document title.
    case title = "title"
    /// Main narrative text body.
    case narrativeText = "narrative_text"
    /// Section heading.
    case heading = "heading"
    /// List item (bullet, numbered, etc.).
    case listItem = "list_item"
    /// Table element.
    case table = "table"
    /// Image element.
    case image = "image"
    /// Page break marker.
    case pageBreak = "page_break"
    /// Code block.
    case codeBlock = "code_block"
    /// Block quote.
    case blockQuote = "block_quote"
    /// Footer text.
    case footer = "footer"
    /// Header text.
    case header = "header"
}
extension ElementType {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.elementTypeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.ElementType {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.elementTypeFromJson(payload ?? "null")
    }
}
/// Format-specific metadata (discriminated union).
///
/// Only one format type can exist per extraction result. This provides
/// type-safe, clean metadata without nested optionals.
///
/// This is an alias for `RustBridge.FormatMetadata`: the bridge type is exposed
/// directly under this module's name instead of being mirrored by a Swift enum.
public typealias FormatMetadata = RustBridge.FormatMetadata
/// Text direction of an HTML document.
///
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum TextDirection: String, Codable, Sendable, Hashable {
    /// Left-to-right text direction.
    case leftToRight = "ltr"
    /// Right-to-left text direction.
    case rightToLeft = "rtl"
    /// Automatic text direction detection.
    case auto = "auto"
}
extension TextDirection {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.textDirectionFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.TextDirection {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.textDirectionFromJson(payload ?? "null")
    }
}
/// Classifies a link by its target.
///
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum LinkType: String, Codable, Sendable, Hashable {
    /// Anchor link (#section).
    case anchor = "anchor"
    /// Internal link (same domain). `internal` is a Swift keyword, hence the backticks.
    case `internal` = "internal"
    /// External link (different domain).
    case external = "external"
    /// Email link (mailto:).
    case email = "email"
    /// Phone link (tel:).
    case phone = "phone"
    /// Any other link type.
    case other = "other"
}
extension LinkType {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.linkTypeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.LinkType {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.linkTypeFromJson(payload ?? "null")
    }
}
/// Classifies an image by how it is referenced.
///
/// Raw values are written out for every case so the encoded strings
/// (some of them hyphenated) stay visible next to the case names.
public enum ImageType: String, Codable, Sendable, Hashable {
    /// Data URI image.
    case dataUri = "data-uri"
    /// Inline SVG.
    case inlineSvg = "inline-svg"
    /// External image URL.
    case external = "external"
    /// Relative path image.
    case relative = "relative"
}
extension ImageType {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.imageTypeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.ImageType {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.imageTypeFromJson(payload ?? "null")
    }
}
/// Classifies a block of structured data.
///
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum StructuredDataType: String, Codable, Sendable, Hashable {
    /// JSON-LD structured data.
    case jsonLd = "json-ld"
    /// Microdata.
    case microdata = "microdata"
    /// RDFa.
    case rdFa = "rdfa"
}
extension StructuredDataType {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.structuredDataTypeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.StructuredDataType {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.structuredDataTypeFromJson(payload ?? "null")
    }
}
/// Bounding geometry for an OCR element.
///
/// Supports both axis-aligned rectangles (from Tesseract) and 4-point
/// quadrilaterals (from PaddleOCR and rotated text detection).
///
/// The JSON form is a single object whose `type` field names the variant and
/// whose remaining fields hold that variant's payload.
public enum OcrBoundingGeometry: Codable, Sendable, Hashable {
    /// Axis-aligned bounding box (typical for Tesseract output).
    case rectangle(left: UInt32, top: UInt32, width: UInt32, height: UInt32)
    /// 4-point quadrilateral for rotated/skewed text (PaddleOCR).
    ///
    /// Points are in clockwise order starting from top-left:
    /// `[top_left, top_right, bottom_right, bottom_left]`
    ///
    /// NOTE(review): `points` is decoded and encoded as a single `String` here
    /// even though the description above is a list of four points - confirm
    /// the wire format against the Rust definition.
    case quadrilateral(points: String)

    /// JSON field names: the `type` discriminator plus payload fields.
    private enum CodingKeys: String, CodingKey {
        case type
        case height
        case left
        case points
        case top
        case width
    }

    /// Decodes a geometry by reading `type` and then the fields of the variant it names.
    ///
    /// - Throws: `DecodingError` when `type` or a payload field cannot be decoded,
    ///   or when `type` names no known variant.
    public init(from decoder: Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        let tag = try values.decode(String.self, forKey: .type)
        switch tag {
        case "rectangle":
            let left = try values.decode(UInt32.self, forKey: .left)
            let top = try values.decode(UInt32.self, forKey: .top)
            let width = try values.decode(UInt32.self, forKey: .width)
            let height = try values.decode(UInt32.self, forKey: .height)
            self = .rectangle(left: left, top: top, width: width, height: height)
        case "quadrilateral":
            let points = try values.decode(String.self, forKey: .points)
            self = .quadrilateral(points: points)
        default:
            throw DecodingError.dataCorruptedError(
                forKey: .type,
                in: values,
                debugDescription: "Unknown OcrBoundingGeometry type: \(tag)"
            )
        }
    }

    /// Encodes the geometry as a keyed object: `type` first, then the payload fields.
    ///
    /// - Throws: Any error raised by the encoder's keyed container.
    public func encode(to encoder: Encoder) throws {
        var values = encoder.container(keyedBy: CodingKeys.self)
        switch self {
        case let .rectangle(left, top, width, height):
            try values.encode("rectangle", forKey: .type)
            try values.encode(left, forKey: .left)
            try values.encode(top, forKey: .top)
            try values.encode(width, forKey: .width)
            try values.encode(height, forKey: .height)
        case let .quadrilateral(points):
            try values.encode("quadrilateral", forKey: .type)
            try values.encode(points, forKey: .points)
        }
    }
}
extension OcrBoundingGeometry {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.ocrBoundingGeometryFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.OcrBoundingGeometry {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.ocrBoundingGeometryFromJson(payload ?? "null")
    }
}
/// Hierarchical level of an OCR element.
///
/// Maps to Tesseract's page segmentation hierarchy and provides equivalent
/// semantics for PaddleOCR.
///
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum OcrElementLevel: String, Codable, Sendable, Hashable {
    /// Individual word.
    case word = "word"
    /// Line of text (default for PaddleOCR).
    case line = "line"
    /// Paragraph or text block.
    case block = "block"
    /// Page-level element.
    case page = "page"
}
extension OcrElementLevel {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.ocrElementLevelFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.OcrElementLevel {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.ocrElementLevelFromJson(payload ?? "null")
    }
}
/// Type of paginated unit in a document.
///
/// Distinguishes between the different things that count as a "page":
/// PDF pages, presentation slides, and spreadsheet sheets.
///
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum PageUnitType: String, Codable, Sendable, Hashable {
    /// Standard document pages (PDF, DOCX, images).
    case page = "page"
    /// Presentation slides (PPTX, ODP).
    case slide = "slide"
    /// Spreadsheet sheets (XLSX, ODS).
    case sheet = "sheet"
}
extension PageUnitType {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.pageUnitTypeFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.PageUnitType {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.pageUnitTypeFromJson(payload ?? "null")
    }
}
/// A single line in a unified-diff hunk.
///
/// Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
/// reference it unconditionally, without requiring the `diff` Cargo feature.
/// `crate::diff` re-exports this type verbatim.
///
/// The JSON form is an object with a `kind` field naming the variant and a
/// `_0` field holding the line text.
public enum DiffLine: Codable, Sendable, Hashable {
    /// Unchanged context line.
    case context(field0: String)
    /// Line added in the "after" version.
    case added(field0: String)
    /// Line removed from the "before" version.
    case removed(field0: String)

    /// JSON field names: the `kind` discriminator and the `_0` payload.
    private enum CodingKeys: String, CodingKey {
        case kind
        case field0 = "_0"
    }

    /// Decodes a diff line by reading `kind` and then `_0`.
    ///
    /// `_0` is only read once `kind` has matched a known variant, so an unknown
    /// `kind` is reported before any missing payload.
    ///
    /// - Throws: `DecodingError` when `kind` or `_0` cannot be decoded, or when
    ///   `kind` names no known variant.
    public init(from decoder: Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        let tag = try values.decode(String.self, forKey: .kind)
        switch tag {
        case "context":
            let line = try values.decode(String.self, forKey: .field0)
            self = .context(field0: line)
        case "added":
            let line = try values.decode(String.self, forKey: .field0)
            self = .added(field0: line)
        case "removed":
            let line = try values.decode(String.self, forKey: .field0)
            self = .removed(field0: line)
        default:
            throw DecodingError.dataCorruptedError(
                forKey: .kind,
                in: values,
                debugDescription: "Unknown DiffLine type: \(tag)"
            )
        }
    }

    /// Encodes the diff line as a keyed object: `kind` first, then `_0`.
    ///
    /// - Throws: Any error raised by the encoder's keyed container.
    public func encode(to encoder: Encoder) throws {
        // Every variant has the same shape, so resolve the tag and text first
        // and write them in one place.
        let tag: String
        let line: String
        switch self {
        case let .context(text):
            tag = "context"
            line = text
        case let .added(text):
            tag = "added"
            line = text
        case let .removed(text):
            tag = "removed"
            line = text
        }
        var values = encoder.container(keyedBy: CodingKeys.self)
        try values.encode(tag, forKey: .kind)
        try values.encode(line, forKey: .field0)
    }
}
extension DiffLine {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.diffLineFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.DiffLine {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.diffLineFromJson(payload ?? "null")
    }
}
/// Semantic classification of a tracked change.
///
/// Raw values are written out for every case so the encoded strings stay
/// visible next to the case names.
public enum RevisionKind: String, Codable, Sendable, Hashable {
    /// Text or content was inserted.
    case insertion = "insertion"
    /// Text or content was deleted.
    case deletion = "deletion"
    /// Run-level formatting (font, size, colour, ...) was changed.
    case formatChange = "format_change"
    /// A reviewer comment or annotation.
    case comment = "comment"
}
extension RevisionKind {
    /// Converts this value into its `RustBridge` counterpart via a JSON round-trip.
    ///
    /// - Returns: The value produced by `RustBridge.revisionKindFromJson`.
    /// - Throws: Errors from `JSONEncoder.encode` or from the bridge call.
    func intoRust() throws -> RustBridge.RevisionKind {
        let encoded = try JSONEncoder().encode(self)
        // If the bytes cannot be read as UTF-8, the JSON literal "null" is sent instead.
        let payload = String(data: encoded, encoding: .utf8)
        return try RustBridge.revisionKindFromJson(payload ?? "null")
    }
}
/// Best-effort document location for a revision.
///
/// The JSON form is a single object whose `type` field names the variant and
/// whose remaining fields hold that variant's payload.
public enum RevisionAnchor: Codable, Sendable, Hashable {
    /// Body paragraph, identified by its zero-based index in the document flow.
    case paragraph(index: UInt)
    /// Cell inside a table.
    case tableCell(row: UInt, col: UInt, tableIndex: UInt)
    /// Page, identified by its zero-based index.
    case page(index: UInt)
    /// Presentation slide, identified by its zero-based index.
    case slide(index: UInt)
    /// Spreadsheet cell or range, identified by sheet index and optional name.
    case sheet(index: UInt, name: String?)

    /// JSON field names: the `type` discriminator plus payload fields.
    private enum CodingKeys: String, CodingKey {
        case type
        case col
        case index
        case name
        case row
        case tableIndex = "table_index"
    }

    /// Decodes an anchor by reading `type` and then the fields of the variant it names.
    ///
    /// - Throws: `DecodingError` when `type` or a required field cannot be decoded,
    ///   or when `type` names no known variant.
    public init(from decoder: Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        let tag = try values.decode(String.self, forKey: .type)
        switch tag {
        case "paragraph":
            let index = try values.decode(UInt.self, forKey: .index)
            self = .paragraph(index: index)
        case "table_cell":
            let row = try values.decode(UInt.self, forKey: .row)
            let col = try values.decode(UInt.self, forKey: .col)
            let tableIndex = try values.decode(UInt.self, forKey: .tableIndex)
            self = .tableCell(row: row, col: col, tableIndex: tableIndex)
        case "page":
            let index = try values.decode(UInt.self, forKey: .index)
            self = .page(index: index)
        case "slide":
            let index = try values.decode(UInt.self, forKey: .index)
            self = .slide(index: index)
        case "sheet":
            let index = try values.decode(UInt.self, forKey: .index)
            let name = try values.decodeIfPresent(String.self, forKey: .name)
            self = .sheet(index: index, name: name)
        default:
            throw DecodingError.dataCorruptedError(
                forKey: .type,
                in: values,
                debugDescription: "Unknown RevisionAnchor type: \(tag)"
            )
        }
    }

    /// Encodes the anchor as a keyed object: `type` first, then the payload
    /// fields. A `nil` sheet name is left out of the output.
    ///
    /// - Throws: Any error raised by the encoder's keyed container.
    public func encode(to encoder: Encoder) throws {
        var values = encoder.container(keyedBy: CodingKeys.self)
        switch self {
        case let .paragraph(index):
            try values.encode("paragraph", forKey: .type)
            try values.encode(index, forKey: .index)
        case let .tableCell(row, col, tableIndex):
            try values.encode("table_cell", forKey: .type)
            try values.encode(row, forKey: .row)
            try values.encode(col, forKey: .col)
            try values.encode(tableIndex, forKey: .tableIndex)
        case let .page(index):
            try values.encode("page", forKey: .type)
            try values.encode(index, forKey: .index)
        case let .slide(index):
            try values.encode("slide", forKey: .type)
            try values.encode(index, forKey: .index)
        case let .sheet(index, name):
            try values.encode("sheet", forKey: .type)
            try values.encode(index, forKey: .index)
            try values.encodeIfPresent(name, forKey: .name)
        }
    }
}
extension RevisionAnchor {
    /// Converts this value to `RustBridge.RevisionAnchor` by JSON-encoding it and
    /// passing the JSON text to `RustBridge.revisionAnchorFromJson`.
    /// - Throws: Errors from `JSONEncoder` or from the bridge call.
    func intoRust() throws -> RustBridge.RevisionAnchor {
        let encoded = try JSONEncoder().encode(self)
        // If the encoded bytes cannot be read as UTF-8, send the JSON literal `null`.
        guard let text = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.revisionAnchorFromJson("null")
        }
        return try RustBridge.revisionAnchorFromJson(text)
    }
}
/// Semantic classification of an extracted URI.
///
/// Each case's raw value is the string used when the value is encoded or decoded.
public enum UriKind: String, Codable, Sendable, Hashable {
    /// A clickable hyperlink (web URL, file link).
    case hyperlink = "hyperlink"
    /// An image or media resource reference.
    case image = "image"
    /// An internal anchor or cross-reference target.
    case anchor = "anchor"
    /// A citation or bibliographic reference (DOI, academic ref).
    case citation = "citation"
    /// A general reference (e.g. `\ref{}` in LaTeX, `:ref:` in RST).
    case reference = "reference"
    /// An email address (`mailto:` link or bare email).
    case email = "email"
}
extension UriKind {
    /// Converts this value to `RustBridge.UriKind` by JSON-encoding it and
    /// passing the JSON text to `RustBridge.uriKindFromJson`.
    /// - Throws: Errors from `JSONEncoder` or from the bridge call.
    func intoRust() throws -> RustBridge.UriKind {
        let encoded = try JSONEncoder().encode(self)
        // If the encoded bytes cannot be read as UTF-8, send the JSON literal `null`.
        guard let text = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.uriKindFromJson("null")
        }
        return try RustBridge.uriKindFromJson(text)
    }
}
/// Keyword algorithm selection.
///
/// Each case's raw value is the string used when the value is encoded or decoded.
public enum KeywordAlgorithm: String, Codable, Sendable, Hashable {
    /// YAKE (Yet Another Keyword Extractor) - statistical approach
    case yake = "yake"
    /// RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based
    case rake = "rake"
}
extension KeywordAlgorithm {
    /// Converts this value to `RustBridge.KeywordAlgorithm` by JSON-encoding it and
    /// passing the JSON text to `RustBridge.keywordAlgorithmFromJson`.
    /// - Throws: Errors from `JSONEncoder` or from the bridge call.
    func intoRust() throws -> RustBridge.KeywordAlgorithm {
        let encoded = try JSONEncoder().encode(self)
        // If the encoded bytes cannot be read as UTF-8, send the JSON literal `null`.
        guard let text = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.keywordAlgorithmFromJson("null")
        }
        return try RustBridge.keywordAlgorithmFromJson(text)
    }
}
/// Page Segmentation Mode for Tesseract OCR
///
/// Each case's raw value is the PascalCase string used when the value is
/// encoded or decoded.
// NOTE(review): the case names appear to mirror Tesseract's `--psm` modes;
// confirm the exact mode each one selects against the Rust definition.
public enum PSMMode: String, Codable, Sendable, Hashable {
case osdOnly = "OsdOnly"
case autoOsd = "AutoOsd"
case autoOnly = "AutoOnly"
case auto = "Auto"
case singleColumn = "SingleColumn"
case singleBlockVertical = "SingleBlockVertical"
case singleBlock = "SingleBlock"
case singleLine = "SingleLine"
case singleWord = "SingleWord"
case circleWord = "CircleWord"
case singleChar = "SingleChar"
}
extension PSMMode {
    /// Converts this value to `RustBridge.PSMMode` by JSON-encoding it and
    /// passing the JSON text to `RustBridge.psmModeFromJson`.
    /// - Throws: Errors from `JSONEncoder` or from the bridge call.
    func intoRust() throws -> RustBridge.PSMMode {
        let encoded = try JSONEncoder().encode(self)
        // If the encoded bytes cannot be read as UTF-8, send the JSON literal `null`.
        guard let text = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.psmModeFromJson("null")
        }
        return try RustBridge.psmModeFromJson(text)
    }
}
/// Supported languages in PaddleOCR.
///
/// Maps user-friendly language codes to paddle-ocr-rs language identifiers.
///
/// Each case's raw value is the PascalCase string used when the value is
/// encoded or decoded.
public enum PaddleLanguage: String, Codable, Sendable, Hashable {
/// English
case english = "English"
/// Simplified Chinese
case chinese = "Chinese"
/// Japanese
case japanese = "Japanese"
/// Korean
case korean = "Korean"
/// German
case german = "German"
/// French
case french = "French"
/// Latin script (covers most European languages)
case latin = "Latin"
/// Cyrillic (Russian and related)
case cyrillic = "Cyrillic"
/// Traditional Chinese
case traditionalChinese = "TraditionalChinese"
/// Thai
case thai = "Thai"
/// Greek
case greek = "Greek"
/// East Slavic (Russian, Ukrainian, Belarusian)
case eastSlavic = "EastSlavic"
/// Arabic (Arabic, Persian, Urdu)
case arabic = "Arabic"
/// Devanagari (Hindi, Marathi, Sanskrit, Nepali)
case devanagari = "Devanagari"
/// Tamil
case tamil = "Tamil"
/// Telugu
case telugu = "Telugu"
}
extension PaddleLanguage {
    /// Converts this value to `RustBridge.PaddleLanguage` by JSON-encoding it and
    /// passing the JSON text to `RustBridge.paddleLanguageFromJson`.
    /// - Throws: Errors from `JSONEncoder` or from the bridge call.
    func intoRust() throws -> RustBridge.PaddleLanguage {
        let encoded = try JSONEncoder().encode(self)
        // If the encoded bytes cannot be read as UTF-8, send the JSON literal `null`.
        guard let text = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.paddleLanguageFromJson("null")
        }
        return try RustBridge.paddleLanguageFromJson(text)
    }
}
/// The 17 canonical document layout classes.
///
/// All model backends (RT-DETR, YOLO, etc.) map their native class IDs
/// to this shared set. Models with fewer classes (DocLayNet: 11, PubLayNet: 5)
/// map to the closest equivalent.
///
/// Wire format is snake_case in all serializers (JSON, TOML, YAML); every
/// case below spells out its wire string as the raw value.
public enum LayoutClass: String, Codable, Sendable, Hashable {
    case caption = "caption"
    case footnote = "footnote"
    case formula = "formula"
    case listItem = "list_item"
    case pageFooter = "page_footer"
    case pageHeader = "page_header"
    case picture = "picture"
    case sectionHeader = "section_header"
    case table = "table"
    case text = "text"
    case title = "title"
    case documentIndex = "document_index"
    case code = "code"
    case checkboxSelected = "checkbox_selected"
    case checkboxUnselected = "checkbox_unselected"
    case form = "form"
    case keyValueRegion = "key_value_region"
}
extension LayoutClass {
    /// Converts this value to `RustBridge.LayoutClass` by JSON-encoding it and
    /// passing the JSON text to `RustBridge.layoutClassFromJson`.
    /// - Throws: Errors from `JSONEncoder` or from the bridge call.
    func intoRust() throws -> RustBridge.LayoutClass {
        let encoded = try JSONEncoder().encode(self)
        // If the encoded bytes cannot be read as UTF-8, send the JSON literal `null`.
        guard let text = String(data: encoded, encoding: .utf8) else {
            return try RustBridge.layoutClassFromJson("null")
        }
        return try RustBridge.layoutClassFromJson(text)
    }
}
/// Main error type for all Kreuzberg operations.
///
/// All errors in Kreuzberg use this enum, which preserves error chains
/// and provides context for debugging.
///
/// # Variants
///
/// - `Io` - File system and I/O errors (always bubble up)
/// - `Parsing` - Document parsing errors (corrupt files, unsupported features)
/// - `Ocr` - OCR processing errors
/// - `Validation` - Input validation errors (invalid paths, config, parameters)
/// - `Cache` - Cache operation errors (non-fatal, can be ignored)
/// - `ImageProcessing` - Image manipulation errors
/// - `Serialization` - JSON/MessagePack serialization errors
/// - `MissingDependency` - Missing optional dependencies (tesseract, etc.)
/// - `Plugin` - Plugin-specific errors
/// - `LockPoisoned` - Mutex/RwLock poisoning (should not happen in normal operation)
/// - `UnsupportedFormat` - Unsupported MIME type or file format
/// - `Other` - Catch-all for uncommon errors
///
/// The Swift enum also declares `embedding`, `timeout` (carrying `elapsedMs`
/// and `limitMs`), `cancelled` and `security`, which the list above does not
/// describe.
// NOTE(review): the meaning of the unlabelled `field0` payloads is not visible
// in this file — confirm against the Rust error definition.
public enum KreuzbergError: Swift.Error {
case io(message: String, field0: String)
case parsing(message: String)
case ocr(message: String)
case validation(message: String)
case cache(message: String)
case imageProcessing(message: String)
case serialization(message: String)
case missingDependency(message: String, field0: String)
case plugin(message: String, pluginName: String)
case lockPoisoned(message: String, field0: String)
case unsupportedFormat(message: String, field0: String)
case embedding(message: String)
case timeout(message: String, elapsedMs: UInt64, limitMs: UInt64)
case cancelled
case security(message: String)
case other(message: String, field0: String)
}
// MARK: - Convenience Wrapper Functions
// These wrappers bridge String / [UInt8] inputs to RustBridge's
// RustVec<UInt8> requirement. The config parameter must be a fully
// constructed opaque type (built via the generated initializer);
// JSON-config decoding is not available because swift-bridge opaque
// proxy classes are not Codable Swift structs.
/// Builds a `RustVec<UInt8>` holding the same bytes, in the same order, as `bytes`.
///
/// swift-bridge's `RustVec<T>` runtime only exposes `init()` and `push(value:)`,
/// so the bytes are appended one at a time; there is no array-initializer shorthand.
private func makeByteVec(_ bytes: [UInt8]) -> RustVec<UInt8> {
    let buffer = RustVec<UInt8>()
    bytes.forEach { byte in
        buffer.push(value: byte)
    }
    return buffer
}
/// Convenience overload: accepts a `String` and extracts from its UTF-8 bytes.
///
/// The bytes are copied into a `RustVec<UInt8>` and passed to `extractBytesSync`
/// together with `mimeType` and `config`.
/// - Throws: Whatever `extractBytesSync` throws.
public func extractBytes(
    content: String,
    mimeType: String,
    config: ExtractionConfig
) throws -> ExtractionResult {
    let utf8Bytes = Array(content.utf8)
    let payload = makeByteVec(utf8Bytes)
    return try extractBytesSync(payload, mimeType, config)
}
/// Convenience overload: accepts the document as a `[UInt8]` byte array.
///
/// The bytes are copied into a `RustVec<UInt8>` and passed to `extractBytesSync`
/// together with `mimeType` and `config`.
/// - Throws: Whatever `extractBytesSync` throws.
public func extractBytes(
    content: [UInt8],
    mimeType: String,
    config: ExtractionConfig
) throws -> ExtractionResult {
    let payload = makeByteVec(content)
    return try extractBytesSync(payload, mimeType, config)
}
/// Convenience overload: accepts a file path as a `String`.
///
/// Forwards `path`, `mimeType` (nil unless supplied) and `config` to `extractFileSync`.
/// - Throws: Whatever `extractFileSync` throws.
public func extractFile(
    path: String,
    mimeType: String? = nil,
    config: ExtractionConfig
) throws -> ExtractionResult {
    try extractFileSync(path, mimeType, config)
}
// MARK: - JSON-String Convenience Overloads
// These overloads accept JSON-encoded config parameters and decode them automatically.
// Enables e2e tests to pass JSON strings directly without typed config construction.
/// Resolves a string argument as either a file path or literal UTF-8 content.
///
/// The string is first tried as a path exactly as given, then relative to each
/// search root: the current working directory, the `ALEF_TEST_DOCUMENTS_DIR`
/// environment variable (when set), and the `test_documents/` and `fixtures/`
/// folders of the working directory and its ancestors (at most 16 levels).
/// The first candidate that exists and can be read wins. If none can be read,
/// the UTF-8 bytes of the string itself are returned.
private func _loadBytesFromPathOrUtf8(_ pathOrContent: String) throws -> [UInt8] {
    let fileManager = FileManager.default
    let workingDirectory = fileManager.currentDirectoryPath

    var searchRoots = [workingDirectory]
    if let documentsDir = ProcessInfo.processInfo.environment["ALEF_TEST_DOCUMENTS_DIR"] {
        searchRoots.append(documentsDir)
    }

    // Walk upwards from the working directory, stopping at the filesystem root.
    var current = URL(fileURLWithPath: workingDirectory)
    var level = 0
    while level < 16 {
        for folder in ["test_documents", "fixtures"] {
            searchRoots.append(current.appendingPathComponent(folder).path)
        }
        let parent = current.deletingLastPathComponent()
        guard parent.path != current.path else { break }
        current = parent
        level += 1
    }

    var candidatePaths = [pathOrContent]
    for root in searchRoots {
        candidatePaths.append((root as NSString).appendingPathComponent(pathOrContent))
    }

    // Entries that exist but cannot be read (e.g. directories) are skipped.
    for candidate in candidatePaths where fileManager.fileExists(atPath: candidate) {
        if let contents = try? Data(contentsOf: URL(fileURLWithPath: candidate)) {
            return [UInt8](contents)
        }
    }
    return Array(pathOrContent.utf8)
}
/// JSON-config overload: decodes `configJson` with `extractionConfigFromJson`,
/// then awaits `extractBytes(content:mimeType:config:)`.
/// - Throws: Errors from config decoding or from the extraction call.
public func extractBytes(_ content: [UInt8], _ mimeType: String, _ configJson: String) async throws -> ExtractionResult {
    let decodedConfig = try extractionConfigFromJson(configJson)
    return try await extractBytes(content: content, mimeType: mimeType, config: decodedConfig)
}
/// JSON-config overload: decodes `configJson` with `extractionConfigFromJson`,
/// then awaits `extractFile(path:mimeType:config:)`.
/// - Throws: Errors from config decoding or from the extraction call.
public func extractFile(_ path: String, _ mimeType: String?, _ configJson: String) async throws -> ExtractionResult {
    let decodedConfig = try extractionConfigFromJson(configJson)
    return try await extractFile(path: path, mimeType: mimeType, config: decodedConfig)
}
/// JSON-config overload: decodes `configJson` with `extractionConfigFromJson`,
/// then calls `extractFileSync(path:mimeType:config:)`.
/// - Throws: Errors from config decoding or from the extraction call.
public func extractFileSync(_ path: String, _ mimeType: String?, _ configJson: String) throws -> ExtractionResult {
    let decodedConfig = try extractionConfigFromJson(configJson)
    return try extractFileSync(path: path, mimeType: mimeType, config: decodedConfig)
}
/// JSON-config overload: decodes `configJson` with `extractionConfigFromJson`,
/// then calls `extractBytesSync(content:mimeType:config:)`.
/// - Throws: Errors from config decoding or from the extraction call.
public func extractBytesSync(_ content: [UInt8], _ mimeType: String, _ configJson: String) throws -> ExtractionResult {
    let decodedConfig = try extractionConfigFromJson(configJson)
    return try extractBytesSync(content: content, mimeType: mimeType, config: decodedConfig)
}
/// JSON-config overload: decodes `configJson` with `extractionConfigFromJson`,
/// then calls `batchExtractFilesSync(items:config:)`.
/// - Throws: Errors from config decoding or from the batch call.
public func batchExtractFilesSync(_ items: [BatchFileItem], _ configJson: String) throws -> [ExtractionResult] {
    let decodedConfig = try extractionConfigFromJson(configJson)
    return try batchExtractFilesSync(items: items, config: decodedConfig)
}
/// JSON-config overload: decodes `configJson` with `extractionConfigFromJson`,
/// then calls `batchExtractBytesSync(items:config:)`.
/// - Throws: Errors from config decoding or from the batch call.
public func batchExtractBytesSync(_ items: [BatchBytesItem], _ configJson: String) throws -> [ExtractionResult] {
    let decodedConfig = try extractionConfigFromJson(configJson)
    return try batchExtractBytesSync(items: items, config: decodedConfig)
}
/// JSON-config overload: decodes `configJson` with `extractionConfigFromJson`,
/// then awaits `batchExtractFiles(items:config:)`.
/// - Throws: Errors from config decoding or from the batch call.
public func batchExtractFiles(_ items: [BatchFileItem], _ configJson: String) async throws -> [ExtractionResult] {
    let decodedConfig = try extractionConfigFromJson(configJson)
    return try await batchExtractFiles(items: items, config: decodedConfig)
}
/// JSON-config overload: decodes `configJson` with `extractionConfigFromJson`,
/// then awaits `batchExtractBytes(items:config:)`.
/// - Throws: Errors from config decoding or from the batch call.
public func batchExtractBytes(_ items: [BatchBytesItem], _ configJson: String) async throws -> [ExtractionResult] {
    let decodedConfig = try extractionConfigFromJson(configJson)
    return try await batchExtractBytes(items: items, config: decodedConfig)
}
/// JSON overload: the first argument is a JSON-encoded `ExtractionResult`, decoded
/// with `extractionResultFromJson` and passed as `a` to `compare(a:b:opts:)`.
/// - Throws: Errors from decoding or from the comparison call.
public func compare(_ configJson: String, _ b: ExtractionResult, _ opts: DiffOptions) throws -> ExtractionDiff {
    let decodedLeft = try extractionResultFromJson(configJson)
    return try compare(a: decodedLeft, b: b, opts: opts)
}
/// JSON overload: the second argument is a JSON-encoded `ExtractionResult`, decoded
/// with `extractionResultFromJson` and passed as `b` to `compare(a:b:opts:)`.
/// - Throws: Errors from decoding or from the comparison call.
public func compare(_ a: ExtractionResult, _ configJson: String, _ opts: DiffOptions) throws -> ExtractionDiff {
    let decodedRight = try extractionResultFromJson(configJson)
    return try compare(a: a, b: decodedRight, opts: opts)
}
/// JSON overload: the third argument is JSON-encoded `DiffOptions`, decoded with
/// `diffOptionsFromJson` and passed as `opts` to `compare(a:b:opts:)`.
/// - Throws: Errors from decoding or from the comparison call.
public func compare(_ a: ExtractionResult, _ b: ExtractionResult, _ configJson: String) throws -> ExtractionDiff {
    let decodedOptions = try diffOptionsFromJson(configJson)
    return try compare(a: a, b: b, opts: decodedOptions)
}
/// JSON-config overload: decodes `configJson` with `embeddingConfigFromJson`,
/// then awaits `embedTextsAsync(texts:config:)`.
/// - Throws: Errors from config decoding or from the embedding call.
public func embedTextsAsync(_ texts: [String], _ configJson: String) async throws -> [[Float]] {
    let decodedConfig = try embeddingConfigFromJson(configJson)
    return try await embedTextsAsync(texts: texts, config: decodedConfig)
}
/// JSON-config overload: decodes `configJson` with `embeddingConfigFromJson`,
/// then calls `embedTexts(texts:config:)`.
/// - Throws: Errors from config decoding or from the embedding call.
public func embedTexts(_ texts: [String], _ configJson: String) throws -> [[Float]] {
    let decodedConfig = try embeddingConfigFromJson(configJson)
    return try embedTexts(texts: texts, config: decodedConfig)
}
// MARK: - From-JSON Helpers
// Public helpers that decode JSON into first-class Swift types.
// First-class struct types (Codable) use JSONDecoder directly.
// Opaque RustBridge types forward to RustBridge.
/// Decodes `json` into a `CacheStats` with `JSONDecoder`; rethrows any decoding error.
public func cacheStatsFromJson(_ json: String) throws -> CacheStats {
    try JSONDecoder().decode(CacheStats.self, from: Data(json.utf8))
}
/// Decodes `json` into an `AccelerationConfig` with `JSONDecoder`; rethrows any decoding error.
public func accelerationConfigFromJson(_ json: String) throws -> AccelerationConfig {
    try JSONDecoder().decode(AccelerationConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into a `ContentFilterConfig` with `JSONDecoder`; rethrows any decoding error.
public func contentFilterConfigFromJson(_ json: String) throws -> ContentFilterConfig {
    try JSONDecoder().decode(ContentFilterConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into an `EmailConfig` with `JSONDecoder`; rethrows any decoding error.
public func emailConfigFromJson(_ json: String) throws -> EmailConfig {
    try JSONDecoder().decode(EmailConfig.self, from: Data(json.utf8))
}
/// Parses `json` into an `ExtractionConfig` via `RustBridge.extractionConfigFromJson`; rethrows its errors.
public func extractionConfigFromJson(_ json: String) throws -> ExtractionConfig {
    try RustBridge.extractionConfigFromJson(json)
}
/// Parses `json` into a `FileExtractionConfig` via `RustBridge.fileExtractionConfigFromJson`; rethrows its errors.
public func fileExtractionConfigFromJson(_ json: String) throws -> FileExtractionConfig {
    try RustBridge.fileExtractionConfigFromJson(json)
}
/// Parses `json` into a `BatchBytesItem` via `RustBridge.batchBytesItemFromJson`; rethrows its errors.
public func batchBytesItemFromJson(_ json: String) throws -> BatchBytesItem {
    try RustBridge.batchBytesItemFromJson(json)
}
/// Parses `json` into a `BatchFileItem` via `RustBridge.batchFileItemFromJson`; rethrows its errors.
public func batchFileItemFromJson(_ json: String) throws -> BatchFileItem {
    try RustBridge.batchFileItemFromJson(json)
}
/// Decodes `json` into an `ImageExtractionConfig` with `JSONDecoder`; rethrows any decoding error.
public func imageExtractionConfigFromJson(_ json: String) throws -> ImageExtractionConfig {
    try JSONDecoder().decode(ImageExtractionConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into a `TokenReductionOptions` with `JSONDecoder`; rethrows any decoding error.
public func tokenReductionOptionsFromJson(_ json: String) throws -> TokenReductionOptions {
    try JSONDecoder().decode(TokenReductionOptions.self, from: Data(json.utf8))
}
/// Decodes `json` into a `LanguageDetectionConfig` with `JSONDecoder`; rethrows any decoding error.
public func languageDetectionConfigFromJson(_ json: String) throws -> LanguageDetectionConfig {
    try JSONDecoder().decode(LanguageDetectionConfig.self, from: Data(json.utf8))
}
/// Parses `json` into an `HtmlOutputConfig` via `RustBridge.htmlOutputConfigFromJson`; rethrows its errors.
public func htmlOutputConfigFromJson(_ json: String) throws -> HtmlOutputConfig {
    try RustBridge.htmlOutputConfigFromJson(json)
}
/// Decodes `json` into a `LayoutDetectionConfig` with `JSONDecoder`; rethrows any decoding error.
public func layoutDetectionConfigFromJson(_ json: String) throws -> LayoutDetectionConfig {
    try JSONDecoder().decode(LayoutDetectionConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into an `LlmConfig` with `JSONDecoder`; rethrows any decoding error.
public func llmConfigFromJson(_ json: String) throws -> LlmConfig {
    try JSONDecoder().decode(LlmConfig.self, from: Data(json.utf8))
}
/// Parses `json` into a `StructuredExtractionConfig` via `RustBridge.structuredExtractionConfigFromJson`; rethrows its errors.
public func structuredExtractionConfigFromJson(_ json: String) throws -> StructuredExtractionConfig {
    try RustBridge.structuredExtractionConfigFromJson(json)
}
/// Decodes `json` into an `OcrQualityThresholds` with `JSONDecoder`; rethrows any decoding error.
public func ocrQualityThresholdsFromJson(_ json: String) throws -> OcrQualityThresholds {
    try JSONDecoder().decode(OcrQualityThresholds.self, from: Data(json.utf8))
}
/// Parses `json` into an `OcrPipelineStage` via `RustBridge.ocrPipelineStageFromJson`; rethrows its errors.
public func ocrPipelineStageFromJson(_ json: String) throws -> OcrPipelineStage {
    try RustBridge.ocrPipelineStageFromJson(json)
}
/// Parses `json` into an `OcrPipelineConfig` via `RustBridge.ocrPipelineConfigFromJson`; rethrows its errors.
public func ocrPipelineConfigFromJson(_ json: String) throws -> OcrPipelineConfig {
    try RustBridge.ocrPipelineConfigFromJson(json)
}
/// Parses `json` into an `OcrConfig` via `RustBridge.ocrConfigFromJson`; rethrows its errors.
public func ocrConfigFromJson(_ json: String) throws -> OcrConfig {
    try RustBridge.ocrConfigFromJson(json)
}
/// Decodes `json` into a `PageConfig` with `JSONDecoder`; rethrows any decoding error.
public func pageConfigFromJson(_ json: String) throws -> PageConfig {
    try JSONDecoder().decode(PageConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into a `PdfConfig` with `JSONDecoder`; rethrows any decoding error.
public func pdfConfigFromJson(_ json: String) throws -> PdfConfig {
    try JSONDecoder().decode(PdfConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into a `HierarchyConfig` with `JSONDecoder`; rethrows any decoding error.
public func hierarchyConfigFromJson(_ json: String) throws -> HierarchyConfig {
    try JSONDecoder().decode(HierarchyConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into a `PostProcessorConfig` with `JSONDecoder`; rethrows any decoding error.
public func postProcessorConfigFromJson(_ json: String) throws -> PostProcessorConfig {
    try JSONDecoder().decode(PostProcessorConfig.self, from: Data(json.utf8))
}
/// Parses `json` into a `ChunkingConfig` via `RustBridge.chunkingConfigFromJson`; rethrows its errors.
public func chunkingConfigFromJson(_ json: String) throws -> ChunkingConfig {
    try RustBridge.chunkingConfigFromJson(json)
}
/// Parses `json` into an `EmbeddingConfig` via `RustBridge.embeddingConfigFromJson`; rethrows its errors.
public func embeddingConfigFromJson(_ json: String) throws -> EmbeddingConfig {
    try RustBridge.embeddingConfigFromJson(json)
}
/// Parses `json` into a `TreeSitterConfig` via `RustBridge.treeSitterConfigFromJson`; rethrows its errors.
public func treeSitterConfigFromJson(_ json: String) throws -> TreeSitterConfig {
    try RustBridge.treeSitterConfigFromJson(json)
}
/// Decodes `json` into a `TreeSitterProcessConfig` with `JSONDecoder`; rethrows any decoding error.
public func treeSitterProcessConfigFromJson(_ json: String) throws -> TreeSitterProcessConfig {
    try JSONDecoder().decode(TreeSitterProcessConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into a `SupportedFormat` with `JSONDecoder`; rethrows any decoding error.
public func supportedFormatFromJson(_ json: String) throws -> SupportedFormat {
    try JSONDecoder().decode(SupportedFormat.self, from: Data(json.utf8))
}
/// Decodes `json` into a `ServerConfig` with `JSONDecoder`; rethrows any decoding error.
public func serverConfigFromJson(_ json: String) throws -> ServerConfig {
    try JSONDecoder().decode(ServerConfig.self, from: Data(json.utf8))
}
/// Parses `json` into a `StructuredDataResult` via `RustBridge.structuredDataResultFromJson`; rethrows its errors.
public func structuredDataResultFromJson(_ json: String) throws -> StructuredDataResult {
    try RustBridge.structuredDataResultFromJson(json)
}
/// Decodes `json` into a `DocxAppProperties` with `JSONDecoder`; rethrows any decoding error.
public func docxAppPropertiesFromJson(_ json: String) throws -> DocxAppProperties {
    try JSONDecoder().decode(DocxAppProperties.self, from: Data(json.utf8))
}
/// Decodes `json` into an `XlsxAppProperties` with `JSONDecoder`; rethrows any decoding error.
public func xlsxAppPropertiesFromJson(_ json: String) throws -> XlsxAppProperties {
    try JSONDecoder().decode(XlsxAppProperties.self, from: Data(json.utf8))
}
/// Decodes `json` into a `PptxAppProperties` with `JSONDecoder`; rethrows any decoding error.
public func pptxAppPropertiesFromJson(_ json: String) throws -> PptxAppProperties {
    try JSONDecoder().decode(PptxAppProperties.self, from: Data(json.utf8))
}
/// Decodes `json` into a `CoreProperties` with `JSONDecoder`; rethrows any decoding error.
public func corePropertiesFromJson(_ json: String) throws -> CoreProperties {
    try JSONDecoder().decode(CoreProperties.self, from: Data(json.utf8))
}
/// Decodes `json` into a `SecurityLimits` with `JSONDecoder`; rethrows any decoding error.
public func securityLimitsFromJson(_ json: String) throws -> SecurityLimits {
    try JSONDecoder().decode(SecurityLimits.self, from: Data(json.utf8))
}
/// Parses `json` into a `TokenReductionConfig` via `RustBridge.tokenReductionConfigFromJson`; rethrows its errors.
public func tokenReductionConfigFromJson(_ json: String) throws -> TokenReductionConfig {
    try RustBridge.tokenReductionConfigFromJson(json)
}
/// Decodes `json` into a `PdfAnnotation` with `JSONDecoder`; rethrows any decoding error.
public func pdfAnnotationFromJson(_ json: String) throws -> PdfAnnotation {
    try JSONDecoder().decode(PdfAnnotation.self, from: Data(json.utf8))
}
/// Parses `json` into a `DjotContent` via `RustBridge.djotContentFromJson`; rethrows its errors.
public func djotContentFromJson(_ json: String) throws -> DjotContent {
    try RustBridge.djotContentFromJson(json)
}
/// Parses `json` into a `FormattedBlock` via `RustBridge.formattedBlockFromJson`; rethrows its errors.
public func formattedBlockFromJson(_ json: String) throws -> FormattedBlock {
    try RustBridge.formattedBlockFromJson(json)
}
/// Parses `json` into an `InlineElement` via `RustBridge.inlineElementFromJson`; rethrows its errors.
public func inlineElementFromJson(_ json: String) throws -> InlineElement {
    try RustBridge.inlineElementFromJson(json)
}
/// Decodes `json` into a `DjotImage` with `JSONDecoder`; rethrows any decoding error.
public func djotImageFromJson(_ json: String) throws -> DjotImage {
    try JSONDecoder().decode(DjotImage.self, from: Data(json.utf8))
}
/// Decodes `json` into a `DjotLink` with `JSONDecoder`; rethrows any decoding error.
public func djotLinkFromJson(_ json: String) throws -> DjotLink {
    try JSONDecoder().decode(DjotLink.self, from: Data(json.utf8))
}
/// Parses `json` into a `Footnote` via `RustBridge.footnoteFromJson`; rethrows its errors.
public func footnoteFromJson(_ json: String) throws -> Footnote {
    try RustBridge.footnoteFromJson(json)
}
/// Parses `json` into a `DocumentStructure` via `RustBridge.documentStructureFromJson`; rethrows its errors.
public func documentStructureFromJson(_ json: String) throws -> DocumentStructure {
    try RustBridge.documentStructureFromJson(json)
}
/// Decodes `json` into a `DocumentRelationship` with `JSONDecoder`; rethrows any decoding error.
public func documentRelationshipFromJson(_ json: String) throws -> DocumentRelationship {
    try JSONDecoder().decode(DocumentRelationship.self, from: Data(json.utf8))
}
/// Parses `json` into a `DocumentNode` via `RustBridge.documentNodeFromJson`; rethrows its errors.
public func documentNodeFromJson(_ json: String) throws -> DocumentNode {
    try RustBridge.documentNodeFromJson(json)
}
/// Decodes `json` into a `TableGrid` with `JSONDecoder`; rethrows any decoding error.
public func tableGridFromJson(_ json: String) throws -> TableGrid {
    try JSONDecoder().decode(TableGrid.self, from: Data(json.utf8))
}
/// Decodes `json` into a `GridCell` with `JSONDecoder`; rethrows any decoding error.
public func gridCellFromJson(_ json: String) throws -> GridCell {
    try JSONDecoder().decode(GridCell.self, from: Data(json.utf8))
}
/// Decodes `json` into a `TextAnnotation` with `JSONDecoder`; rethrows any decoding error.
public func textAnnotationFromJson(_ json: String) throws -> TextAnnotation {
    try JSONDecoder().decode(TextAnnotation.self, from: Data(json.utf8))
}
/// Parses `json` into an `ExtractionResult` via `RustBridge.extractionResultFromJson`; rethrows its errors.
public func extractionResultFromJson(_ json: String) throws -> ExtractionResult {
    try RustBridge.extractionResultFromJson(json)
}
/// Parses `json` into an `ArchiveEntry` via `RustBridge.archiveEntryFromJson`; rethrows its errors.
public func archiveEntryFromJson(_ json: String) throws -> ArchiveEntry {
    try RustBridge.archiveEntryFromJson(json)
}
/// Decodes `json` into a `ProcessingWarning` with `JSONDecoder`; rethrows any decoding error.
public func processingWarningFromJson(_ json: String) throws -> ProcessingWarning {
    try JSONDecoder().decode(ProcessingWarning.self, from: Data(json.utf8))
}
/// Decodes `json` into an `LlmUsage` with `JSONDecoder`; rethrows any decoding error.
public func llmUsageFromJson(_ json: String) throws -> LlmUsage {
    try JSONDecoder().decode(LlmUsage.self, from: Data(json.utf8))
}
/// Decodes `json` into a `Chunk` with `JSONDecoder`; rethrows any decoding error.
public func chunkFromJson(_ json: String) throws -> Chunk {
    try JSONDecoder().decode(Chunk.self, from: Data(json.utf8))
}
/// Decodes `json` into a `HeadingContext` with `JSONDecoder`; rethrows any decoding error.
public func headingContextFromJson(_ json: String) throws -> HeadingContext {
    try JSONDecoder().decode(HeadingContext.self, from: Data(json.utf8))
}
/// Decodes `json` into a `HeadingLevel` with `JSONDecoder`; rethrows any decoding error.
public func headingLevelFromJson(_ json: String) throws -> HeadingLevel {
    try JSONDecoder().decode(HeadingLevel.self, from: Data(json.utf8))
}
/// Decodes `json` into a `ChunkMetadata` with `JSONDecoder`; rethrows any decoding error.
public func chunkMetadataFromJson(_ json: String) throws -> ChunkMetadata {
    try JSONDecoder().decode(ChunkMetadata.self, from: Data(json.utf8))
}
/// Parses `json` into an `ExtractedImage` via `RustBridge.extractedImageFromJson`; rethrows its errors.
public func extractedImageFromJson(_ json: String) throws -> ExtractedImage {
    try RustBridge.extractedImageFromJson(json)
}
/// Decodes `json` into a `BoundingBox` with `JSONDecoder`; rethrows any decoding error.
public func boundingBoxFromJson(_ json: String) throws -> BoundingBox {
    try JSONDecoder().decode(BoundingBox.self, from: Data(json.utf8))
}
/// Parses `json` into an `ElementMetadata` via `RustBridge.elementMetadataFromJson`; rethrows its errors.
public func elementMetadataFromJson(_ json: String) throws -> ElementMetadata {
    try RustBridge.elementMetadataFromJson(json)
}
/// Parses `json` into an `Element` via `RustBridge.elementFromJson`; rethrows its errors.
public func elementFromJson(_ json: String) throws -> Element {
    try RustBridge.elementFromJson(json)
}
/// Parses `json` into an `ExcelWorkbook` via `RustBridge.excelWorkbookFromJson`; rethrows its errors.
public func excelWorkbookFromJson(_ json: String) throws -> ExcelWorkbook {
    try RustBridge.excelWorkbookFromJson(json)
}
/// Decodes `json` into an `ExcelSheet` with `JSONDecoder`; rethrows any decoding error.
public func excelSheetFromJson(_ json: String) throws -> ExcelSheet {
    try JSONDecoder().decode(ExcelSheet.self, from: Data(json.utf8))
}
/// Decodes `json` into an `XmlExtractionResult` with `JSONDecoder`; rethrows any decoding error.
public func xmlExtractionResultFromJson(_ json: String) throws -> XmlExtractionResult {
    try JSONDecoder().decode(XmlExtractionResult.self, from: Data(json.utf8))
}
/// Decodes `json` into a `TextExtractionResult` with `JSONDecoder`; rethrows any decoding error.
public func textExtractionResultFromJson(_ json: String) throws -> TextExtractionResult {
    try JSONDecoder().decode(TextExtractionResult.self, from: Data(json.utf8))
}
/// Parses `json` into a `PptxExtractionResult` via `RustBridge.pptxExtractionResultFromJson`; rethrows its errors.
public func pptxExtractionResultFromJson(_ json: String) throws -> PptxExtractionResult {
    try RustBridge.pptxExtractionResultFromJson(json)
}
/// Parses `json` into an `EmailExtractionResult` via `RustBridge.emailExtractionResultFromJson`; rethrows its errors.
public func emailExtractionResultFromJson(_ json: String) throws -> EmailExtractionResult {
    try RustBridge.emailExtractionResultFromJson(json)
}
/// Parses `json` into an `EmailAttachment` via `RustBridge.emailAttachmentFromJson`; rethrows its errors.
public func emailAttachmentFromJson(_ json: String) throws -> EmailAttachment {
    try RustBridge.emailAttachmentFromJson(json)
}
/// Parses `json` into an `OcrExtractionResult` via `RustBridge.ocrExtractionResultFromJson`; rethrows its errors.
public func ocrExtractionResultFromJson(_ json: String) throws -> OcrExtractionResult {
    try RustBridge.ocrExtractionResultFromJson(json)
}
/// Decodes `json` into an `OcrTable` with `JSONDecoder`; rethrows any decoding error.
public func ocrTableFromJson(_ json: String) throws -> OcrTable {
    try JSONDecoder().decode(OcrTable.self, from: Data(json.utf8))
}
/// Decodes `json` into an `OcrTableBoundingBox` with `JSONDecoder`; rethrows any decoding error.
public func ocrTableBoundingBoxFromJson(_ json: String) throws -> OcrTableBoundingBox {
    try JSONDecoder().decode(OcrTableBoundingBox.self, from: Data(json.utf8))
}
/// Decodes `json` into an `ImagePreprocessingConfig` with `JSONDecoder`; rethrows any decoding error.
public func imagePreprocessingConfigFromJson(_ json: String) throws -> ImagePreprocessingConfig {
    try JSONDecoder().decode(ImagePreprocessingConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into a `TesseractConfig` with `JSONDecoder`; rethrows any decoding error.
public func tesseractConfigFromJson(_ json: String) throws -> TesseractConfig {
    try JSONDecoder().decode(TesseractConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into an `ImagePreprocessingMetadata` with `JSONDecoder`; rethrows any decoding error.
public func imagePreprocessingMetadataFromJson(_ json: String) throws -> ImagePreprocessingMetadata {
    try JSONDecoder().decode(ImagePreprocessingMetadata.self, from: Data(json.utf8))
}
/// Parses `json` into a `Metadata` via `RustBridge.metadataFromJson`; rethrows its errors.
public func metadataFromJson(_ json: String) throws -> Metadata {
    try RustBridge.metadataFromJson(json)
}
/// Decodes `json` into an `ExcelMetadata` with `JSONDecoder`; rethrows any decoding error.
public func excelMetadataFromJson(_ json: String) throws -> ExcelMetadata {
    try JSONDecoder().decode(ExcelMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into an `EmailMetadata` with `JSONDecoder`; rethrows any decoding error.
public func emailMetadataFromJson(_ json: String) throws -> EmailMetadata {
    try JSONDecoder().decode(EmailMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into an `ArchiveMetadata` with `JSONDecoder`; rethrows any decoding error.
public func archiveMetadataFromJson(_ json: String) throws -> ArchiveMetadata {
    try JSONDecoder().decode(ArchiveMetadata.self, from: Data(json.utf8))
}
/// Parses `json` into an `ImageMetadata` via `RustBridge.imageMetadataFromJson`; rethrows its errors.
public func imageMetadataFromJson(_ json: String) throws -> ImageMetadata {
    try RustBridge.imageMetadataFromJson(json)
}
/// Decodes `json` into an `XmlMetadata` with `JSONDecoder`; rethrows any decoding error.
public func xmlMetadataFromJson(_ json: String) throws -> XmlMetadata {
    try JSONDecoder().decode(XmlMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into a `TextMetadata` with `JSONDecoder`; rethrows any decoding error.
public func textMetadataFromJson(_ json: String) throws -> TextMetadata {
    try JSONDecoder().decode(TextMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into a `HeaderMetadata` with `JSONDecoder`; rethrows any decoding error.
public func headerMetadataFromJson(_ json: String) throws -> HeaderMetadata {
    try JSONDecoder().decode(HeaderMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into a `LinkMetadata` with `JSONDecoder`; rethrows any decoding error.
public func linkMetadataFromJson(_ json: String) throws -> LinkMetadata {
    try JSONDecoder().decode(LinkMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into an `ImageMetadataType` with `JSONDecoder`; rethrows any decoding error.
public func imageMetadataTypeFromJson(_ json: String) throws -> ImageMetadataType {
    try JSONDecoder().decode(ImageMetadataType.self, from: Data(json.utf8))
}
/// Decodes `json` into a `StructuredData` with `JSONDecoder`; rethrows any decoding error.
public func structuredDataFromJson(_ json: String) throws -> StructuredData {
    try JSONDecoder().decode(StructuredData.self, from: Data(json.utf8))
}
/// Parses `json` into an `HtmlMetadata` via `RustBridge.htmlMetadataFromJson`; rethrows its errors.
public func htmlMetadataFromJson(_ json: String) throws -> HtmlMetadata {
    try RustBridge.htmlMetadataFromJson(json)
}
/// Decodes `json` into an `OcrMetadata` with `JSONDecoder`; rethrows any decoding error.
public func ocrMetadataFromJson(_ json: String) throws -> OcrMetadata {
    try JSONDecoder().decode(OcrMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into an `ErrorMetadata` with `JSONDecoder`; rethrows any decoding error.
public func errorMetadataFromJson(_ json: String) throws -> ErrorMetadata {
    try JSONDecoder().decode(ErrorMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into a `PptxMetadata` with `JSONDecoder`; rethrows any decoding error.
public func pptxMetadataFromJson(_ json: String) throws -> PptxMetadata {
    try JSONDecoder().decode(PptxMetadata.self, from: Data(json.utf8))
}
/// Parses `json` into a `DocxMetadata` via `RustBridge.docxMetadataFromJson`; rethrows its errors.
public func docxMetadataFromJson(_ json: String) throws -> DocxMetadata {
    try RustBridge.docxMetadataFromJson(json)
}
/// Decodes `json` into a `CsvMetadata` with `JSONDecoder`; rethrows any decoding error.
public func csvMetadataFromJson(_ json: String) throws -> CsvMetadata {
    try JSONDecoder().decode(CsvMetadata.self, from: Data(json.utf8))
}
/// Parses `json` into a `BibtexMetadata` via `RustBridge.bibtexMetadataFromJson`; rethrows its errors.
public func bibtexMetadataFromJson(_ json: String) throws -> BibtexMetadata {
    try RustBridge.bibtexMetadataFromJson(json)
}
/// Decodes `json` into a `CitationMetadata` with `JSONDecoder`; rethrows any decoding error.
public func citationMetadataFromJson(_ json: String) throws -> CitationMetadata {
    try JSONDecoder().decode(CitationMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into a `YearRange` with `JSONDecoder`; rethrows any decoding error.
public func yearRangeFromJson(_ json: String) throws -> YearRange {
    try JSONDecoder().decode(YearRange.self, from: Data(json.utf8))
}
/// Decodes `json` into a `FictionBookMetadata` with `JSONDecoder`; rethrows any decoding error.
public func fictionBookMetadataFromJson(_ json: String) throws -> FictionBookMetadata {
    try JSONDecoder().decode(FictionBookMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into a `DbfMetadata` with `JSONDecoder`; rethrows any decoding error.
public func dbfMetadataFromJson(_ json: String) throws -> DbfMetadata {
    try JSONDecoder().decode(DbfMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into a `DbfFieldInfo` with `JSONDecoder`; rethrows any decoding error.
public func dbfFieldInfoFromJson(_ json: String) throws -> DbfFieldInfo {
    try JSONDecoder().decode(DbfFieldInfo.self, from: Data(json.utf8))
}
/// Parses `json` into a `JatsMetadata` via `RustBridge.jatsMetadataFromJson`; rethrows its errors.
public func jatsMetadataFromJson(_ json: String) throws -> JatsMetadata {
    try RustBridge.jatsMetadataFromJson(json)
}
/// Decodes `json` into a `ContributorRole` with `JSONDecoder`; rethrows any decoding error.
public func contributorRoleFromJson(_ json: String) throws -> ContributorRole {
    try JSONDecoder().decode(ContributorRole.self, from: Data(json.utf8))
}
/// Decodes `json` into an `EpubMetadata` with `JSONDecoder`; rethrows any decoding error.
public func epubMetadataFromJson(_ json: String) throws -> EpubMetadata {
    try JSONDecoder().decode(EpubMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into a `PstMetadata` with `JSONDecoder`; rethrows any decoding error.
public func pstMetadataFromJson(_ json: String) throws -> PstMetadata {
    try JSONDecoder().decode(PstMetadata.self, from: Data(json.utf8))
}
/// Decodes `json` into an `OcrConfidence` with `JSONDecoder`; rethrows any decoding error.
public func ocrConfidenceFromJson(_ json: String) throws -> OcrConfidence {
    try JSONDecoder().decode(OcrConfidence.self, from: Data(json.utf8))
}
/// Decodes `json` into an `OcrRotation` with `JSONDecoder`; rethrows any decoding error.
public func ocrRotationFromJson(_ json: String) throws -> OcrRotation {
    try JSONDecoder().decode(OcrRotation.self, from: Data(json.utf8))
}
/// Parses `json` into an `OcrElement` via `RustBridge.ocrElementFromJson`; rethrows its errors.
public func ocrElementFromJson(_ json: String) throws -> OcrElement {
    try RustBridge.ocrElementFromJson(json)
}
/// Decodes `json` into an `OcrElementConfig` with `JSONDecoder`; rethrows any decoding error.
public func ocrElementConfigFromJson(_ json: String) throws -> OcrElementConfig {
    try JSONDecoder().decode(OcrElementConfig.self, from: Data(json.utf8))
}
/// Decodes `json` into a `PageStructure` with `JSONDecoder`; rethrows any decoding error.
public func pageStructureFromJson(_ json: String) throws -> PageStructure {
    try JSONDecoder().decode(PageStructure.self, from: Data(json.utf8))
}
/// Decodes `json` into a `PageBoundary` with `JSONDecoder`; rethrows any decoding error.
public func pageBoundaryFromJson(_ json: String) throws -> PageBoundary {
    try JSONDecoder().decode(PageBoundary.self, from: Data(json.utf8))
}
/// Decodes `json` into a `PageInfo` with `JSONDecoder`; rethrows any decoding error.
public func pageInfoFromJson(_ json: String) throws -> PageInfo {
    try JSONDecoder().decode(PageInfo.self, from: Data(json.utf8))
}
/// Decodes a `PageContent` from `json` via `JSONDecoder`; rethrows any decoding error.
public func pageContentFromJson(_ json: String) throws -> PageContent {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(PageContent.self, from: payload)
}
/// Decodes a `LayoutRegion` from `json` via `JSONDecoder`; rethrows any decoding error.
public func layoutRegionFromJson(_ json: String) throws -> LayoutRegion {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(LayoutRegion.self, from: payload)
}
/// Decodes a `PageHierarchy` from `json` via `JSONDecoder`; rethrows any decoding error.
public func pageHierarchyFromJson(_ json: String) throws -> PageHierarchy {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(PageHierarchy.self, from: payload)
}
/// Decodes a `HierarchicalBlock` from `json` via `JSONDecoder`; rethrows any decoding error.
public func hierarchicalBlockFromJson(_ json: String) throws -> HierarchicalBlock {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(HierarchicalBlock.self, from: payload)
}
/// Decodes a `CellChange` from `json` via `JSONDecoder`; rethrows any decoding error.
public func cellChangeFromJson(_ json: String) throws -> CellChange {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(CellChange.self, from: payload)
}
/// Decodes a `DocumentRevision` from `json` via `JSONDecoder`; rethrows any decoding error.
public func documentRevisionFromJson(_ json: String) throws -> DocumentRevision {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(DocumentRevision.self, from: payload)
}
/// Decodes a `RevisionDelta` from `json` via `JSONDecoder`; rethrows any decoding error.
public func revisionDeltaFromJson(_ json: String) throws -> RevisionDelta {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(RevisionDelta.self, from: payload)
}
/// Decodes a `Table` from `json` via `JSONDecoder`; rethrows any decoding error.
public func tableFromJson(_ json: String) throws -> Table {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(Table.self, from: payload)
}
/// Decodes a `TableCell` from `json` via `JSONDecoder`; rethrows any decoding error.
public func tableCellFromJson(_ json: String) throws -> TableCell {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(TableCell.self, from: payload)
}
/// Decodes an `ExtractedUri` from `json` via `JSONDecoder`; rethrows any decoding error.
public func extractedUriFromJson(_ json: String) throws -> ExtractedUri {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ExtractedUri.self, from: payload)
}
/// Decodes a `DetectResponse` from `json` via `JSONDecoder`; rethrows any decoding error.
public func detectResponseFromJson(_ json: String) throws -> DetectResponse {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(DetectResponse.self, from: payload)
}
/// Decodes a `DiffOptions` from `json` via `JSONDecoder`; rethrows any decoding error.
public func diffOptionsFromJson(_ json: String) throws -> DiffOptions {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(DiffOptions.self, from: payload)
}
/// Parses `json` into an `ExtractionDiff` by forwarding to `RustBridge.extractionDiffFromJson`.
public func extractionDiffFromJson(_ json: String) throws -> ExtractionDiff {
    let parsed = try RustBridge.extractionDiffFromJson(json)
    return parsed
}
/// Decodes a `DiffHunk` from `json` via `JSONDecoder`; rethrows any decoding error.
public func diffHunkFromJson(_ json: String) throws -> DiffHunk {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(DiffHunk.self, from: payload)
}
/// Decodes a `TableDiff` from `json` via `JSONDecoder`; rethrows any decoding error.
public func tableDiffFromJson(_ json: String) throws -> TableDiff {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(TableDiff.self, from: payload)
}
/// Parses `json` into an `EmbeddedChanges` by forwarding to `RustBridge.embeddedChangesFromJson`.
public func embeddedChangesFromJson(_ json: String) throws -> EmbeddedChanges {
    let parsed = try RustBridge.embeddedChangesFromJson(json)
    return parsed
}
/// Parses `json` into an `EmbeddedDiff` by forwarding to `RustBridge.embeddedDiffFromJson`.
public func embeddedDiffFromJson(_ json: String) throws -> EmbeddedDiff {
    let parsed = try RustBridge.embeddedDiffFromJson(json)
    return parsed
}
/// Decodes an `EmbeddingPreset` from `json` via `JSONDecoder`; rethrows any decoding error.
public func embeddingPresetFromJson(_ json: String) throws -> EmbeddingPreset {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(EmbeddingPreset.self, from: payload)
}
/// Decodes a `YakeParams` from `json` via `JSONDecoder`; rethrows any decoding error.
public func yakeParamsFromJson(_ json: String) throws -> YakeParams {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(YakeParams.self, from: payload)
}
/// Decodes a `RakeParams` from `json` via `JSONDecoder`; rethrows any decoding error.
public func rakeParamsFromJson(_ json: String) throws -> RakeParams {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(RakeParams.self, from: payload)
}
/// Decodes a `KeywordConfig` from `json` via `JSONDecoder`; rethrows any decoding error.
public func keywordConfigFromJson(_ json: String) throws -> KeywordConfig {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(KeywordConfig.self, from: payload)
}
/// Decodes a `Keyword` from `json` via `JSONDecoder`; rethrows any decoding error.
public func keywordFromJson(_ json: String) throws -> Keyword {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(Keyword.self, from: payload)
}
/// Parses `json` into a `PaddleOcrConfig` by forwarding to `RustBridge.paddleOcrConfigFromJson`.
public func paddleOcrConfigFromJson(_ json: String) throws -> PaddleOcrConfig {
    let parsed = try RustBridge.paddleOcrConfigFromJson(json)
    return parsed
}
/// Parses `json` into a `ModelPaths` by forwarding to `RustBridge.modelPathsFromJson`.
public func modelPathsFromJson(_ json: String) throws -> ModelPaths {
    let parsed = try RustBridge.modelPathsFromJson(json)
    return parsed
}
/// Decodes an `OrientationResult` from `json` via `JSONDecoder`; rethrows any decoding error.
public func orientationResultFromJson(_ json: String) throws -> OrientationResult {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(OrientationResult.self, from: payload)
}
/// Decodes a `BBox` from `json` via `JSONDecoder`; rethrows any decoding error.
public func bBoxFromJson(_ json: String) throws -> BBox {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(BBox.self, from: payload)
}
/// Decodes a `LayoutDetection` from `json` via `JSONDecoder`; rethrows any decoding error.
public func layoutDetectionFromJson(_ json: String) throws -> LayoutDetection {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(LayoutDetection.self, from: payload)
}
/// Decodes a `RecognizedTable` from `json` via `JSONDecoder`; rethrows any decoding error.
public func recognizedTableFromJson(_ json: String) throws -> RecognizedTable {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(RecognizedTable.self, from: payload)
}
/// Decodes a `DetectionResult` from `json` via `JSONDecoder`; rethrows any decoding error.
public func detectionResultFromJson(_ json: String) throws -> DetectionResult {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(DetectionResult.self, from: payload)
}
/// Parses `json` into an `EmbeddedFile` by forwarding to `RustBridge.embeddedFileFromJson`.
public func embeddedFileFromJson(_ json: String) throws -> EmbeddedFile {
    let parsed = try RustBridge.embeddedFileFromJson(json)
    return parsed
}
/// Decodes a `PdfMetadata` from `json` via `JSONDecoder`; rethrows any decoding error.
public func pdfMetadataFromJson(_ json: String) throws -> PdfMetadata {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(PdfMetadata.self, from: payload)
}
/// Decodes an `ExecutionProviderType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func executionProviderTypeFromJson(_ json: String) throws -> ExecutionProviderType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ExecutionProviderType.self, from: payload)
}
/// Decodes an `OutputFormat` from `json` via `JSONDecoder`; rethrows any decoding error.
public func outputFormatFromJson(_ json: String) throws -> OutputFormat {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(OutputFormat.self, from: payload)
}
/// Decodes an `HtmlTheme` from `json` via `JSONDecoder`; rethrows any decoding error.
public func htmlThemeFromJson(_ json: String) throws -> HtmlTheme {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(HtmlTheme.self, from: payload)
}
/// Decodes a `TableModel` from `json` via `JSONDecoder`; rethrows any decoding error.
public func tableModelFromJson(_ json: String) throws -> TableModel {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(TableModel.self, from: payload)
}
/// Decodes a `ChunkerType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func chunkerTypeFromJson(_ json: String) throws -> ChunkerType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ChunkerType.self, from: payload)
}
/// Decodes a `ChunkSizing` from `json` via `JSONDecoder`; rethrows any decoding error.
public func chunkSizingFromJson(_ json: String) throws -> ChunkSizing {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ChunkSizing.self, from: payload)
}
/// Decodes an `EmbeddingModelType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func embeddingModelTypeFromJson(_ json: String) throws -> EmbeddingModelType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(EmbeddingModelType.self, from: payload)
}
/// Decodes a `CodeContentMode` from `json` via `JSONDecoder`; rethrows any decoding error.
public func codeContentModeFromJson(_ json: String) throws -> CodeContentMode {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(CodeContentMode.self, from: payload)
}
/// Decodes an `OcrBackendType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func ocrBackendTypeFromJson(_ json: String) throws -> OcrBackendType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(OcrBackendType.self, from: payload)
}
/// Decodes a `ProcessingStage` from `json` via `JSONDecoder`; rethrows any decoding error.
public func processingStageFromJson(_ json: String) throws -> ProcessingStage {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ProcessingStage.self, from: payload)
}
/// Decodes a `ReductionLevel` from `json` via `JSONDecoder`; rethrows any decoding error.
public func reductionLevelFromJson(_ json: String) throws -> ReductionLevel {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ReductionLevel.self, from: payload)
}
/// Decodes a `PdfAnnotationType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func pdfAnnotationTypeFromJson(_ json: String) throws -> PdfAnnotationType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(PdfAnnotationType.self, from: payload)
}
/// Decodes a `BlockType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func blockTypeFromJson(_ json: String) throws -> BlockType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(BlockType.self, from: payload)
}
/// Decodes an `InlineType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func inlineTypeFromJson(_ json: String) throws -> InlineType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(InlineType.self, from: payload)
}
/// Decodes a `RelationshipKind` from `json` via `JSONDecoder`; rethrows any decoding error.
public func relationshipKindFromJson(_ json: String) throws -> RelationshipKind {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(RelationshipKind.self, from: payload)
}
/// Decodes a `ContentLayer` from `json` via `JSONDecoder`; rethrows any decoding error.
public func contentLayerFromJson(_ json: String) throws -> ContentLayer {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ContentLayer.self, from: payload)
}
/// Decodes a `NodeContent` from `json` via `JSONDecoder`; rethrows any decoding error.
public func nodeContentFromJson(_ json: String) throws -> NodeContent {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(NodeContent.self, from: payload)
}
/// Decodes an `AnnotationKind` from `json` via `JSONDecoder`; rethrows any decoding error.
public func annotationKindFromJson(_ json: String) throws -> AnnotationKind {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(AnnotationKind.self, from: payload)
}
/// Decodes an `ExtractionMethod` from `json` via `JSONDecoder`; rethrows any decoding error.
public func extractionMethodFromJson(_ json: String) throws -> ExtractionMethod {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ExtractionMethod.self, from: payload)
}
/// Decodes a `ChunkType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func chunkTypeFromJson(_ json: String) throws -> ChunkType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ChunkType.self, from: payload)
}
/// Decodes an `ImageKind` from `json` via `JSONDecoder`; rethrows any decoding error.
public func imageKindFromJson(_ json: String) throws -> ImageKind {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ImageKind.self, from: payload)
}
/// Decodes a `ResultFormat` from `json` via `JSONDecoder`; rethrows any decoding error.
public func resultFormatFromJson(_ json: String) throws -> ResultFormat {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ResultFormat.self, from: payload)
}
/// Decodes an `ElementType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func elementTypeFromJson(_ json: String) throws -> ElementType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ElementType.self, from: payload)
}
/// Parses `json` into a `FormatMetadata` by forwarding to `RustBridge.formatMetadataFromJson`.
public func formatMetadataFromJson(_ json: String) throws -> FormatMetadata {
    let parsed = try RustBridge.formatMetadataFromJson(json)
    return parsed
}
/// Decodes a `TextDirection` from `json` via `JSONDecoder`; rethrows any decoding error.
public func textDirectionFromJson(_ json: String) throws -> TextDirection {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(TextDirection.self, from: payload)
}
/// Decodes a `LinkType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func linkTypeFromJson(_ json: String) throws -> LinkType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(LinkType.self, from: payload)
}
/// Decodes an `ImageType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func imageTypeFromJson(_ json: String) throws -> ImageType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(ImageType.self, from: payload)
}
/// Decodes a `StructuredDataType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func structuredDataTypeFromJson(_ json: String) throws -> StructuredDataType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(StructuredDataType.self, from: payload)
}
/// Decodes an `OcrBoundingGeometry` from `json` via `JSONDecoder`; rethrows any decoding error.
public func ocrBoundingGeometryFromJson(_ json: String) throws -> OcrBoundingGeometry {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(OcrBoundingGeometry.self, from: payload)
}
/// Decodes an `OcrElementLevel` from `json` via `JSONDecoder`; rethrows any decoding error.
public func ocrElementLevelFromJson(_ json: String) throws -> OcrElementLevel {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(OcrElementLevel.self, from: payload)
}
/// Decodes a `PageUnitType` from `json` via `JSONDecoder`; rethrows any decoding error.
public func pageUnitTypeFromJson(_ json: String) throws -> PageUnitType {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(PageUnitType.self, from: payload)
}
/// Decodes a `DiffLine` from `json` via `JSONDecoder`; rethrows any decoding error.
public func diffLineFromJson(_ json: String) throws -> DiffLine {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(DiffLine.self, from: payload)
}
/// Decodes a `RevisionKind` from `json` via `JSONDecoder`; rethrows any decoding error.
public func revisionKindFromJson(_ json: String) throws -> RevisionKind {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(RevisionKind.self, from: payload)
}
/// Decodes a `RevisionAnchor` from `json` via `JSONDecoder`; rethrows any decoding error.
public func revisionAnchorFromJson(_ json: String) throws -> RevisionAnchor {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(RevisionAnchor.self, from: payload)
}
/// Decodes a `UriKind` from `json` via `JSONDecoder`; rethrows any decoding error.
public func uriKindFromJson(_ json: String) throws -> UriKind {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(UriKind.self, from: payload)
}
/// Decodes a `KeywordAlgorithm` from `json` via `JSONDecoder`; rethrows any decoding error.
public func keywordAlgorithmFromJson(_ json: String) throws -> KeywordAlgorithm {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(KeywordAlgorithm.self, from: payload)
}
/// Decodes a `PSMMode` from `json` via `JSONDecoder`; rethrows any decoding error.
public func psmModeFromJson(_ json: String) throws -> PSMMode {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(PSMMode.self, from: payload)
}
/// Decodes a `PaddleLanguage` from `json` via `JSONDecoder`; rethrows any decoding error.
public func paddleLanguageFromJson(_ json: String) throws -> PaddleLanguage {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(PaddleLanguage.self, from: payload)
}
/// Decodes a `LayoutClass` from `json` via `JSONDecoder`; rethrows any decoding error.
public func layoutClassFromJson(_ json: String) throws -> LayoutClass {
    let payload = Data(json.utf8)
    let decoder = JSONDecoder()
    return try decoder.decode(LayoutClass.self, from: payload)
}
// MARK: - Free-function Forwarders
// Re-export every public free function on the source Rust crate as a
// top-level `public func` on the host module so consumers do not need to
// `import RustBridge` directly. Forwarders take Swift-native parameter
// types and convert to the swift-bridge runtime types internally.
/// Synchronous wrapper for `extract_file`.
///
/// This is a convenience function that blocks the current thread until extraction completes.
/// For async code, use `extract_file` directly.
///
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
/// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
///
/// This function is only available with the `tokio-runtime` feature. For WASM targets,
/// use a truly synchronous extraction approach instead.
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::extract_file_sync;
/// use kreuzberg::core::config::ExtractionConfig;
///
/// let config = ExtractionConfig::default();
/// let result = extract_file_sync("document.pdf", None, &config)?;
/// println!("Content: {}", result.content);
/// ```
public func extractFileSync(path: String, mimeType: String?, config: ExtractionConfig) throws -> ExtractionResult {
    // Pure pass-through: the bridge entry point takes the same arguments positionally.
    let extraction = try RustBridge.extractFileSync(path, mimeType, config)
    return extraction
}
/// Synchronous wrapper for `extract_bytes`.
///
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
/// a new runtime per call.
///
/// With the `tokio-runtime` feature, this blocks the current thread using the global
/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::extract_bytes_sync;
/// use kreuzberg::core::config::ExtractionConfig;
///
/// let config = ExtractionConfig::default();
/// let bytes = b"Hello, world!";
/// let result = extract_bytes_sync(bytes, "text/plain", &config)?;
/// println!("Content: {}", result.content);
/// ```
public func extractBytesSync(content: [UInt8], mimeType: String, config: ExtractionConfig) throws -> ExtractionResult {
    // Copy the Swift byte array into a bridge vector, one byte at a time.
    let bridgedContent = RustVec<UInt8>()
    for byte in content {
        bridgedContent.push(value: byte)
    }
    let extraction = try RustBridge.extractBytesSync(bridgedContent, mimeType, config)
    return extraction
}
/// Synchronous wrapper for `batch_extract_files`.
///
/// Uses the global Tokio runtime for optimal performance.
/// Only available with `tokio-runtime` (WASM has no filesystem).
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::batch_extract_files_sync;
/// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem, FileExtractionConfig};
///
/// let config = ExtractionConfig::default();
/// let items = vec![
/// BatchFileItem {
/// path: "doc1.pdf".into(),
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
/// },
/// BatchFileItem { path: "doc2.pdf".into(), config: None },
/// ];
/// let results = batch_extract_files_sync(items, &config)?;
/// ```
public func batchExtractFilesSync(items: [BatchFileItem], config: ExtractionConfig) throws -> [ExtractionResult] {
    // Copy the Swift array into a bridge vector, preserving order.
    let bridgedItems = RustVec<BatchFileItem>()
    for entry in items {
        bridgedItems.push(value: entry)
    }
    let bridgedResults = try RustBridge.batchExtractFilesSync(bridgedItems, config)
    // NOTE(review): each element is wrapped as a non-owning view (`isOwned = false`) of a
    // pointer taken from the returned vector; confirm that vector outlives the returned array.
    return try bridgedResults.map { ref in
        var item = try RustBridge.ExtractionResult(ptr: ref.ptr)
        item.isOwned = false
        return item
    }
}
/// Synchronous wrapper for `batch_extract_bytes`.
///
/// Uses the global Tokio runtime for optimal performance.
/// With the `tokio-runtime` feature, this blocks the current thread using the global
/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
/// that iterates through items and calls `extract_bytes_sync()`.
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::batch_extract_bytes_sync;
/// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem, FileExtractionConfig};
///
/// let config = ExtractionConfig::default();
/// let items = vec![
/// BatchBytesItem { content: b"content".to_vec(), mime_type: "text/plain".to_string(), config: None },
/// BatchBytesItem {
/// content: b"other".to_vec(),
/// mime_type: "text/plain".to_string(),
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
/// },
/// ];
/// let results = batch_extract_bytes_sync(items, &config)?;
/// ```
public func batchExtractBytesSync(items: [BatchBytesItem], config: ExtractionConfig) throws -> [ExtractionResult] {
    // Copy the Swift array into a bridge vector, preserving order.
    let bridgedItems = RustVec<BatchBytesItem>()
    for entry in items {
        bridgedItems.push(value: entry)
    }
    let bridgedResults = try RustBridge.batchExtractBytesSync(bridgedItems, config)
    // NOTE(review): each element is wrapped as a non-owning view (`isOwned = false`) of a
    // pointer taken from the returned vector; confirm that vector outlives the returned array.
    return try bridgedResults.map { ref in
        var item = try RustBridge.ExtractionResult(ptr: ref.ptr)
        item.isOwned = false
        return item
    }
}
/// Extract content from multiple files concurrently.
///
/// This function processes multiple files in parallel, automatically managing
/// concurrency to prevent resource exhaustion. The concurrency limit can be
/// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
/// to `(num_cpus * 1.5).ceil()`.
///
/// Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
/// fields from the batch-level `config`. Pass `None` for a file to use the batch defaults.
/// Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
/// taken from the batch-level `config`.
///
/// # Arguments
///
/// * `items` - Vector of `BatchFileItem` structs, each containing a path and optional
/// per-file configuration overrides.
/// * `config` - Batch-level extraction configuration (provides defaults and batch settings)
///
/// # Returns
///
/// A vector of `ExtractionResult` in the same order as the input items.
///
/// # Errors
///
/// Individual file errors are captured in the result metadata. System errors
/// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
///
/// # Examples
///
/// Simple usage with no per-file overrides:
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::batch_extract_files;
/// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem};
/// use std::path::PathBuf;
///
/// let config = ExtractionConfig::default();
/// let items = vec![
/// BatchFileItem { path: "doc1.pdf".into(), config: None },
/// BatchFileItem { path: "doc2.pdf".into(), config: None },
/// ];
/// let results = batch_extract_files(items, &config).await?;
/// println!("Processed {} files", results.len());
/// ```
///
/// Per-file configuration overrides:
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::batch_extract_files;
/// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem, FileExtractionConfig};
/// use std::path::PathBuf;
///
/// let config = ExtractionConfig::default();
/// let items = vec![
/// BatchFileItem {
/// path: "scan.pdf".into(),
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
/// },
/// BatchFileItem { path: "notes.txt".into(), config: None },
/// ];
/// let results = batch_extract_files(items, &config).await?;
/// ```
public func batchExtractFiles(items: [BatchFileItem], config: ExtractionConfig) async throws -> [ExtractionResult] {
    // Copy the Swift array into a bridge vector, preserving order.
    let bridgedItems = RustVec<BatchFileItem>()
    for entry in items {
        bridgedItems.push(value: entry)
    }
    // The bridge call below is not awaited; it runs inside a detached task at
    // `.userInitiated` priority and this function awaits the task's value.
    return try await Task.detached(priority: .userInitiated) {
        let bridgedResults = try RustBridge.batchExtractFiles(bridgedItems, config)
        // Flat array of results: the declared return type is `[ExtractionResult]`, so the
        // accumulator must not be a nested `[[ExtractionResult]]`. It is also named
        // distinctly from the `items` parameter to avoid shadowing it.
        var results: [ExtractionResult] = []
        for ref in bridgedResults {
            // NOTE(review): the wrapper is marked non-owning (`isOwned = false`) over a pointer
            // taken from `bridgedResults`; confirm that vector outlives the returned array.
            var item = try RustBridge.ExtractionResult(ptr: ref.ptr)
            item.isOwned = false
            results.append(item)
        }
        return results
    }.value
}
/// Extract content from multiple byte arrays concurrently.
///
/// This function processes multiple byte arrays in parallel, automatically managing
/// concurrency to prevent resource exhaustion. The concurrency limit can be
/// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
/// to `(num_cpus * 1.5).ceil()`.
///
/// Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
/// fields from the batch-level `config`. Pass `None` as the config to use
/// the batch-level defaults for that item.
///
/// # Arguments
///
/// * `items` - Vector of `BatchBytesItem` structs, each containing content bytes,
/// MIME type, and optional per-item configuration overrides.
/// * `config` - Batch-level extraction configuration
///
/// # Returns
///
/// A vector of `ExtractionResult` in the same order as the input items.
///
/// # Examples
///
/// Simple usage with no per-item overrides:
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::batch_extract_bytes;
/// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem};
///
/// let config = ExtractionConfig::default();
/// let items = vec![
/// BatchBytesItem { content: b"content 1".to_vec(), mime_type: "text/plain".to_string(), config: None },
/// BatchBytesItem { content: b"content 2".to_vec(), mime_type: "text/plain".to_string(), config: None },
/// ];
/// let results = batch_extract_bytes(items, &config).await?;
/// println!("Processed {} items", results.len());
/// ```
///
/// Per-item configuration overrides:
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::batch_extract_bytes;
/// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem, FileExtractionConfig};
///
/// let config = ExtractionConfig::default();
/// let items = vec![
/// BatchBytesItem { content: b"content".to_vec(), mime_type: "text/plain".to_string(), config: None },
/// BatchBytesItem {
/// content: b"<html>test</html>".to_vec(),
/// mime_type: "text/html".to_string(),
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
/// },
/// ];
/// let results = batch_extract_bytes(items, &config).await?;
/// ```
public func batchExtractBytes(items: [BatchBytesItem], config: ExtractionConfig) async throws -> [ExtractionResult] {
    // Copy the Swift array into a bridge vector, preserving order.
    let bridgedItems = RustVec<BatchBytesItem>()
    for entry in items {
        bridgedItems.push(value: entry)
    }
    // The bridge call below is not awaited; it runs inside a detached task at
    // `.userInitiated` priority and this function awaits the task's value.
    return try await Task.detached(priority: .userInitiated) {
        let bridgedResults = try RustBridge.batchExtractBytes(bridgedItems, config)
        // Flat array of results: the declared return type is `[ExtractionResult]`, so the
        // accumulator must not be a nested `[[ExtractionResult]]`. It is also named
        // distinctly from the `items` parameter to avoid shadowing it.
        var results: [ExtractionResult] = []
        for ref in bridgedResults {
            // NOTE(review): the wrapper is marked non-owning (`isOwned = false`) over a pointer
            // taken from `bridgedResults`; confirm that vector outlives the returned array.
            var item = try RustBridge.ExtractionResult(ptr: ref.ptr)
            item.isOwned = false
            results.append(item)
        }
        return results
    }.value
}
/// Detect MIME type from raw file bytes.
///
/// Uses magic byte signatures to detect file type from content.
/// Falls back to `infer` crate for comprehensive detection.
///
/// For ZIP-based files, inspects contents to distinguish Office Open XML
/// formats (DOCX, XLSX, PPTX) from plain ZIP archives.
///
/// # Arguments
///
/// * `content` - Raw file bytes
///
/// # Returns
///
/// The detected MIME type string.
///
/// # Errors
///
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
public func detectMimeTypeFromBytes(content: [UInt8]) throws -> String {
    // Copy the Swift byte array into a bridge vector, one byte at a time.
    let bridgedContent = RustVec<UInt8>()
    for byte in content {
        bridgedContent.push(value: byte)
    }
    let detected = try RustBridge.detectMimeTypeFromBytes(bridgedContent)
    return detected.toString()
}
/// Get file extensions for a given MIME type.
///
/// Returns all known file extensions that map to the specified MIME type.
///
/// # Arguments
///
/// * `mime_type` - The MIME type to look up
///
/// # Returns
///
/// A vector of file extensions (without leading dot) for the MIME type.
///
/// # Example
///
/// ```
/// use kreuzberg::core::mime::get_extensions_for_mime;
///
/// let extensions = get_extensions_for_mime("application/pdf").unwrap();
/// assert_eq!(extensions, vec!["pdf"]);
///
/// let doc_extensions = get_extensions_for_mime("application/vnd.openxmlformats-officedocument.wordprocessingml.document").unwrap();
/// assert!(doc_extensions.contains(&"docx".to_string()));
/// ```
public func getExtensionsForMime(mimeType: String) throws -> [String] {
    let bridgedExtensions = try RustBridge.getExtensionsForMime(mimeType)
    // Convert each bridged string into a native Swift `String`.
    return bridgedExtensions.map { ext in ext.as_str().toString() }
}
/// List the names of all registered embedding backends.
///
/// Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
/// bindings.
public func listEmbeddingBackends() throws -> [String] {
    let bridgedNames = try RustBridge.listEmbeddingBackends()
    // Convert each bridged string into a native Swift `String`.
    return bridgedNames.map { name in name.as_str().toString() }
}
/// List names of all registered document extractors.
public func listDocumentExtractors() throws -> [String] {
    let bridgedNames = try RustBridge.listDocumentExtractors()
    // Convert each bridged string into a native Swift `String`.
    return bridgedNames.map { name in name.as_str().toString() }
}
/// List all registered OCR backends.
///
/// Returns the names of all OCR backends currently registered in the global registry.
///
/// # Returns
///
/// A vector of OCR backend names.
///
/// # Example
///
/// ```rust
/// use kreuzberg::plugins::list_ocr_backends;
///
/// let backends = list_ocr_backends()?;
/// for name in backends {
/// println!("Registered OCR backend: {}", name);
/// }
/// ```
public func listOcrBackends() throws -> [String] {
    let bridgedNames = try RustBridge.listOcrBackends()
    // Convert each bridged string into a native Swift `String`.
    return bridgedNames.map { name in name.as_str().toString() }
}
/// List all registered post-processor names.
///
/// Returns a vector of all post-processor names currently registered in the
/// global registry.
///
/// # Returns
///
/// - `Ok(Vec<String>)` - Vector of post-processor names
/// - `Err(...)` if the registry lock is poisoned
///
/// # Example
///
/// ```rust
/// use kreuzberg::plugins::list_post_processors;
///
/// let processors = list_post_processors()?;
/// for name in processors {
/// println!("Registered post-processor: {}", name);
/// }
/// ```
public func listPostProcessors() throws -> [String] {
    let bridgedNames = try RustBridge.listPostProcessors()
    // Convert each bridged string into a native Swift `String`.
    return bridgedNames.map { name in name.as_str().toString() }
}
/// List names of all registered renderers.
///
/// # Errors
///
/// Returns an error if the registry lock is poisoned.
public func listRenderers() throws -> [String] {
    let bridgedNames = try RustBridge.listRenderers()
    // Convert each bridged string into a native Swift `String`.
    return bridgedNames.map { name in name.as_str().toString() }
}
/// List names of all registered validators.
public func listValidators() throws -> [String] {
    let bridgedNames = try RustBridge.listValidators()
    // Convert each bridged string into a native Swift `String`.
    return bridgedNames.map { name in name.as_str().toString() }
}
/// Compare two extraction results and return a structured diff.
///
/// The comparison is purely structural - no I/O, no side effects. All fields
/// of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
///
/// # Arguments
///
/// * `a` - the "before" extraction result
/// * `b` - the "after" extraction result
/// * `opts` - controls which sections are compared and optional truncation
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::{ExtractionResult, diff::{compare, DiffOptions}};
///
/// let mut a = ExtractionResult::default();
/// let mut b = ExtractionResult::default();
/// a.content = "Hello world".to_string();
/// b.content = "Hello Rust".to_string();
///
/// let diff = compare(&a, &b, &DiffOptions::default());
/// assert_eq!(diff.content_diff.len(), 1);
/// ```
public func compare(a: ExtractionResult, b: ExtractionResult, opts: DiffOptions) throws -> ExtractionDiff {
    // Only the options need converting; `a` and `b` are passed to the bridge as-is.
    let bridgedOptions = try opts.intoRust()
    let diff = RustBridge.compare(a, b, bridgedOptions)
    return diff
}
/// Generate embeddings asynchronously for a list of text strings.
///
/// This is the async counterpart to [`embed_texts`]. It offloads the blocking
/// ONNX inference work to a dedicated blocking thread pool via Tokio's
/// `spawn_blocking`, keeping the async executor free.
///
/// Returns one embedding vector per input text in the same order.
///
/// # Arguments
///
/// * `texts` - Vec of strings to embed (owned, sent to blocking thread)
/// * `config` - Embedding configuration specifying model, batch size, and normalization
///
/// # Errors
///
/// - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed
/// - `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
/// or the blocking inference task panics
///
/// # Example
///
/// ```rust,ignore
/// use kreuzberg::{embed_texts_async, EmbeddingConfig};
///
/// let embeddings = embed_texts_async(
/// vec!["Hello!".to_string()],
/// &EmbeddingConfig::default(),
/// ).await?;
/// ```
public func embedTextsAsync(texts: [String], config: EmbeddingConfig) async throws -> [[Float]] {
    // Wrap each Swift string in a `RustString` and collect them into a bridge vector.
    let bridgedTexts = RustVec<RustString>()
    for text in texts {
        bridgedTexts.push(value: RustString(text))
    }
    return try await Task.detached(priority: .userInitiated) {
        // The bridge result is converted to a string and decoded as a JSON array of float arrays.
        let embeddingsJson = try RustBridge.embedTextsAsync(bridgedTexts, config).toString()
        let payload = embeddingsJson.data(using: .utf8) ?? Data()
        let decoder = JSONDecoder()
        return try decoder.decode([[Float]].self, from: payload)
    }.value
}
/// Render a single PDF page to PNG bytes.
///
/// Returns raw PNG-encoded bytes for the specified page at the given DPI.
/// Uses pdf_oxide with tiny-skia for pure-Rust rendering.
///
/// # Arguments
///
/// * `pdf_bytes` - Raw PDF file bytes
/// * `page_index` - Zero-based page index
/// * `dpi` - Resolution in dots per inch (default: 150)
/// * `password` - Optional password for encrypted PDFs
///
/// # Errors
///
/// Returns `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
/// or rendered, or if `page_index` is out of range.
public func renderPdfPageToPng(pdfBytes: [UInt8], pageIndex: UInt, dpi: Int32?, password: String?) throws -> [UInt8] {
    // Copy the Swift byte array into a bridge vector, one byte at a time.
    let bridgedPdf = RustVec<UInt8>()
    for byte in pdfBytes {
        bridgedPdf.push(value: byte)
    }
    let bridgedPng = try RustBridge.renderPdfPageToPng(bridgedPdf, pageIndex, dpi, password)
    // Identity map: copies the bridged bytes out into a native `[UInt8]`.
    return bridgedPng.map { byte in byte }
}
/// Detect the MIME type of a file at the given path.
///
/// Uses the file extension and optionally the file content to determine the MIME type.
/// Set `check_exists` to `true` to verify the file exists before detection.
public func detectMimeType(path: String, checkExists: Bool) throws -> String {
    let detected = try RustBridge.detectMimeType(path, checkExists)
    return detected.toString()
}
/// Embed a list of texts using the configured embedding model.
///
/// Returns a 2D vector where each inner vector is the embedding for the corresponding text.
public func embedTexts(texts: [String], config: EmbeddingConfig) throws -> [[Float]] {
    // Wrap each Swift string in a `RustString` and collect them into a bridge vector.
    let bridgedTexts = RustVec<RustString>()
    for text in texts {
        bridgedTexts.push(value: RustString(text))
    }
    // The bridge result is converted to a string and decoded as a JSON array of float arrays.
    let embeddingsJson = try RustBridge.embedTexts(bridgedTexts, config).toString()
    let payload = embeddingsJson.data(using: .utf8) ?? Data()
    let decoder = JSONDecoder()
    return try decoder.decode([[Float]].self, from: payload)
}
/// Get an embedding preset by name.
///
/// Returns `None` if no preset with the given name exists. Returns an owned
/// clone so the value is safe to pass across FFI boundaries.
public func getEmbeddingPreset(name: String) throws -> EmbeddingPreset? {
    // A `nil` from the bridge is passed through unchanged.
    guard let bridgedPreset = try RustBridge.getEmbeddingPreset(name) else {
        return nil
    }
    return try EmbeddingPreset(bridgedPreset)
}
/// List the names of all available embedding presets.
///
/// Returns owned `String`s so the values are safe to pass across FFI boundaries.
public func listEmbeddingPresets() -> [String] {
    let bridgedNames = RustBridge.listEmbeddingPresets()
    // Convert each bridged string into a native Swift `String`.
    return bridgedNames.map { name in name.as_str().toString() }
}
// MARK: - Trait Bridge Registration Forwarders
// Top-level `public func` re-exports of the swift-bridge-generated
// `register_*` / `unregister_*` / `clear_*` plugin registration entry
// points so consumers do not need to `import RustBridge` for plugin work.
/// Registers an inbound `OcrBackend` plugin implementation.
///
/// The Swift host wraps an `OcrBackend` conformer in a `SwiftOcrBackendBox` adapter
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
/// register the plugin in the global registry.
///
/// - Parameter swiftBox: The boxed plugin, forwarded to `RustBridge.registerOcrBackend`.
/// - Throws: Any error raised by the bridge call.
public func registerOcrBackend(_ swiftBox: SwiftOcrBackendBox) throws {
try RustBridge.registerOcrBackend(swiftBox)
}
/// Unregister a previously-registered `OcrBackend` plugin by name.
///
/// - Parameter name: The name of the plugin to unregister.
/// - Throws: Rethrows any error raised by `RustBridge.unregisterOcrBackend`.
public func unregisterOcrBackend(_ name: String) throws {
try RustBridge.unregisterOcrBackend(name)
}
/// Remove every registered `OcrBackend` plugin. Typically used in test teardown.
///
/// - Throws: Rethrows any error raised by `RustBridge.clearOcrBackends`.
public func clearOcrBackends() throws {
try RustBridge.clearOcrBackends()
}
/// Register an inbound `PostProcessor` plugin implementation.
///
/// The Swift host wraps a `PostProcessor` conformer in a `SwiftPostProcessorBox` adapter
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
/// register the plugin in the global registry.
///
/// - Parameter swiftBox: The `SwiftPostProcessorBox` adapter to register.
/// - Throws: Rethrows any error raised by `RustBridge.registerPostProcessor`.
public func registerPostProcessor(_ swiftBox: SwiftPostProcessorBox) throws {
try RustBridge.registerPostProcessor(swiftBox)
}
/// Unregister a previously-registered `PostProcessor` plugin by name.
///
/// - Parameter name: The name of the plugin to unregister.
/// - Throws: Rethrows any error raised by `RustBridge.unregisterPostProcessor`.
public func unregisterPostProcessor(_ name: String) throws {
try RustBridge.unregisterPostProcessor(name)
}
/// Remove every registered `PostProcessor` plugin. Typically used in test teardown.
///
/// - Throws: Rethrows any error raised by `RustBridge.clearPostProcessors`.
public func clearPostProcessors() throws {
try RustBridge.clearPostProcessors()
}
/// Register an inbound `Validator` plugin implementation.
///
/// The Swift host wraps a `Validator` conformer in a `SwiftValidatorBox` adapter
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
/// register the plugin in the global registry.
///
/// - Parameter swiftBox: The `SwiftValidatorBox` adapter to register.
/// - Throws: Rethrows any error raised by `RustBridge.registerValidator`.
public func registerValidator(_ swiftBox: SwiftValidatorBox) throws {
try RustBridge.registerValidator(swiftBox)
}
/// Unregister a previously-registered `Validator` plugin by name.
///
/// - Parameter name: The name of the plugin to unregister.
/// - Throws: Rethrows any error raised by `RustBridge.unregisterValidator`.
public func unregisterValidator(_ name: String) throws {
try RustBridge.unregisterValidator(name)
}
/// Remove every registered `Validator` plugin. Typically used in test teardown.
///
/// - Throws: Rethrows any error raised by `RustBridge.clearValidators`.
public func clearValidators() throws {
try RustBridge.clearValidators()
}
/// Register an inbound `EmbeddingBackend` plugin implementation.
///
/// The Swift host wraps an `EmbeddingBackend` conformer in a `SwiftEmbeddingBackendBox` adapter
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
/// register the plugin in the global registry.
///
/// - Parameter swiftBox: The `SwiftEmbeddingBackendBox` adapter to register.
/// - Throws: Rethrows any error raised by `RustBridge.registerEmbeddingBackend`.
public func registerEmbeddingBackend(_ swiftBox: SwiftEmbeddingBackendBox) throws {
try RustBridge.registerEmbeddingBackend(swiftBox)
}
/// Unregister a previously-registered `EmbeddingBackend` plugin by name.
///
/// - Parameter name: The name of the plugin to unregister.
/// - Throws: Rethrows any error raised by `RustBridge.unregisterEmbeddingBackend`.
public func unregisterEmbeddingBackend(_ name: String) throws {
try RustBridge.unregisterEmbeddingBackend(name)
}
/// Remove every registered `EmbeddingBackend` plugin. Typically used in test teardown.
///
/// - Throws: Rethrows any error raised by `RustBridge.clearEmbeddingBackends`.
public func clearEmbeddingBackends() throws {
try RustBridge.clearEmbeddingBackends()
}
/// Register an inbound `DocumentExtractor` plugin implementation.
///
/// The Swift host wraps a `DocumentExtractor` conformer in a `SwiftDocumentExtractorBox` adapter
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
/// register the plugin in the global registry.
///
/// - Parameter swiftBox: The `SwiftDocumentExtractorBox` adapter to register.
/// - Throws: Rethrows any error raised by `RustBridge.registerDocumentExtractor`.
public func registerDocumentExtractor(_ swiftBox: SwiftDocumentExtractorBox) throws {
try RustBridge.registerDocumentExtractor(swiftBox)
}
/// Unregister a previously-registered `DocumentExtractor` plugin by name.
///
/// - Parameter name: The name of the plugin to unregister.
/// - Throws: Rethrows any error raised by `RustBridge.unregisterDocumentExtractor`.
public func unregisterDocumentExtractor(_ name: String) throws {
try RustBridge.unregisterDocumentExtractor(name)
}
/// Remove every registered `DocumentExtractor` plugin. Typically used in test teardown.
///
/// - Throws: Rethrows any error raised by `RustBridge.clearDocumentExtractors`.
public func clearDocumentExtractors() throws {
try RustBridge.clearDocumentExtractors()
}
/// Register an inbound `Renderer` plugin implementation.
///
/// The Swift host wraps a `Renderer` conformer in a `SwiftRendererBox` adapter
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
/// register the plugin in the global registry.
///
/// - Parameter swiftBox: The `SwiftRendererBox` adapter to register.
/// - Throws: Rethrows any error raised by `RustBridge.registerRenderer`.
public func registerRenderer(_ swiftBox: SwiftRendererBox) throws {
try RustBridge.registerRenderer(swiftBox)
}
/// Unregister a previously-registered `Renderer` plugin by name.
///
/// - Parameter name: The name of the plugin to unregister.
/// - Throws: Rethrows any error raised by `RustBridge.unregisterRenderer`.
public func unregisterRenderer(_ name: String) throws {
try RustBridge.unregisterRenderer(name)
}
/// Remove every registered `Renderer` plugin. Typically used in test teardown.
///
/// - Throws: Rethrows any error raised by `RustBridge.clearRenderers`.
public func clearRenderers() throws {
try RustBridge.clearRenderers()
}
// Sendable conformances for swift-bridge opaque types.
// `@unchecked` means the Swift compiler does not verify these conformances;
// each one relies on the generator's statement that the Rust type is Send + Sync.

// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.ContentFilterConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.ExtractionConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.ImageExtractionConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.TokenReductionOptions: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.LanguageDetectionConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.HtmlOutputConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.LayoutDetectionConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.OcrQualityThresholds: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.OcrConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.PageConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.PdfConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.HierarchyConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.PostProcessorConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.ChunkingConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.EmbeddingConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.TreeSitterConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.TreeSitterProcessConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.ServerConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.SecurityLimits: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.TokenReductionConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.DocumentStructure: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.ExtractionResult: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.ImagePreprocessingConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.TesseractConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.DiffOptions: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.ExtractionDiff: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.EmbeddingPreset: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.YakeParams: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.RakeParams: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.KeywordConfig: @unchecked Sendable {}
// swift-bridge opaque type used across Task.detached boundaries; the Rust type is Send + Sync.
extension RustBridge.PaddleOcrConfig: @unchecked Sendable {}