8478 lines
355 KiB
Swift
Generated
8478 lines
355 KiB
Swift
Generated
// Generated by alef. Do not edit by hand.
|
||
// swift-format-ignore-file
|
||
|
||
import Foundation
|
||
import RustBridge
|
||
|
||
public struct CacheStats: Codable, Sendable, Hashable {
|
||
public let totalFiles: UInt
|
||
public let totalSizeMb: Double
|
||
public let availableSpaceMb: Double
|
||
public let oldestFileAgeDays: Double
|
||
public let newestFileAgeDays: Double
|
||
public init(totalFiles: UInt, totalSizeMb: Double, availableSpaceMb: Double, oldestFileAgeDays: Double, newestFileAgeDays: Double) {
|
||
self.totalFiles = totalFiles
|
||
self.totalSizeMb = totalSizeMb
|
||
self.availableSpaceMb = availableSpaceMb
|
||
self.oldestFileAgeDays = oldestFileAgeDays
|
||
self.newestFileAgeDays = newestFileAgeDays
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case totalFiles = "total_files"
|
||
case totalSizeMb = "total_size_mb"
|
||
case availableSpaceMb = "available_space_mb"
|
||
case oldestFileAgeDays = "oldest_file_age_days"
|
||
case newestFileAgeDays = "newest_file_age_days"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for CacheStats
|
||
internal extension CacheStats {
|
||
init(_ rb: RustBridge.CacheStatsRef) throws {
|
||
self.totalFiles = rb.totalFiles()
|
||
self.totalSizeMb = rb.totalSizeMb()
|
||
self.availableSpaceMb = rb.availableSpaceMb()
|
||
self.oldestFileAgeDays = rb.oldestFileAgeDays()
|
||
self.newestFileAgeDays = rb.newestFileAgeDays()
|
||
}
|
||
func intoRust() throws -> RustBridge.CacheStats {
|
||
return RustBridge.CacheStats(self.totalFiles, self.totalSizeMb, self.availableSpaceMb, self.oldestFileAgeDays, self.newestFileAgeDays)
|
||
}
|
||
}
|
||
|
||
/// Hardware acceleration configuration for ONNX Runtime models.
|
||
///
|
||
/// Controls which execution provider (CPU, CoreML, CUDA, TensorRT) is used
|
||
/// for inference in layout detection and embedding generation.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust
|
||
/// use kreuzberg::AccelerationConfig;
|
||
///
|
||
/// // Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere
|
||
/// let config = AccelerationConfig::default();
|
||
///
|
||
/// // Force CPU only
|
||
/// let config = AccelerationConfig {
|
||
/// provider: kreuzberg::ExecutionProviderType::Cpu,
|
||
/// ..Default::default()
|
||
/// };
|
||
/// ```
|
||
public struct AccelerationConfig: Codable, Sendable, Hashable {
|
||
/// Execution provider to use for ONNX inference.
|
||
public let provider: ExecutionProviderType
|
||
/// GPU device ID (for CUDA/TensorRT). Ignored for CPU/CoreML/Auto.
|
||
public let deviceId: UInt32
|
||
public init(provider: ExecutionProviderType, deviceId: UInt32) {
|
||
self.provider = provider
|
||
self.deviceId = deviceId
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case provider = "provider"
|
||
case deviceId = "device_id"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.provider = try container.decode(ExecutionProviderType.self, forKey: .provider)
|
||
self.deviceId = try container.decodeIfPresent(UInt32.self, forKey: .deviceId) ?? 0
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for AccelerationConfig
|
||
internal extension AccelerationConfig {
|
||
init(_ rb: RustBridge.AccelerationConfigRef) throws {
|
||
self.provider = ExecutionProviderType(rawValue: rb.provider().toString()) ?? { fatalError("Unknown ExecutionProviderType: \(rb.provider().toString())") }()
|
||
self.deviceId = rb.deviceId()
|
||
}
|
||
func intoRust() throws -> RustBridge.AccelerationConfig {
|
||
return RustBridge.AccelerationConfig(try self.provider.intoRust(), self.deviceId)
|
||
}
|
||
}
|
||
|
||
/// Cross-extractor content filtering configuration.
|
||
///
|
||
/// Controls whether "furniture" content (headers, footers, page numbers,
|
||
/// watermarks, repeating text) is included in or stripped from extraction
|
||
/// results. Applies across all extractors (PDF, DOCX, RTF, ODT, HTML, etc.)
|
||
/// with format-specific implementation.
|
||
///
|
||
/// When `None` on `ExtractionConfig`, each extractor uses its current
|
||
/// default behavior unchanged.
|
||
public struct ContentFilterConfig: Codable, Sendable, Hashable {
|
||
/// Include running headers in extraction output.
|
||
///
|
||
/// - PDF: Disables top-margin furniture stripping and prevents the layout
|
||
/// model from treating `PageHeader`-classified regions as furniture.
|
||
/// - DOCX: Includes document headers in text output.
|
||
/// - RTF/ODT: Headers already included; this is a no-op when true.
|
||
/// - HTML/EPUB: Keeps `<header>` element content.
|
||
///
|
||
/// Default: `false` (headers are stripped or excluded).
|
||
public let includeHeaders: Bool
|
||
/// Include running footers in extraction output.
|
||
///
|
||
/// - PDF: Disables bottom-margin furniture stripping and prevents the layout
|
||
/// model from treating `PageFooter`-classified regions as furniture.
|
||
/// - DOCX: Includes document footers in text output.
|
||
/// - RTF/ODT: Footers already included; this is a no-op when true.
|
||
/// - HTML/EPUB: Keeps `<footer>` element content.
|
||
///
|
||
/// Default: `false` (footers are stripped or excluded).
|
||
public let includeFooters: Bool
|
||
/// Enable the heuristic cross-page repeating text detector.
|
||
///
|
||
/// When `true` (default), text that repeats verbatim across a supermajority
|
||
/// of pages is classified as furniture and stripped. Disable this if brand
|
||
/// names or repeated headings are being incorrectly removed by the heuristic.
|
||
///
|
||
/// Note: when a layout-detection model is active, the model may independently
|
||
/// classify page-header / page-footer regions as furniture on a per-page basis.
|
||
/// To preserve those regions, set `include_headers = true`, `include_footers = true`,
|
||
/// or both, in addition to disabling this flag.
|
||
///
|
||
/// Primarily affects PDF extraction.
|
||
///
|
||
/// Default: `true`.
|
||
public let stripRepeatingText: Bool
|
||
/// Include watermark text in extraction output.
|
||
///
|
||
/// - PDF: Keeps watermark artifacts and arXiv identifiers.
|
||
/// - Other formats: No effect currently.
|
||
///
|
||
/// Default: `false` (watermarks are stripped).
|
||
public let includeWatermarks: Bool
|
||
public init(includeHeaders: Bool, includeFooters: Bool, stripRepeatingText: Bool, includeWatermarks: Bool) {
|
||
self.includeHeaders = includeHeaders
|
||
self.includeFooters = includeFooters
|
||
self.stripRepeatingText = stripRepeatingText
|
||
self.includeWatermarks = includeWatermarks
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case includeHeaders = "include_headers"
|
||
case includeFooters = "include_footers"
|
||
case stripRepeatingText = "strip_repeating_text"
|
||
case includeWatermarks = "include_watermarks"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.includeHeaders = try container.decodeIfPresent(Bool.self, forKey: .includeHeaders) ?? false
|
||
self.includeFooters = try container.decodeIfPresent(Bool.self, forKey: .includeFooters) ?? false
|
||
self.stripRepeatingText = try container.decodeIfPresent(Bool.self, forKey: .stripRepeatingText) ?? true
|
||
self.includeWatermarks = try container.decodeIfPresent(Bool.self, forKey: .includeWatermarks) ?? false
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ContentFilterConfig
|
||
internal extension ContentFilterConfig {
|
||
init(_ rb: RustBridge.ContentFilterConfigRef) throws {
|
||
self.includeHeaders = rb.includeHeaders()
|
||
self.includeFooters = rb.includeFooters()
|
||
self.stripRepeatingText = rb.stripRepeatingText()
|
||
self.includeWatermarks = rb.includeWatermarks()
|
||
}
|
||
func intoRust() throws -> RustBridge.ContentFilterConfig {
|
||
return RustBridge.ContentFilterConfig(self.includeHeaders, self.includeFooters, self.stripRepeatingText, self.includeWatermarks)
|
||
}
|
||
}
|
||
|
||
/// Configuration for email extraction.
|
||
public struct EmailConfig: Codable, Sendable, Hashable {
|
||
/// Windows codepage number to use when an MSG file contains no codepage property.
|
||
/// Defaults to `None`, which falls back to windows-1252.
|
||
///
|
||
/// If an unrecognized or invalid codepage number is supplied (including 0),
|
||
/// the behavior silently falls back to windows-1252 — the same as when the
|
||
/// MSG file itself contains an unrecognized codepage. No error or warning is
|
||
/// emitted. Users should verify output when supplying unusual values.
|
||
///
|
||
/// Common values:
|
||
/// - 1250: Central European (Polish, Czech, Hungarian, etc.)
|
||
/// - 1251: Cyrillic (Russian, Ukrainian, Bulgarian, etc.)
|
||
/// - 1252: Western European (default)
|
||
/// - 1253: Greek
|
||
/// - 1254: Turkish
|
||
/// - 1255: Hebrew
|
||
/// - 1256: Arabic
|
||
/// - 932: Japanese (Shift-JIS)
|
||
/// - 936: Simplified Chinese (GBK)
|
||
public let msgFallbackCodepage: UInt32?
|
||
public init(msgFallbackCodepage: UInt32? = nil) {
|
||
self.msgFallbackCodepage = msgFallbackCodepage
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case msgFallbackCodepage = "msg_fallback_codepage"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.msgFallbackCodepage = try container.decodeIfPresent(UInt32.self, forKey: .msgFallbackCodepage) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for EmailConfig
|
||
internal extension EmailConfig {
|
||
init(_ rb: RustBridge.EmailConfigRef) throws {
|
||
self.msgFallbackCodepage = rb.msgFallbackCodepage()
|
||
}
|
||
func intoRust() throws -> RustBridge.EmailConfig {
|
||
return RustBridge.EmailConfig(self.msgFallbackCodepage)
|
||
}
|
||
}
|
||
|
||
/// Main extraction configuration.
|
||
///
|
||
/// This struct contains all configuration options for the extraction process.
|
||
/// It can be loaded from TOML, YAML, or JSON files, or created programmatically.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust
|
||
/// use kreuzberg::core::config::ExtractionConfig;
|
||
///
|
||
/// // Create with defaults
|
||
/// let config = ExtractionConfig::default();
|
||
///
|
||
/// // Load from TOML file
|
||
/// // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
|
||
/// ```
|
||
public typealias ExtractionConfig = RustBridge.ExtractionConfig
|
||
|
||
/// Per-file extraction configuration overrides for batch processing.
|
||
///
|
||
/// All fields are `Option<T>` — `None` means "use the batch-level default."
|
||
/// This type is used with `batch_extract_files` and
|
||
/// `batch_extract_bytes` to allow heterogeneous
|
||
/// extraction settings within a single batch.
|
||
///
|
||
/// # Excluded Fields
|
||
///
|
||
/// The following `ExtractionConfig` fields are batch-level only and
|
||
/// cannot be overridden per file:
|
||
/// - `max_concurrent_extractions` — controls batch parallelism
|
||
/// - `use_cache` — global caching policy
|
||
/// - `acceleration` — shared ONNX execution provider
|
||
/// - `security_limits` — global archive security policy
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust
|
||
/// use kreuzberg::FileExtractionConfig;
|
||
///
|
||
/// // Override just OCR forcing for a specific file
|
||
/// let config = FileExtractionConfig {
|
||
/// force_ocr: Some(true),
|
||
/// ..Default::default()
|
||
/// };
|
||
/// ```
|
||
public typealias FileExtractionConfig = RustBridge.FileExtractionConfig
|
||
|
||
/// Batch item for byte array extraction.
|
||
///
|
||
/// Used with `batch_extract_bytes` and `batch_extract_bytes_sync`
|
||
/// to represent a single item in a batch extraction job.
|
||
public typealias BatchBytesItem = RustBridge.BatchBytesItem
|
||
|
||
/// Batch item for file extraction.
|
||
///
|
||
/// Used with `batch_extract_files` and `batch_extract_files_sync`
|
||
/// to represent a single file in a batch extraction job.
|
||
public typealias BatchFileItem = RustBridge.BatchFileItem
|
||
|
||
/// Image extraction configuration.
|
||
public struct ImageExtractionConfig: Codable, Sendable, Hashable {
|
||
/// Extract images from documents
|
||
public let extractImages: Bool
|
||
/// Target DPI for image normalization
|
||
public let targetDpi: Int32
|
||
/// Maximum dimension for images (width or height)
|
||
public let maxImageDimension: Int32
|
||
/// Whether to inject image reference placeholders into markdown output.
|
||
/// When `true` (default), image references like ``
|
||
/// are appended to the markdown. Set to `false` to extract images as data
|
||
/// without polluting the markdown output.
|
||
public let injectPlaceholders: Bool
|
||
/// Automatically adjust DPI based on image content
|
||
public let autoAdjustDpi: Bool
|
||
/// Minimum DPI threshold
|
||
public let minDpi: Int32
|
||
/// Maximum DPI threshold
|
||
public let maxDpi: Int32
|
||
/// Maximum number of image objects to extract per PDF page.
|
||
///
|
||
/// Some PDFs (e.g. technical diagrams stored as thousands of raster fragments)
|
||
/// can trigger extremely long or indefinite extraction times when every image
|
||
/// object on a dense page is decoded individually via the PDF extractor. Setting this
|
||
/// limit causes kreuzberg to stop collecting individual images once the count
|
||
/// per page reaches the cap and emit a warning instead.
|
||
///
|
||
/// `None` (default) means no limit — all images are extracted.
|
||
public let maxImagesPerPage: UInt32?
|
||
/// When `true` (default), extracted images are classified by kind and grouped
|
||
/// into clusters where they appear to belong to one figure.
|
||
public let classify: Bool
|
||
/// When `true`, full-page renders produced during OCR preprocessing are captured
|
||
/// and returned as `ImageKind::PageRaster` entries in `ExtractionResult.images`.
|
||
///
|
||
/// **PDF + OCR only.** No rasters are captured for non-PDF inputs or when the
|
||
/// document-level OCR bypass is active (whole-document backend). When OCR is
|
||
/// enabled and this flag is set but the active backend skips per-page rendering,
|
||
/// a `ProcessingWarning` is emitted in `ExtractionResult.processing_warnings`.
|
||
///
|
||
/// Defaults to `false`. Enable when downstream consumers need page thumbnails
|
||
/// (e.g. citation previews, visual grounding).
|
||
public let includePageRasters: Bool
|
||
/// Run OCR on extracted images and include the recognized text in the document content.
|
||
///
|
||
/// When `true` (default) and `ExtractionConfig.ocr` is configured, extracted images
|
||
/// are processed with the configured OCR backend. Set to `false` to extract images
|
||
/// without OCR processing, even when OCR is enabled.
|
||
public let runOcrOnImages: Bool
|
||
/// When `true`, image OCR results are rendered as plain text without the
|
||
/// `` markdown placeholder. Only takes effect when `run_ocr_on_images`
|
||
/// is also `true`.
|
||
public let ocrTextOnly: Bool
|
||
/// When `true` and `ocr_text_only` is `false`, append the OCR text after
|
||
/// the image placeholder in the rendered output.
|
||
public let appendOcrText: Bool
|
||
public init(extractImages: Bool, targetDpi: Int32, maxImageDimension: Int32, injectPlaceholders: Bool, autoAdjustDpi: Bool, minDpi: Int32, maxDpi: Int32, maxImagesPerPage: UInt32? = nil, classify: Bool, includePageRasters: Bool, runOcrOnImages: Bool, ocrTextOnly: Bool, appendOcrText: Bool) {
|
||
self.extractImages = extractImages
|
||
self.targetDpi = targetDpi
|
||
self.maxImageDimension = maxImageDimension
|
||
self.injectPlaceholders = injectPlaceholders
|
||
self.autoAdjustDpi = autoAdjustDpi
|
||
self.minDpi = minDpi
|
||
self.maxDpi = maxDpi
|
||
self.maxImagesPerPage = maxImagesPerPage
|
||
self.classify = classify
|
||
self.includePageRasters = includePageRasters
|
||
self.runOcrOnImages = runOcrOnImages
|
||
self.ocrTextOnly = ocrTextOnly
|
||
self.appendOcrText = appendOcrText
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case extractImages = "extract_images"
|
||
case targetDpi = "target_dpi"
|
||
case maxImageDimension = "max_image_dimension"
|
||
case injectPlaceholders = "inject_placeholders"
|
||
case autoAdjustDpi = "auto_adjust_dpi"
|
||
case minDpi = "min_dpi"
|
||
case maxDpi = "max_dpi"
|
||
case maxImagesPerPage = "max_images_per_page"
|
||
case classify = "classify"
|
||
case includePageRasters = "include_page_rasters"
|
||
case runOcrOnImages = "run_ocr_on_images"
|
||
case ocrTextOnly = "ocr_text_only"
|
||
case appendOcrText = "append_ocr_text"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.extractImages = try container.decodeIfPresent(Bool.self, forKey: .extractImages) ?? true
|
||
self.targetDpi = try container.decodeIfPresent(Int32.self, forKey: .targetDpi) ?? 300
|
||
self.maxImageDimension = try container.decodeIfPresent(Int32.self, forKey: .maxImageDimension) ?? 4096
|
||
self.injectPlaceholders = try container.decodeIfPresent(Bool.self, forKey: .injectPlaceholders) ?? true
|
||
self.autoAdjustDpi = try container.decodeIfPresent(Bool.self, forKey: .autoAdjustDpi) ?? true
|
||
self.minDpi = try container.decodeIfPresent(Int32.self, forKey: .minDpi) ?? 72
|
||
self.maxDpi = try container.decodeIfPresent(Int32.self, forKey: .maxDpi) ?? 600
|
||
self.maxImagesPerPage = try container.decodeIfPresent(UInt32.self, forKey: .maxImagesPerPage) ?? nil
|
||
self.classify = try container.decodeIfPresent(Bool.self, forKey: .classify) ?? true
|
||
self.includePageRasters = try container.decodeIfPresent(Bool.self, forKey: .includePageRasters) ?? false
|
||
self.runOcrOnImages = try container.decodeIfPresent(Bool.self, forKey: .runOcrOnImages) ?? true
|
||
self.ocrTextOnly = try container.decodeIfPresent(Bool.self, forKey: .ocrTextOnly) ?? false
|
||
self.appendOcrText = try container.decodeIfPresent(Bool.self, forKey: .appendOcrText) ?? false
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ImageExtractionConfig
|
||
internal extension ImageExtractionConfig {
|
||
init(_ rb: RustBridge.ImageExtractionConfigRef) throws {
|
||
self.extractImages = rb.extractImages()
|
||
self.targetDpi = rb.targetDpi()
|
||
self.maxImageDimension = rb.maxImageDimension()
|
||
self.injectPlaceholders = rb.injectPlaceholders()
|
||
self.autoAdjustDpi = rb.autoAdjustDpi()
|
||
self.minDpi = rb.minDpi()
|
||
self.maxDpi = rb.maxDpi()
|
||
self.maxImagesPerPage = rb.maxImagesPerPage()
|
||
self.classify = rb.classify()
|
||
self.includePageRasters = rb.includePageRasters()
|
||
self.runOcrOnImages = rb.runOcrOnImages()
|
||
self.ocrTextOnly = rb.ocrTextOnly()
|
||
self.appendOcrText = rb.appendOcrText()
|
||
}
|
||
func intoRust() throws -> RustBridge.ImageExtractionConfig {
|
||
return RustBridge.ImageExtractionConfig(self.extractImages, self.targetDpi, self.maxImageDimension, self.injectPlaceholders, self.autoAdjustDpi, self.minDpi, self.maxDpi, self.maxImagesPerPage, self.classify, self.includePageRasters, self.runOcrOnImages, self.ocrTextOnly, self.appendOcrText)
|
||
}
|
||
}
|
||
|
||
/// Token reduction configuration.
|
||
public struct TokenReductionOptions: Codable, Sendable, Hashable {
|
||
/// Reduction mode: "off", "light", "moderate", "aggressive", "maximum"
|
||
public let mode: String
|
||
/// Preserve important words (capitalized, technical terms)
|
||
public let preserveImportantWords: Bool
|
||
public init(mode: String, preserveImportantWords: Bool) {
|
||
self.mode = mode
|
||
self.preserveImportantWords = preserveImportantWords
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case mode = "mode"
|
||
case preserveImportantWords = "preserve_important_words"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.mode = try container.decodeIfPresent(String.self, forKey: .mode) ?? ""
|
||
self.preserveImportantWords = try container.decodeIfPresent(Bool.self, forKey: .preserveImportantWords) ?? true
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for TokenReductionOptions
|
||
internal extension TokenReductionOptions {
|
||
init(_ rb: RustBridge.TokenReductionOptionsRef) throws {
|
||
self.mode = rb.mode().toString()
|
||
self.preserveImportantWords = rb.preserveImportantWords()
|
||
}
|
||
func intoRust() throws -> RustBridge.TokenReductionOptions {
|
||
return RustBridge.TokenReductionOptions(RustString(self.mode), self.preserveImportantWords)
|
||
}
|
||
}
|
||
|
||
/// Language detection configuration.
|
||
public struct LanguageDetectionConfig: Codable, Sendable, Hashable {
|
||
/// Enable language detection
|
||
public let enabled: Bool
|
||
/// Minimum confidence threshold (0.0-1.0)
|
||
public let minConfidence: Double
|
||
/// Detect multiple languages in the document
|
||
public let detectMultiple: Bool
|
||
public init(enabled: Bool, minConfidence: Double, detectMultiple: Bool) {
|
||
self.enabled = enabled
|
||
self.minConfidence = minConfidence
|
||
self.detectMultiple = detectMultiple
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case enabled = "enabled"
|
||
case minConfidence = "min_confidence"
|
||
case detectMultiple = "detect_multiple"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.enabled = try container.decodeIfPresent(Bool.self, forKey: .enabled) ?? true
|
||
self.minConfidence = try container.decodeIfPresent(Double.self, forKey: .minConfidence) ?? 0.8
|
||
self.detectMultiple = try container.decodeIfPresent(Bool.self, forKey: .detectMultiple) ?? false
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for LanguageDetectionConfig
|
||
internal extension LanguageDetectionConfig {
|
||
init(_ rb: RustBridge.LanguageDetectionConfigRef) throws {
|
||
self.enabled = rb.enabled()
|
||
self.minConfidence = rb.minConfidence()
|
||
self.detectMultiple = rb.detectMultiple()
|
||
}
|
||
func intoRust() throws -> RustBridge.LanguageDetectionConfig {
|
||
return RustBridge.LanguageDetectionConfig(self.enabled, self.minConfidence, self.detectMultiple)
|
||
}
|
||
}
|
||
|
||
/// Configuration for styled HTML output.
|
||
///
|
||
/// When set on [`ExtractionConfig::html_output`] alongside
|
||
/// `output_format = OutputFormat::Html`, the pipeline builds a
|
||
/// [`StyledHtmlRenderer`](crate::rendering::StyledHtmlRenderer) instead of
|
||
/// the plain comrak-based renderer.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust
|
||
/// use kreuzberg::core::config::{HtmlOutputConfig, HtmlTheme};
|
||
///
|
||
/// let config = HtmlOutputConfig {
|
||
/// theme: HtmlTheme::GitHub,
|
||
/// css: Some(".kb-p { font-size: 1.1rem; }".to_string()),
|
||
/// ..Default::default()
|
||
/// };
|
||
/// ```
|
||
public typealias HtmlOutputConfig = RustBridge.HtmlOutputConfig
|
||
|
||
/// Layout detection configuration.
|
||
///
|
||
/// Controls layout detection behavior in the extraction pipeline.
|
||
/// When set on [`ExtractionConfig`](super::ExtractionConfig), layout detection
|
||
/// is enabled for PDF extraction.
|
||
public struct LayoutDetectionConfig: Codable, Sendable, Hashable {
|
||
/// Confidence threshold override (None = use model default).
|
||
public let confidenceThreshold: Float?
|
||
/// Whether to apply postprocessing heuristics (default: true).
|
||
public let applyHeuristics: Bool
|
||
/// Table structure recognition model.
|
||
///
|
||
/// Controls which model is used for table cell detection within layout-detected
|
||
/// table regions. Defaults to [`TableModel::Tatr`].
|
||
public let tableModel: TableModel
|
||
/// Hardware acceleration for ONNX models (layout detection + table structure).
|
||
///
|
||
/// When set, controls which execution provider (CPU, CUDA, CoreML, TensorRT)
|
||
/// is used for inference. Defaults to `None` (auto-select per platform).
|
||
public let acceleration: AccelerationConfig?
|
||
public init(confidenceThreshold: Float? = nil, applyHeuristics: Bool, tableModel: TableModel, acceleration: AccelerationConfig? = nil) {
|
||
self.confidenceThreshold = confidenceThreshold
|
||
self.applyHeuristics = applyHeuristics
|
||
self.tableModel = tableModel
|
||
self.acceleration = acceleration
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case confidenceThreshold = "confidence_threshold"
|
||
case applyHeuristics = "apply_heuristics"
|
||
case tableModel = "table_model"
|
||
case acceleration = "acceleration"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.confidenceThreshold = try container.decodeIfPresent(Float.self, forKey: .confidenceThreshold) ?? nil
|
||
self.applyHeuristics = try container.decodeIfPresent(Bool.self, forKey: .applyHeuristics) ?? true
|
||
self.tableModel = try container.decode(TableModel.self, forKey: .tableModel)
|
||
self.acceleration = try container.decodeIfPresent(AccelerationConfig.self, forKey: .acceleration) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for LayoutDetectionConfig
|
||
internal extension LayoutDetectionConfig {
|
||
init(_ rb: RustBridge.LayoutDetectionConfigRef) throws {
|
||
self.confidenceThreshold = rb.confidenceThreshold()
|
||
self.applyHeuristics = rb.applyHeuristics()
|
||
self.tableModel = TableModel(rawValue: rb.tableModel().toString()) ?? { fatalError("Unknown TableModel: \(rb.tableModel().toString())") }()
|
||
self.acceleration = try rb.acceleration().map { try AccelerationConfig($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.LayoutDetectionConfig {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.layoutDetectionConfigFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Configuration for an LLM provider/model via liter-llm.
|
||
///
|
||
/// Each feature (VLM OCR, VLM embeddings, structured extraction) carries
|
||
/// its own `LlmConfig`, allowing different providers per feature.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```toml
|
||
/// [structured_extraction.llm]
|
||
/// model = "openai/gpt-4o"
|
||
/// api_key = "sk-..." # or use KREUZBERG_LLM_API_KEY env var
|
||
/// ```
|
||
public struct LlmConfig: Codable, Sendable, Hashable {
|
||
/// Provider/model string using liter-llm routing format.
|
||
///
|
||
/// Examples: `"openai/gpt-4o"`, `"anthropic/claude-sonnet-4-20250514"`,
|
||
/// `"groq/llama-3.1-70b-versatile"`.
|
||
public let model: String
|
||
/// API key for the provider. When `None`, liter-llm falls back to
|
||
/// the provider's standard environment variable (e.g., `OPENAI_API_KEY`).
|
||
public let apiKey: String?
|
||
/// Custom base URL override for the provider endpoint.
|
||
public let baseUrl: String?
|
||
/// Request timeout in seconds (default: 60).
|
||
public let timeoutSecs: UInt64?
|
||
/// Maximum retry attempts (default: 3).
|
||
public let maxRetries: UInt32?
|
||
/// Sampling temperature for generation tasks.
|
||
public let temperature: Double?
|
||
/// Maximum tokens to generate.
|
||
public let maxTokens: UInt64?
|
||
public init(model: String, apiKey: String? = nil, baseUrl: String? = nil, timeoutSecs: UInt64? = nil, maxRetries: UInt32? = nil, temperature: Double? = nil, maxTokens: UInt64? = nil) {
|
||
self.model = model
|
||
self.apiKey = apiKey
|
||
self.baseUrl = baseUrl
|
||
self.timeoutSecs = timeoutSecs
|
||
self.maxRetries = maxRetries
|
||
self.temperature = temperature
|
||
self.maxTokens = maxTokens
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case model = "model"
|
||
case apiKey = "api_key"
|
||
case baseUrl = "base_url"
|
||
case timeoutSecs = "timeout_secs"
|
||
case maxRetries = "max_retries"
|
||
case temperature = "temperature"
|
||
case maxTokens = "max_tokens"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.model = try container.decodeIfPresent(String.self, forKey: .model) ?? ""
|
||
self.apiKey = try container.decodeIfPresent(String.self, forKey: .apiKey) ?? nil
|
||
self.baseUrl = try container.decodeIfPresent(String.self, forKey: .baseUrl) ?? nil
|
||
self.timeoutSecs = try container.decodeIfPresent(UInt64.self, forKey: .timeoutSecs) ?? nil
|
||
self.maxRetries = try container.decodeIfPresent(UInt32.self, forKey: .maxRetries) ?? nil
|
||
self.temperature = try container.decodeIfPresent(Double.self, forKey: .temperature) ?? nil
|
||
self.maxTokens = try container.decodeIfPresent(UInt64.self, forKey: .maxTokens) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for LlmConfig
|
||
internal extension LlmConfig {
|
||
init(_ rb: RustBridge.LlmConfigRef) throws {
|
||
self.model = rb.model().toString()
|
||
self.apiKey = rb.apiKey()?.toString()
|
||
self.baseUrl = rb.baseUrl()?.toString()
|
||
self.timeoutSecs = rb.timeoutSecs()
|
||
self.maxRetries = rb.maxRetries()
|
||
self.temperature = rb.temperature()
|
||
self.maxTokens = rb.maxTokens()
|
||
}
|
||
func intoRust() throws -> RustBridge.LlmConfig {
|
||
return RustBridge.LlmConfig(RustString(self.model), self.apiKey.map(RustString.init), self.baseUrl.map(RustString.init), self.timeoutSecs, self.maxRetries, self.temperature, self.maxTokens)
|
||
}
|
||
}
|
||
|
||
/// Configuration for LLM-based structured data extraction.
|
||
///
|
||
/// Sends extracted document content to a VLM with a JSON schema,
|
||
/// returning structured data that conforms to the schema.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```toml
|
||
/// [structured_extraction]
|
||
/// schema_name = "invoice_data"
|
||
/// strict = true
|
||
///
|
||
/// [structured_extraction.schema]
|
||
/// type = "object"
|
||
/// properties.vendor = { type = "string" }
|
||
/// properties.total = { type = "number" }
|
||
/// required = ["vendor", "total"]
|
||
///
|
||
/// [structured_extraction.llm]
|
||
/// model = "openai/gpt-4o"
|
||
/// ```
|
||
public typealias StructuredExtractionConfig = RustBridge.StructuredExtractionConfig
|
||
|
||
/// Quality thresholds for OCR fallback decisions and pipeline quality gating.
|
||
///
|
||
/// All fields default to the values that match the previous hardcoded behavior,
|
||
/// so `OcrQualityThresholds::default()` preserves existing semantics exactly.
|
||
public struct OcrQualityThresholds: Codable, Sendable, Hashable {
|
||
/// Minimum total non-whitespace characters to consider text substantive.
|
||
public let minTotalNonWhitespace: UInt
|
||
/// Minimum non-whitespace characters per page on average.
|
||
public let minNonWhitespacePerPage: Double
|
||
/// Minimum character count for a word to be "meaningful".
|
||
public let minMeaningfulWordLen: UInt
|
||
/// Minimum count of meaningful words before text is accepted.
|
||
public let minMeaningfulWords: UInt
|
||
/// Minimum alphanumeric ratio (non-whitespace chars that are alphanumeric).
|
||
public let minAlnumRatio: Double
|
||
/// Minimum Unicode replacement characters (U+FFFD) to trigger OCR fallback.
|
||
public let minGarbageChars: UInt
|
||
/// Maximum fraction of short (1-2 char) words before text is considered fragmented.
|
||
public let maxFragmentedWordRatio: Double
|
||
/// Critical fragmentation threshold — triggers OCR regardless of meaningful words.
|
||
/// Normal English text has ~20-30% short words. 80%+ is definitive garbage.
|
||
public let criticalFragmentedWordRatio: Double
|
||
/// Minimum average word length. Below this with enough words indicates garbled extraction.
|
||
public let minAvgWordLength: Double
|
||
/// Minimum word count before average word length check applies.
|
||
public let minWordsForAvgLengthCheck: UInt
|
||
/// Minimum consecutive word repetition ratio to detect column scrambling.
|
||
public let minConsecutiveRepeatRatio: Double
|
||
/// Minimum word count before consecutive repetition check is applied.
|
||
public let minWordsForRepeatCheck: UInt
|
||
/// Minimum character count for "substantive markdown" OCR skip gate.
|
||
public let substantiveMinChars: UInt
|
||
/// Minimum character count for "non-text content" OCR skip gate.
|
||
public let nonTextMinChars: UInt
|
||
/// Alphanumeric+whitespace ratio threshold for skip decisions.
|
||
public let alnumWsRatioThreshold: Double
|
||
/// Minimum quality score (0.0-1.0) for a pipeline stage result to be accepted.
|
||
/// If the result from a backend scores below this, try the next backend.
|
||
public let pipelineMinQuality: Double
|
||
public init(minTotalNonWhitespace: UInt, minNonWhitespacePerPage: Double, minMeaningfulWordLen: UInt, minMeaningfulWords: UInt, minAlnumRatio: Double, minGarbageChars: UInt, maxFragmentedWordRatio: Double, criticalFragmentedWordRatio: Double, minAvgWordLength: Double, minWordsForAvgLengthCheck: UInt, minConsecutiveRepeatRatio: Double, minWordsForRepeatCheck: UInt, substantiveMinChars: UInt, nonTextMinChars: UInt, alnumWsRatioThreshold: Double, pipelineMinQuality: Double) {
|
||
self.minTotalNonWhitespace = minTotalNonWhitespace
|
||
self.minNonWhitespacePerPage = minNonWhitespacePerPage
|
||
self.minMeaningfulWordLen = minMeaningfulWordLen
|
||
self.minMeaningfulWords = minMeaningfulWords
|
||
self.minAlnumRatio = minAlnumRatio
|
||
self.minGarbageChars = minGarbageChars
|
||
self.maxFragmentedWordRatio = maxFragmentedWordRatio
|
||
self.criticalFragmentedWordRatio = criticalFragmentedWordRatio
|
||
self.minAvgWordLength = minAvgWordLength
|
||
self.minWordsForAvgLengthCheck = minWordsForAvgLengthCheck
|
||
self.minConsecutiveRepeatRatio = minConsecutiveRepeatRatio
|
||
self.minWordsForRepeatCheck = minWordsForRepeatCheck
|
||
self.substantiveMinChars = substantiveMinChars
|
||
self.nonTextMinChars = nonTextMinChars
|
||
self.alnumWsRatioThreshold = alnumWsRatioThreshold
|
||
self.pipelineMinQuality = pipelineMinQuality
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case minTotalNonWhitespace = "min_total_non_whitespace"
|
||
case minNonWhitespacePerPage = "min_non_whitespace_per_page"
|
||
case minMeaningfulWordLen = "min_meaningful_word_len"
|
||
case minMeaningfulWords = "min_meaningful_words"
|
||
case minAlnumRatio = "min_alnum_ratio"
|
||
case minGarbageChars = "min_garbage_chars"
|
||
case maxFragmentedWordRatio = "max_fragmented_word_ratio"
|
||
case criticalFragmentedWordRatio = "critical_fragmented_word_ratio"
|
||
case minAvgWordLength = "min_avg_word_length"
|
||
case minWordsForAvgLengthCheck = "min_words_for_avg_length_check"
|
||
case minConsecutiveRepeatRatio = "min_consecutive_repeat_ratio"
|
||
case minWordsForRepeatCheck = "min_words_for_repeat_check"
|
||
case substantiveMinChars = "substantive_min_chars"
|
||
case nonTextMinChars = "non_text_min_chars"
|
||
case alnumWsRatioThreshold = "alnum_ws_ratio_threshold"
|
||
case pipelineMinQuality = "pipeline_min_quality"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.minTotalNonWhitespace = try container.decodeIfPresent(UInt.self, forKey: .minTotalNonWhitespace) ?? 64
|
||
self.minNonWhitespacePerPage = try container.decodeIfPresent(Double.self, forKey: .minNonWhitespacePerPage) ?? 32.0
|
||
self.minMeaningfulWordLen = try container.decodeIfPresent(UInt.self, forKey: .minMeaningfulWordLen) ?? 4
|
||
self.minMeaningfulWords = try container.decodeIfPresent(UInt.self, forKey: .minMeaningfulWords) ?? 3
|
||
self.minAlnumRatio = try container.decodeIfPresent(Double.self, forKey: .minAlnumRatio) ?? 0.3
|
||
self.minGarbageChars = try container.decodeIfPresent(UInt.self, forKey: .minGarbageChars) ?? 5
|
||
self.maxFragmentedWordRatio = try container.decodeIfPresent(Double.self, forKey: .maxFragmentedWordRatio) ?? 0.6
|
||
self.criticalFragmentedWordRatio = try container.decodeIfPresent(Double.self, forKey: .criticalFragmentedWordRatio) ?? 0.8
|
||
self.minAvgWordLength = try container.decodeIfPresent(Double.self, forKey: .minAvgWordLength) ?? 2.0
|
||
self.minWordsForAvgLengthCheck = try container.decodeIfPresent(UInt.self, forKey: .minWordsForAvgLengthCheck) ?? 50
|
||
self.minConsecutiveRepeatRatio = try container.decodeIfPresent(Double.self, forKey: .minConsecutiveRepeatRatio) ?? 0.08
|
||
self.minWordsForRepeatCheck = try container.decodeIfPresent(UInt.self, forKey: .minWordsForRepeatCheck) ?? 50
|
||
self.substantiveMinChars = try container.decodeIfPresent(UInt.self, forKey: .substantiveMinChars) ?? 100
|
||
self.nonTextMinChars = try container.decodeIfPresent(UInt.self, forKey: .nonTextMinChars) ?? 20
|
||
self.alnumWsRatioThreshold = try container.decodeIfPresent(Double.self, forKey: .alnumWsRatioThreshold) ?? 0.4
|
||
self.pipelineMinQuality = try container.decodeIfPresent(Double.self, forKey: .pipelineMinQuality) ?? 0.5
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for OcrQualityThresholds
|
||
internal extension OcrQualityThresholds {
|
||
init(_ rb: RustBridge.OcrQualityThresholdsRef) throws {
|
||
self.minTotalNonWhitespace = rb.minTotalNonWhitespace()
|
||
self.minNonWhitespacePerPage = rb.minNonWhitespacePerPage()
|
||
self.minMeaningfulWordLen = rb.minMeaningfulWordLen()
|
||
self.minMeaningfulWords = rb.minMeaningfulWords()
|
||
self.minAlnumRatio = rb.minAlnumRatio()
|
||
self.minGarbageChars = rb.minGarbageChars()
|
||
self.maxFragmentedWordRatio = rb.maxFragmentedWordRatio()
|
||
self.criticalFragmentedWordRatio = rb.criticalFragmentedWordRatio()
|
||
self.minAvgWordLength = rb.minAvgWordLength()
|
||
self.minWordsForAvgLengthCheck = rb.minWordsForAvgLengthCheck()
|
||
self.minConsecutiveRepeatRatio = rb.minConsecutiveRepeatRatio()
|
||
self.minWordsForRepeatCheck = rb.minWordsForRepeatCheck()
|
||
self.substantiveMinChars = rb.substantiveMinChars()
|
||
self.nonTextMinChars = rb.nonTextMinChars()
|
||
self.alnumWsRatioThreshold = rb.alnumWsRatioThreshold()
|
||
self.pipelineMinQuality = rb.pipelineMinQuality()
|
||
}
|
||
func intoRust() throws -> RustBridge.OcrQualityThresholds {
|
||
return RustBridge.OcrQualityThresholds(self.minTotalNonWhitespace, self.minNonWhitespacePerPage, self.minMeaningfulWordLen, self.minMeaningfulWords, self.minAlnumRatio, self.minGarbageChars, self.maxFragmentedWordRatio, self.criticalFragmentedWordRatio, self.minAvgWordLength, self.minWordsForAvgLengthCheck, self.minConsecutiveRepeatRatio, self.minWordsForRepeatCheck, self.substantiveMinChars, self.nonTextMinChars, self.alnumWsRatioThreshold, self.pipelineMinQuality)
|
||
}
|
||
}
|
||
|
||
/// A single backend stage in the OCR pipeline.
|
||
public typealias OcrPipelineStage = RustBridge.OcrPipelineStage
|
||
|
||
/// Multi-backend OCR pipeline with quality-based fallback.
|
||
///
|
||
/// Backends are tried in priority order (highest first). After each backend
|
||
/// produces output, quality is evaluated. If it meets `quality_thresholds.pipeline_min_quality`,
|
||
/// the result is accepted. Otherwise the next backend is tried.
|
||
public typealias OcrPipelineConfig = RustBridge.OcrPipelineConfig
|
||
|
||
/// OCR configuration.
|
||
public typealias OcrConfig = RustBridge.OcrConfig
|
||
|
||
/// Page extraction and tracking configuration.
|
||
///
|
||
/// Controls how pages are extracted, tracked, and represented in the extraction results.
|
||
/// When `None`, page tracking is disabled.
|
||
///
|
||
/// Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
|
||
/// when page boundaries are available and chunking is configured.
|
||
public struct PageConfig: Codable, Sendable, Hashable {
|
||
/// Extract pages as separate array (ExtractionResult.pages)
|
||
public let extractPages: Bool
|
||
/// Insert page markers in main content string
|
||
public let insertPageMarkers: Bool
|
||
/// Page marker format (use {page_num} placeholder)
|
||
/// Default: "\n\n<!-- PAGE {page_num} -->\n\n"
|
||
public let markerFormat: String
|
||
public init(extractPages: Bool, insertPageMarkers: Bool, markerFormat: String) {
|
||
self.extractPages = extractPages
|
||
self.insertPageMarkers = insertPageMarkers
|
||
self.markerFormat = markerFormat
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case extractPages = "extract_pages"
|
||
case insertPageMarkers = "insert_page_markers"
|
||
case markerFormat = "marker_format"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.extractPages = try container.decodeIfPresent(Bool.self, forKey: .extractPages) ?? false
|
||
self.insertPageMarkers = try container.decodeIfPresent(Bool.self, forKey: .insertPageMarkers) ?? false
|
||
self.markerFormat = try container.decodeIfPresent(String.self, forKey: .markerFormat) ?? "\n\n<!-- PAGE {page_num} -->\n\n"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PageConfig
|
||
internal extension PageConfig {
|
||
init(_ rb: RustBridge.PageConfigRef) throws {
|
||
self.extractPages = rb.extractPages()
|
||
self.insertPageMarkers = rb.insertPageMarkers()
|
||
self.markerFormat = rb.markerFormat().toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.PageConfig {
|
||
return RustBridge.PageConfig(self.extractPages, self.insertPageMarkers, RustString(self.markerFormat))
|
||
}
|
||
}
|
||
|
||
/// PDF-specific configuration.
|
||
public struct PdfConfig: Codable, Sendable, Hashable {
|
||
/// Extract images from PDF
|
||
public let extractImages: Bool
|
||
/// Extract tables from PDF.
|
||
///
|
||
/// When `true` (default), runs pdf_oxide's native grid detector and, if it
|
||
/// finds nothing, falls back to the heuristic text-layer reconstruction in
|
||
/// `pdf::oxide::table::extract_tables_heuristic`. Set to `false` to skip
|
||
/// both passes — `tables` will then be empty in the result.
|
||
public let extractTables: Bool
|
||
/// List of passwords to try when opening encrypted PDFs
|
||
public let passwords: [String]?
|
||
/// Extract PDF metadata
|
||
public let extractMetadata: Bool
|
||
/// Hierarchy extraction configuration (None = hierarchy extraction disabled)
|
||
public let hierarchy: HierarchyConfig?
|
||
/// Extract PDF annotations (text notes, highlights, links, stamps).
|
||
/// Default: false
|
||
public let extractAnnotations: Bool
|
||
/// Top margin fraction (0.0–1.0) of page height to exclude headers/running heads.
|
||
/// Default: 0.06 (6%)
|
||
public let topMarginFraction: Float?
|
||
/// Bottom margin fraction (0.0–1.0) of page height to exclude footers/page numbers.
|
||
/// Default: 0.05 (5%)
|
||
public let bottomMarginFraction: Float?
|
||
/// Allow single-column pseudo tables in extraction results.
|
||
///
|
||
/// By default, tables with fewer than 2 columns (layout-guided) or 3 columns
|
||
/// (heuristic) are rejected. When `true`, the minimum column count is relaxed
|
||
/// to 1, allowing single-column structured data (glossaries, itemized lists)
|
||
/// to be emitted as tables. Other quality filters (density, sparsity, prose
|
||
/// detection) still apply.
|
||
public let allowSingleColumnTables: Bool
|
||
/// Perform OCR on inline images extracted from PDF pages and attach the
|
||
/// recognized text to each `ExtractedImage.ocr_result`. Requires Tesseract
|
||
/// to be available; if `ExtractionConfig.ocr` is `None` the extractor
|
||
/// falls back to `TesseractConfig::default()`. Per-image failures degrade
|
||
/// gracefully (the image is returned without OCR text rather than failing
|
||
/// the whole extraction). Default: `false`.
|
||
public let ocrInlineImages: Bool
|
||
public init(extractImages: Bool, extractTables: Bool, passwords: [String]? = nil, extractMetadata: Bool, hierarchy: HierarchyConfig? = nil, extractAnnotations: Bool, topMarginFraction: Float? = nil, bottomMarginFraction: Float? = nil, allowSingleColumnTables: Bool, ocrInlineImages: Bool) {
|
||
self.extractImages = extractImages
|
||
self.extractTables = extractTables
|
||
self.passwords = passwords
|
||
self.extractMetadata = extractMetadata
|
||
self.hierarchy = hierarchy
|
||
self.extractAnnotations = extractAnnotations
|
||
self.topMarginFraction = topMarginFraction
|
||
self.bottomMarginFraction = bottomMarginFraction
|
||
self.allowSingleColumnTables = allowSingleColumnTables
|
||
self.ocrInlineImages = ocrInlineImages
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case extractImages = "extract_images"
|
||
case extractTables = "extract_tables"
|
||
case passwords = "passwords"
|
||
case extractMetadata = "extract_metadata"
|
||
case hierarchy = "hierarchy"
|
||
case extractAnnotations = "extract_annotations"
|
||
case topMarginFraction = "top_margin_fraction"
|
||
case bottomMarginFraction = "bottom_margin_fraction"
|
||
case allowSingleColumnTables = "allow_single_column_tables"
|
||
case ocrInlineImages = "ocr_inline_images"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.extractImages = try container.decodeIfPresent(Bool.self, forKey: .extractImages) ?? false
|
||
self.extractTables = try container.decodeIfPresent(Bool.self, forKey: .extractTables) ?? true
|
||
self.passwords = try container.decodeIfPresent([String].self, forKey: .passwords) ?? nil
|
||
self.extractMetadata = try container.decodeIfPresent(Bool.self, forKey: .extractMetadata) ?? true
|
||
self.hierarchy = try container.decodeIfPresent(HierarchyConfig.self, forKey: .hierarchy) ?? nil
|
||
self.extractAnnotations = try container.decodeIfPresent(Bool.self, forKey: .extractAnnotations) ?? false
|
||
self.topMarginFraction = try container.decodeIfPresent(Float.self, forKey: .topMarginFraction) ?? nil
|
||
self.bottomMarginFraction = try container.decodeIfPresent(Float.self, forKey: .bottomMarginFraction) ?? nil
|
||
self.allowSingleColumnTables = try container.decodeIfPresent(Bool.self, forKey: .allowSingleColumnTables) ?? false
|
||
self.ocrInlineImages = try container.decodeIfPresent(Bool.self, forKey: .ocrInlineImages) ?? false
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PdfConfig
|
||
internal extension PdfConfig {
|
||
init(_ rb: RustBridge.PdfConfigRef) throws {
|
||
self.extractImages = rb.extractImages()
|
||
self.extractTables = rb.extractTables()
|
||
self.passwords = rb.passwords()?.map { $0.as_str().toString() }
|
||
self.extractMetadata = rb.extractMetadata()
|
||
self.hierarchy = try rb.hierarchy().map { try HierarchyConfig($0) }
|
||
self.extractAnnotations = rb.extractAnnotations()
|
||
self.topMarginFraction = rb.topMarginFraction()
|
||
self.bottomMarginFraction = rb.bottomMarginFraction()
|
||
self.allowSingleColumnTables = rb.allowSingleColumnTables()
|
||
self.ocrInlineImages = rb.ocrInlineImages()
|
||
}
|
||
func intoRust() throws -> RustBridge.PdfConfig {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.pdfConfigFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Hierarchy extraction configuration for PDF text structure analysis.
|
||
///
|
||
/// Enables extraction of document hierarchy levels (H1-H6) based on font size
|
||
/// clustering and semantic analysis. When enabled, hierarchical blocks are
|
||
/// included in page content.
|
||
public struct HierarchyConfig: Codable, Sendable, Hashable {
|
||
/// Enable hierarchy extraction
|
||
public let enabled: Bool
|
||
/// Number of font size clusters to use for hierarchy levels (1-7)
|
||
///
|
||
/// Default: 6, which provides H1-H6 heading levels with body text.
|
||
/// Larger values create more fine-grained hierarchy levels.
|
||
public let kClusters: UInt
|
||
/// Include bounding box information in hierarchy blocks
|
||
public let includeBbox: Bool
|
||
/// OCR coverage threshold for smart OCR triggering (0.0-1.0)
|
||
///
|
||
/// Determines when OCR should be triggered based on text block coverage.
|
||
/// OCR is triggered when text blocks cover less than this fraction of the page.
|
||
/// Default: 0.5 (trigger OCR if less than 50% of page has text)
|
||
public let ocrCoverageThreshold: Float?
|
||
public init(enabled: Bool, kClusters: UInt, includeBbox: Bool, ocrCoverageThreshold: Float? = nil) {
|
||
self.enabled = enabled
|
||
self.kClusters = kClusters
|
||
self.includeBbox = includeBbox
|
||
self.ocrCoverageThreshold = ocrCoverageThreshold
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case enabled = "enabled"
|
||
case kClusters = "k_clusters"
|
||
case includeBbox = "include_bbox"
|
||
case ocrCoverageThreshold = "ocr_coverage_threshold"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.enabled = try container.decodeIfPresent(Bool.self, forKey: .enabled) ?? true
|
||
self.kClusters = try container.decodeIfPresent(UInt.self, forKey: .kClusters) ?? 3
|
||
self.includeBbox = try container.decodeIfPresent(Bool.self, forKey: .includeBbox) ?? true
|
||
self.ocrCoverageThreshold = try container.decodeIfPresent(Float.self, forKey: .ocrCoverageThreshold) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for HierarchyConfig
|
||
internal extension HierarchyConfig {
|
||
init(_ rb: RustBridge.HierarchyConfigRef) throws {
|
||
self.enabled = rb.enabled()
|
||
self.kClusters = rb.kClusters()
|
||
self.includeBbox = rb.includeBbox()
|
||
self.ocrCoverageThreshold = rb.ocrCoverageThreshold()
|
||
}
|
||
func intoRust() throws -> RustBridge.HierarchyConfig {
|
||
return RustBridge.HierarchyConfig(self.enabled, self.kClusters, self.includeBbox, self.ocrCoverageThreshold)
|
||
}
|
||
}
|
||
|
||
/// Post-processor configuration.
|
||
public struct PostProcessorConfig: Codable, Sendable, Hashable {
|
||
/// Enable post-processors
|
||
public let enabled: Bool
|
||
/// Whitelist of processor names to run (None = all enabled)
|
||
public let enabledProcessors: [String]?
|
||
/// Blacklist of processor names to skip (None = none disabled)
|
||
public let disabledProcessors: [String]?
|
||
/// Pre-computed AHashSet for O(1) enabled processor lookup
|
||
public let enabledSet: [String]?
|
||
/// Pre-computed AHashSet for O(1) disabled processor lookup
|
||
public let disabledSet: [String]?
|
||
public init(enabled: Bool, enabledProcessors: [String]? = nil, disabledProcessors: [String]? = nil, enabledSet: [String]? = nil, disabledSet: [String]? = nil) {
|
||
self.enabled = enabled
|
||
self.enabledProcessors = enabledProcessors
|
||
self.disabledProcessors = disabledProcessors
|
||
self.enabledSet = enabledSet
|
||
self.disabledSet = disabledSet
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case enabled = "enabled"
|
||
case enabledProcessors = "enabled_processors"
|
||
case disabledProcessors = "disabled_processors"
|
||
case enabledSet = "enabled_set"
|
||
case disabledSet = "disabled_set"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.enabled = try container.decodeIfPresent(Bool.self, forKey: .enabled) ?? true
|
||
self.enabledProcessors = try container.decodeIfPresent([String].self, forKey: .enabledProcessors) ?? nil
|
||
self.disabledProcessors = try container.decodeIfPresent([String].self, forKey: .disabledProcessors) ?? nil
|
||
self.enabledSet = try container.decodeIfPresent([String].self, forKey: .enabledSet) ?? nil
|
||
self.disabledSet = try container.decodeIfPresent([String].self, forKey: .disabledSet) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PostProcessorConfig
|
||
internal extension PostProcessorConfig {
|
||
init(_ rb: RustBridge.PostProcessorConfigRef) throws {
|
||
self.enabled = rb.enabled()
|
||
self.enabledProcessors = rb.enabledProcessors()?.map { $0.as_str().toString() }
|
||
self.disabledProcessors = rb.disabledProcessors()?.map { $0.as_str().toString() }
|
||
self.enabledSet = rb.enabledSet()?.map { $0.as_str().toString() }
|
||
self.disabledSet = rb.disabledSet()?.map { $0.as_str().toString() }
|
||
}
|
||
func intoRust() throws -> RustBridge.PostProcessorConfig {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.postProcessorConfigFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Chunking configuration.
|
||
///
|
||
/// Configures text chunking for document content, including chunk size,
|
||
/// overlap, trimming behavior, and optional embeddings.
|
||
///
|
||
/// Use `..Default::default()` when constructing to allow for future field additions:
|
||
/// ```rust
|
||
/// let config = ChunkingConfig {
|
||
/// max_characters: 500,
|
||
/// ..Default::default()
|
||
/// };
|
||
/// ```
|
||
public typealias ChunkingConfig = RustBridge.ChunkingConfig
|
||
|
||
/// Embedding configuration for text chunks.
|
||
///
|
||
/// Configures embedding generation using ONNX models via the vendored embedding engine.
|
||
/// Requires the `embeddings` feature to be enabled.
|
||
public typealias EmbeddingConfig = RustBridge.EmbeddingConfig
|
||
|
||
/// Configuration for tree-sitter language pack integration.
|
||
///
|
||
/// Controls grammar download behavior and code analysis options.
|
||
///
|
||
/// # Example (TOML)
|
||
///
|
||
/// ```toml
|
||
/// [tree_sitter]
|
||
/// languages = ["python", "rust"]
|
||
/// groups = ["web"]
|
||
///
|
||
/// [tree_sitter.process]
|
||
/// structure = true
|
||
/// comments = true
|
||
/// docstrings = true
|
||
/// ```
|
||
public typealias TreeSitterConfig = RustBridge.TreeSitterConfig
|
||
|
||
/// Processing options for tree-sitter code analysis.
|
||
///
|
||
/// Controls which analysis features are enabled when extracting code files.
|
||
public struct TreeSitterProcessConfig: Codable, Sendable, Hashable {
|
||
/// Extract structural items (functions, classes, structs, etc.). Default: true.
|
||
public let structure: Bool
|
||
/// Extract import statements. Default: true.
|
||
public let imports: Bool
|
||
/// Extract export statements. Default: true.
|
||
public let exports: Bool
|
||
/// Extract comments. Default: false.
|
||
public let comments: Bool
|
||
/// Extract docstrings. Default: false.
|
||
public let docstrings: Bool
|
||
/// Extract symbol definitions. Default: false.
|
||
public let symbols: Bool
|
||
/// Include parse diagnostics. Default: false.
|
||
public let diagnostics: Bool
|
||
/// Maximum chunk size in bytes. `None` disables chunking.
|
||
public let chunkMaxSize: UInt?
|
||
/// Content rendering mode for code extraction.
|
||
public let contentMode: CodeContentMode
|
||
public init(structure: Bool, imports: Bool, exports: Bool, comments: Bool, docstrings: Bool, symbols: Bool, diagnostics: Bool, chunkMaxSize: UInt? = nil, contentMode: CodeContentMode) {
|
||
self.structure = structure
|
||
self.imports = imports
|
||
self.exports = exports
|
||
self.comments = comments
|
||
self.docstrings = docstrings
|
||
self.symbols = symbols
|
||
self.diagnostics = diagnostics
|
||
self.chunkMaxSize = chunkMaxSize
|
||
self.contentMode = contentMode
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case structure = "structure"
|
||
case imports = "imports"
|
||
case exports = "exports"
|
||
case comments = "comments"
|
||
case docstrings = "docstrings"
|
||
case symbols = "symbols"
|
||
case diagnostics = "diagnostics"
|
||
case chunkMaxSize = "chunk_max_size"
|
||
case contentMode = "content_mode"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.structure = try container.decodeIfPresent(Bool.self, forKey: .structure) ?? true
|
||
self.imports = try container.decodeIfPresent(Bool.self, forKey: .imports) ?? true
|
||
self.exports = try container.decodeIfPresent(Bool.self, forKey: .exports) ?? true
|
||
self.comments = try container.decodeIfPresent(Bool.self, forKey: .comments) ?? false
|
||
self.docstrings = try container.decodeIfPresent(Bool.self, forKey: .docstrings) ?? false
|
||
self.symbols = try container.decodeIfPresent(Bool.self, forKey: .symbols) ?? false
|
||
self.diagnostics = try container.decodeIfPresent(Bool.self, forKey: .diagnostics) ?? false
|
||
self.chunkMaxSize = try container.decodeIfPresent(UInt.self, forKey: .chunkMaxSize) ?? nil
|
||
self.contentMode = try container.decode(CodeContentMode.self, forKey: .contentMode)
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for TreeSitterProcessConfig
|
||
internal extension TreeSitterProcessConfig {
|
||
init(_ rb: RustBridge.TreeSitterProcessConfigRef) throws {
|
||
self.structure = rb.structure()
|
||
self.imports = rb.imports()
|
||
self.exports = rb.exports()
|
||
self.comments = rb.comments()
|
||
self.docstrings = rb.docstrings()
|
||
self.symbols = rb.symbols()
|
||
self.diagnostics = rb.diagnostics()
|
||
self.chunkMaxSize = rb.chunkMaxSize()
|
||
self.contentMode = CodeContentMode(rawValue: rb.contentMode().toString()) ?? { fatalError("Unknown CodeContentMode: \(rb.contentMode().toString())") }()
|
||
}
|
||
func intoRust() throws -> RustBridge.TreeSitterProcessConfig {
|
||
return RustBridge.TreeSitterProcessConfig(self.structure, self.imports, self.exports, self.comments, self.docstrings, self.symbols, self.diagnostics, self.chunkMaxSize, try self.contentMode.intoRust())
|
||
}
|
||
}
|
||
|
||
/// A supported document format entry.
|
||
///
|
||
/// Represents a file extension and its corresponding MIME type that Kreuzberg can process.
|
||
public struct SupportedFormat: Codable, Sendable, Hashable {
|
||
/// File extension (without leading dot), e.g., "pdf", "docx"
|
||
public let `extension`: String
|
||
/// MIME type string, e.g., "application/pdf"
|
||
public let mimeType: String
|
||
public init(`extension`: String, mimeType: String) {
|
||
self.`extension` = `extension`
|
||
self.mimeType = mimeType
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case `extension` = "extension"
|
||
case mimeType = "mime_type"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for SupportedFormat
|
||
internal extension SupportedFormat {
|
||
init(_ rb: RustBridge.SupportedFormatRef) throws {
|
||
self.`extension` = rb.extension_().toString()
|
||
self.mimeType = rb.mimeType().toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.SupportedFormat {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.supportedFormatFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// API server configuration.
|
||
///
|
||
/// This struct holds all configuration options for the Kreuzberg API server,
|
||
/// including host/port settings, CORS configuration, and upload limits.
|
||
///
|
||
/// # Defaults
|
||
///
|
||
/// - `host`: "127.0.0.1" (localhost only)
|
||
/// - `port`: 8000
|
||
/// - `cors_origins`: empty vector (allows all origins)
|
||
/// - `max_request_body_bytes`: 104_857_600 (100 MB)
|
||
/// - `max_multipart_field_bytes`: 104_857_600 (100 MB)
|
||
public struct ServerConfig: Codable, Sendable, Hashable {
|
||
/// Server host address (e.g., "127.0.0.1", "0.0.0.0")
|
||
public let host: String
|
||
/// Server port number
|
||
public let port: UInt16
|
||
/// CORS allowed origins. Empty vector means allow all origins.
|
||
///
|
||
/// If this is an empty vector, the server will accept requests from any origin.
|
||
/// If populated with specific origins (e.g., `"https://example.com"`), only
|
||
/// those origins will be allowed.
|
||
public let corsOrigins: [String]
|
||
/// Maximum size of request body in bytes (default: 100 MB)
|
||
public let maxRequestBodyBytes: UInt
|
||
/// Maximum size of multipart fields in bytes (default: 100 MB)
|
||
public let maxMultipartFieldBytes: UInt
|
||
public init(host: String, port: UInt16, corsOrigins: [String], maxRequestBodyBytes: UInt, maxMultipartFieldBytes: UInt) {
|
||
self.host = host
|
||
self.port = port
|
||
self.corsOrigins = corsOrigins
|
||
self.maxRequestBodyBytes = maxRequestBodyBytes
|
||
self.maxMultipartFieldBytes = maxMultipartFieldBytes
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case host = "host"
|
||
case port = "port"
|
||
case corsOrigins = "cors_origins"
|
||
case maxRequestBodyBytes = "max_request_body_bytes"
|
||
case maxMultipartFieldBytes = "max_multipart_field_bytes"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.host = try container.decodeIfPresent(String.self, forKey: .host) ?? ""
|
||
self.port = try container.decodeIfPresent(UInt16.self, forKey: .port) ?? 0
|
||
self.corsOrigins = try container.decodeIfPresent([String].self, forKey: .corsOrigins) ?? []
|
||
self.maxRequestBodyBytes = try container.decodeIfPresent(UInt.self, forKey: .maxRequestBodyBytes) ?? 0
|
||
self.maxMultipartFieldBytes = try container.decodeIfPresent(UInt.self, forKey: .maxMultipartFieldBytes) ?? 0
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ServerConfig
|
||
internal extension ServerConfig {
|
||
init(_ rb: RustBridge.ServerConfigRef) throws {
|
||
self.host = rb.host().toString()
|
||
self.port = rb.port()
|
||
self.corsOrigins = rb.corsOrigins().map { $0.as_str().toString() }
|
||
self.maxRequestBodyBytes = rb.maxRequestBodyBytes()
|
||
self.maxMultipartFieldBytes = rb.maxMultipartFieldBytes()
|
||
}
|
||
func intoRust() throws -> RustBridge.ServerConfig {
|
||
let __corsOrigins = RustVec<RustString>()
|
||
for __elem in self.corsOrigins { __corsOrigins.push(value: RustString(__elem)) }
|
||
return RustBridge.ServerConfig(RustString(self.host), self.port, __corsOrigins, self.maxRequestBodyBytes, self.maxMultipartFieldBytes)
|
||
}
|
||
}
|
||
|
||
public typealias StructuredDataResult = RustBridge.StructuredDataResult
|
||
|
||
/// Application properties from docProps/app.xml for DOCX
|
||
///
|
||
/// Contains Word-specific document statistics and metadata.
|
||
public struct DocxAppProperties: Codable, Sendable, Hashable {
|
||
/// Application name (e.g., "Microsoft Office Word")
|
||
public let application: String?
|
||
/// Application version
|
||
public let appVersion: String?
|
||
/// Template filename
|
||
public let template: String?
|
||
/// Total editing time in minutes
|
||
public let totalTime: Int32?
|
||
/// Number of pages
|
||
public let pages: Int32?
|
||
/// Number of words
|
||
public let words: Int32?
|
||
/// Number of characters (excluding spaces)
|
||
public let characters: Int32?
|
||
/// Number of characters (including spaces)
|
||
public let charactersWithSpaces: Int32?
|
||
/// Number of lines
|
||
public let lines: Int32?
|
||
/// Number of paragraphs
|
||
public let paragraphs: Int32?
|
||
/// Company name
|
||
public let company: String?
|
||
/// Document security level
|
||
public let docSecurity: Int32?
|
||
/// Scale crop flag
|
||
public let scaleCrop: Bool?
|
||
/// Links up to date flag
|
||
public let linksUpToDate: Bool?
|
||
/// Shared document flag
|
||
public let sharedDoc: Bool?
|
||
/// Hyperlinks changed flag
|
||
public let hyperlinksChanged: Bool?
|
||
public init(application: String? = nil, appVersion: String? = nil, template: String? = nil, totalTime: Int32? = nil, pages: Int32? = nil, words: Int32? = nil, characters: Int32? = nil, charactersWithSpaces: Int32? = nil, lines: Int32? = nil, paragraphs: Int32? = nil, company: String? = nil, docSecurity: Int32? = nil, scaleCrop: Bool? = nil, linksUpToDate: Bool? = nil, sharedDoc: Bool? = nil, hyperlinksChanged: Bool? = nil) {
|
||
self.application = application
|
||
self.appVersion = appVersion
|
||
self.template = template
|
||
self.totalTime = totalTime
|
||
self.pages = pages
|
||
self.words = words
|
||
self.characters = characters
|
||
self.charactersWithSpaces = charactersWithSpaces
|
||
self.lines = lines
|
||
self.paragraphs = paragraphs
|
||
self.company = company
|
||
self.docSecurity = docSecurity
|
||
self.scaleCrop = scaleCrop
|
||
self.linksUpToDate = linksUpToDate
|
||
self.sharedDoc = sharedDoc
|
||
self.hyperlinksChanged = hyperlinksChanged
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case application = "application"
|
||
case appVersion = "app_version"
|
||
case template = "template"
|
||
case totalTime = "total_time"
|
||
case pages = "pages"
|
||
case words = "words"
|
||
case characters = "characters"
|
||
case charactersWithSpaces = "characters_with_spaces"
|
||
case lines = "lines"
|
||
case paragraphs = "paragraphs"
|
||
case company = "company"
|
||
case docSecurity = "doc_security"
|
||
case scaleCrop = "scale_crop"
|
||
case linksUpToDate = "links_up_to_date"
|
||
case sharedDoc = "shared_doc"
|
||
case hyperlinksChanged = "hyperlinks_changed"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.application = try container.decodeIfPresent(String.self, forKey: .application) ?? nil
|
||
self.appVersion = try container.decodeIfPresent(String.self, forKey: .appVersion) ?? nil
|
||
self.template = try container.decodeIfPresent(String.self, forKey: .template) ?? nil
|
||
self.totalTime = try container.decodeIfPresent(Int32.self, forKey: .totalTime) ?? nil
|
||
self.pages = try container.decodeIfPresent(Int32.self, forKey: .pages) ?? nil
|
||
self.words = try container.decodeIfPresent(Int32.self, forKey: .words) ?? nil
|
||
self.characters = try container.decodeIfPresent(Int32.self, forKey: .characters) ?? nil
|
||
self.charactersWithSpaces = try container.decodeIfPresent(Int32.self, forKey: .charactersWithSpaces) ?? nil
|
||
self.lines = try container.decodeIfPresent(Int32.self, forKey: .lines) ?? nil
|
||
self.paragraphs = try container.decodeIfPresent(Int32.self, forKey: .paragraphs) ?? nil
|
||
self.company = try container.decodeIfPresent(String.self, forKey: .company) ?? nil
|
||
self.docSecurity = try container.decodeIfPresent(Int32.self, forKey: .docSecurity) ?? nil
|
||
self.scaleCrop = try container.decodeIfPresent(Bool.self, forKey: .scaleCrop) ?? nil
|
||
self.linksUpToDate = try container.decodeIfPresent(Bool.self, forKey: .linksUpToDate) ?? nil
|
||
self.sharedDoc = try container.decodeIfPresent(Bool.self, forKey: .sharedDoc) ?? nil
|
||
self.hyperlinksChanged = try container.decodeIfPresent(Bool.self, forKey: .hyperlinksChanged) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DocxAppProperties
|
||
internal extension DocxAppProperties {
|
||
init(_ rb: RustBridge.DocxAppPropertiesRef) throws {
|
||
self.application = rb.application()?.toString()
|
||
self.appVersion = rb.appVersion()?.toString()
|
||
self.template = rb.template()?.toString()
|
||
self.totalTime = rb.totalTime()
|
||
self.pages = rb.pages()
|
||
self.words = rb.words()
|
||
self.characters = rb.characters()
|
||
self.charactersWithSpaces = rb.charactersWithSpaces()
|
||
self.lines = rb.lines()
|
||
self.paragraphs = rb.paragraphs()
|
||
self.company = rb.company()?.toString()
|
||
self.docSecurity = rb.docSecurity()
|
||
self.scaleCrop = rb.scaleCrop()
|
||
self.linksUpToDate = rb.linksUpToDate()
|
||
self.sharedDoc = rb.sharedDoc()
|
||
self.hyperlinksChanged = rb.hyperlinksChanged()
|
||
}
|
||
func intoRust() throws -> RustBridge.DocxAppProperties {
|
||
return RustBridge.DocxAppProperties(self.application.map(RustString.init), self.appVersion.map(RustString.init), self.template.map(RustString.init), self.totalTime, self.pages, self.words, self.characters, self.charactersWithSpaces, self.lines, self.paragraphs, self.company.map(RustString.init), self.docSecurity, self.scaleCrop, self.linksUpToDate, self.sharedDoc, self.hyperlinksChanged)
|
||
}
|
||
}
|
||
|
||
/// Application properties from docProps/app.xml for XLSX
|
||
///
|
||
/// Contains Excel-specific document metadata.
|
||
public struct XlsxAppProperties: Codable, Sendable, Hashable {
|
||
/// Application name (e.g., "Microsoft Excel")
|
||
public let application: String?
|
||
/// Application version
|
||
public let appVersion: String?
|
||
/// Document security level
|
||
public let docSecurity: Int32?
|
||
/// Scale crop flag
|
||
public let scaleCrop: Bool?
|
||
/// Links up to date flag
|
||
public let linksUpToDate: Bool?
|
||
/// Shared document flag
|
||
public let sharedDoc: Bool?
|
||
/// Hyperlinks changed flag
|
||
public let hyperlinksChanged: Bool?
|
||
/// Company name
|
||
public let company: String?
|
||
/// Worksheet names
|
||
public let worksheetNames: [String]
|
||
public init(application: String? = nil, appVersion: String? = nil, docSecurity: Int32? = nil, scaleCrop: Bool? = nil, linksUpToDate: Bool? = nil, sharedDoc: Bool? = nil, hyperlinksChanged: Bool? = nil, company: String? = nil, worksheetNames: [String]) {
|
||
self.application = application
|
||
self.appVersion = appVersion
|
||
self.docSecurity = docSecurity
|
||
self.scaleCrop = scaleCrop
|
||
self.linksUpToDate = linksUpToDate
|
||
self.sharedDoc = sharedDoc
|
||
self.hyperlinksChanged = hyperlinksChanged
|
||
self.company = company
|
||
self.worksheetNames = worksheetNames
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case application = "application"
|
||
case appVersion = "app_version"
|
||
case docSecurity = "doc_security"
|
||
case scaleCrop = "scale_crop"
|
||
case linksUpToDate = "links_up_to_date"
|
||
case sharedDoc = "shared_doc"
|
||
case hyperlinksChanged = "hyperlinks_changed"
|
||
case company = "company"
|
||
case worksheetNames = "worksheet_names"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.application = try container.decodeIfPresent(String.self, forKey: .application) ?? nil
|
||
self.appVersion = try container.decodeIfPresent(String.self, forKey: .appVersion) ?? nil
|
||
self.docSecurity = try container.decodeIfPresent(Int32.self, forKey: .docSecurity) ?? nil
|
||
self.scaleCrop = try container.decodeIfPresent(Bool.self, forKey: .scaleCrop) ?? nil
|
||
self.linksUpToDate = try container.decodeIfPresent(Bool.self, forKey: .linksUpToDate) ?? nil
|
||
self.sharedDoc = try container.decodeIfPresent(Bool.self, forKey: .sharedDoc) ?? nil
|
||
self.hyperlinksChanged = try container.decodeIfPresent(Bool.self, forKey: .hyperlinksChanged) ?? nil
|
||
self.company = try container.decodeIfPresent(String.self, forKey: .company) ?? nil
|
||
self.worksheetNames = try container.decodeIfPresent([String].self, forKey: .worksheetNames) ?? []
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for XlsxAppProperties
|
||
internal extension XlsxAppProperties {
|
||
init(_ rb: RustBridge.XlsxAppPropertiesRef) throws {
|
||
self.application = rb.application()?.toString()
|
||
self.appVersion = rb.appVersion()?.toString()
|
||
self.docSecurity = rb.docSecurity()
|
||
self.scaleCrop = rb.scaleCrop()
|
||
self.linksUpToDate = rb.linksUpToDate()
|
||
self.sharedDoc = rb.sharedDoc()
|
||
self.hyperlinksChanged = rb.hyperlinksChanged()
|
||
self.company = rb.company()?.toString()
|
||
self.worksheetNames = rb.worksheetNames().map { $0.as_str().toString() }
|
||
}
|
||
func intoRust() throws -> RustBridge.XlsxAppProperties {
|
||
let __worksheetNames = RustVec<RustString>()
|
||
for __elem in self.worksheetNames { __worksheetNames.push(value: RustString(__elem)) }
|
||
return RustBridge.XlsxAppProperties(self.application.map(RustString.init), self.appVersion.map(RustString.init), self.docSecurity, self.scaleCrop, self.linksUpToDate, self.sharedDoc, self.hyperlinksChanged, self.company.map(RustString.init), __worksheetNames)
|
||
}
|
||
}
|
||
|
||
/// Application properties from docProps/app.xml for PPTX
|
||
///
|
||
/// Contains PowerPoint-specific document metadata.
|
||
public struct PptxAppProperties: Codable, Sendable, Hashable {
|
||
/// Application name (e.g., "Microsoft Office PowerPoint")
|
||
public let application: String?
|
||
/// Application version
|
||
public let appVersion: String?
|
||
/// Total editing time in minutes
|
||
public let totalTime: Int32?
|
||
/// Company name
|
||
public let company: String?
|
||
/// Document security level
|
||
public let docSecurity: Int32?
|
||
/// Scale crop flag
|
||
public let scaleCrop: Bool?
|
||
/// Links up to date flag
|
||
public let linksUpToDate: Bool?
|
||
/// Shared document flag
|
||
public let sharedDoc: Bool?
|
||
/// Hyperlinks changed flag
|
||
public let hyperlinksChanged: Bool?
|
||
/// Number of slides
|
||
public let slides: Int32?
|
||
/// Number of notes
|
||
public let notes: Int32?
|
||
/// Number of hidden slides
|
||
public let hiddenSlides: Int32?
|
||
/// Number of multimedia clips
|
||
public let multimediaClips: Int32?
|
||
/// Presentation format (e.g., "Widescreen", "Standard")
|
||
public let presentationFormat: String?
|
||
/// Slide titles
|
||
public let slideTitles: [String]
|
||
public init(application: String? = nil, appVersion: String? = nil, totalTime: Int32? = nil, company: String? = nil, docSecurity: Int32? = nil, scaleCrop: Bool? = nil, linksUpToDate: Bool? = nil, sharedDoc: Bool? = nil, hyperlinksChanged: Bool? = nil, slides: Int32? = nil, notes: Int32? = nil, hiddenSlides: Int32? = nil, multimediaClips: Int32? = nil, presentationFormat: String? = nil, slideTitles: [String]) {
|
||
self.application = application
|
||
self.appVersion = appVersion
|
||
self.totalTime = totalTime
|
||
self.company = company
|
||
self.docSecurity = docSecurity
|
||
self.scaleCrop = scaleCrop
|
||
self.linksUpToDate = linksUpToDate
|
||
self.sharedDoc = sharedDoc
|
||
self.hyperlinksChanged = hyperlinksChanged
|
||
self.slides = slides
|
||
self.notes = notes
|
||
self.hiddenSlides = hiddenSlides
|
||
self.multimediaClips = multimediaClips
|
||
self.presentationFormat = presentationFormat
|
||
self.slideTitles = slideTitles
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case application = "application"
|
||
case appVersion = "app_version"
|
||
case totalTime = "total_time"
|
||
case company = "company"
|
||
case docSecurity = "doc_security"
|
||
case scaleCrop = "scale_crop"
|
||
case linksUpToDate = "links_up_to_date"
|
||
case sharedDoc = "shared_doc"
|
||
case hyperlinksChanged = "hyperlinks_changed"
|
||
case slides = "slides"
|
||
case notes = "notes"
|
||
case hiddenSlides = "hidden_slides"
|
||
case multimediaClips = "multimedia_clips"
|
||
case presentationFormat = "presentation_format"
|
||
case slideTitles = "slide_titles"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.application = try container.decodeIfPresent(String.self, forKey: .application) ?? nil
|
||
self.appVersion = try container.decodeIfPresent(String.self, forKey: .appVersion) ?? nil
|
||
self.totalTime = try container.decodeIfPresent(Int32.self, forKey: .totalTime) ?? nil
|
||
self.company = try container.decodeIfPresent(String.self, forKey: .company) ?? nil
|
||
self.docSecurity = try container.decodeIfPresent(Int32.self, forKey: .docSecurity) ?? nil
|
||
self.scaleCrop = try container.decodeIfPresent(Bool.self, forKey: .scaleCrop) ?? nil
|
||
self.linksUpToDate = try container.decodeIfPresent(Bool.self, forKey: .linksUpToDate) ?? nil
|
||
self.sharedDoc = try container.decodeIfPresent(Bool.self, forKey: .sharedDoc) ?? nil
|
||
self.hyperlinksChanged = try container.decodeIfPresent(Bool.self, forKey: .hyperlinksChanged) ?? nil
|
||
self.slides = try container.decodeIfPresent(Int32.self, forKey: .slides) ?? nil
|
||
self.notes = try container.decodeIfPresent(Int32.self, forKey: .notes) ?? nil
|
||
self.hiddenSlides = try container.decodeIfPresent(Int32.self, forKey: .hiddenSlides) ?? nil
|
||
self.multimediaClips = try container.decodeIfPresent(Int32.self, forKey: .multimediaClips) ?? nil
|
||
self.presentationFormat = try container.decodeIfPresent(String.self, forKey: .presentationFormat) ?? nil
|
||
self.slideTitles = try container.decodeIfPresent([String].self, forKey: .slideTitles) ?? []
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PptxAppProperties
|
||
internal extension PptxAppProperties {
|
||
init(_ rb: RustBridge.PptxAppPropertiesRef) throws {
|
||
self.application = rb.application()?.toString()
|
||
self.appVersion = rb.appVersion()?.toString()
|
||
self.totalTime = rb.totalTime()
|
||
self.company = rb.company()?.toString()
|
||
self.docSecurity = rb.docSecurity()
|
||
self.scaleCrop = rb.scaleCrop()
|
||
self.linksUpToDate = rb.linksUpToDate()
|
||
self.sharedDoc = rb.sharedDoc()
|
||
self.hyperlinksChanged = rb.hyperlinksChanged()
|
||
self.slides = rb.slides()
|
||
self.notes = rb.notes()
|
||
self.hiddenSlides = rb.hiddenSlides()
|
||
self.multimediaClips = rb.multimediaClips()
|
||
self.presentationFormat = rb.presentationFormat()?.toString()
|
||
self.slideTitles = rb.slideTitles().map { $0.as_str().toString() }
|
||
}
|
||
func intoRust() throws -> RustBridge.PptxAppProperties {
|
||
let __slideTitles = RustVec<RustString>()
|
||
for __elem in self.slideTitles { __slideTitles.push(value: RustString(__elem)) }
|
||
return RustBridge.PptxAppProperties(self.application.map(RustString.init), self.appVersion.map(RustString.init), self.totalTime, self.company.map(RustString.init), self.docSecurity, self.scaleCrop, self.linksUpToDate, self.sharedDoc, self.hyperlinksChanged, self.slides, self.notes, self.hiddenSlides, self.multimediaClips, self.presentationFormat.map(RustString.init), __slideTitles)
|
||
}
|
||
}
|
||
|
||
/// Dublin Core metadata from docProps/core.xml
|
||
///
|
||
/// Contains standard metadata fields defined by the Dublin Core standard
|
||
/// and Office-specific extensions.
|
||
public struct CoreProperties: Codable, Sendable, Hashable {
|
||
/// Document title
|
||
public let title: String?
|
||
/// Document subject/topic
|
||
public let subject: String?
|
||
/// Document creator/author
|
||
public let creator: String?
|
||
/// Keywords or tags
|
||
public let keywords: String?
|
||
/// Document description/abstract
|
||
public let description: String?
|
||
/// User who last modified the document
|
||
public let lastModifiedBy: String?
|
||
/// Revision number
|
||
public let revision: String?
|
||
/// Creation timestamp (ISO 8601)
|
||
public let created: String?
|
||
/// Last modification timestamp (ISO 8601)
|
||
public let modified: String?
|
||
/// Document category
|
||
public let category: String?
|
||
/// Content status (Draft, Final, etc.)
|
||
public let contentStatus: String?
|
||
/// Document language
|
||
public let language: String?
|
||
/// Unique identifier
|
||
public let identifier: String?
|
||
/// Document version
|
||
public let version: String?
|
||
/// Last print timestamp (ISO 8601)
|
||
public let lastPrinted: String?
|
||
public init(title: String? = nil, subject: String? = nil, creator: String? = nil, keywords: String? = nil, description: String? = nil, lastModifiedBy: String? = nil, revision: String? = nil, created: String? = nil, modified: String? = nil, category: String? = nil, contentStatus: String? = nil, language: String? = nil, identifier: String? = nil, version: String? = nil, lastPrinted: String? = nil) {
|
||
self.title = title
|
||
self.subject = subject
|
||
self.creator = creator
|
||
self.keywords = keywords
|
||
self.description = description
|
||
self.lastModifiedBy = lastModifiedBy
|
||
self.revision = revision
|
||
self.created = created
|
||
self.modified = modified
|
||
self.category = category
|
||
self.contentStatus = contentStatus
|
||
self.language = language
|
||
self.identifier = identifier
|
||
self.version = version
|
||
self.lastPrinted = lastPrinted
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case title = "title"
|
||
case subject = "subject"
|
||
case creator = "creator"
|
||
case keywords = "keywords"
|
||
case description = "description"
|
||
case lastModifiedBy = "last_modified_by"
|
||
case revision = "revision"
|
||
case created = "created"
|
||
case modified = "modified"
|
||
case category = "category"
|
||
case contentStatus = "content_status"
|
||
case language = "language"
|
||
case identifier = "identifier"
|
||
case version = "version"
|
||
case lastPrinted = "last_printed"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.title = try container.decodeIfPresent(String.self, forKey: .title) ?? nil
|
||
self.subject = try container.decodeIfPresent(String.self, forKey: .subject) ?? nil
|
||
self.creator = try container.decodeIfPresent(String.self, forKey: .creator) ?? nil
|
||
self.keywords = try container.decodeIfPresent(String.self, forKey: .keywords) ?? nil
|
||
self.description = try container.decodeIfPresent(String.self, forKey: .description) ?? nil
|
||
self.lastModifiedBy = try container.decodeIfPresent(String.self, forKey: .lastModifiedBy) ?? nil
|
||
self.revision = try container.decodeIfPresent(String.self, forKey: .revision) ?? nil
|
||
self.created = try container.decodeIfPresent(String.self, forKey: .created) ?? nil
|
||
self.modified = try container.decodeIfPresent(String.self, forKey: .modified) ?? nil
|
||
self.category = try container.decodeIfPresent(String.self, forKey: .category) ?? nil
|
||
self.contentStatus = try container.decodeIfPresent(String.self, forKey: .contentStatus) ?? nil
|
||
self.language = try container.decodeIfPresent(String.self, forKey: .language) ?? nil
|
||
self.identifier = try container.decodeIfPresent(String.self, forKey: .identifier) ?? nil
|
||
self.version = try container.decodeIfPresent(String.self, forKey: .version) ?? nil
|
||
self.lastPrinted = try container.decodeIfPresent(String.self, forKey: .lastPrinted) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for CoreProperties
|
||
internal extension CoreProperties {
|
||
init(_ rb: RustBridge.CorePropertiesRef) throws {
|
||
self.title = rb.title()?.toString()
|
||
self.subject = rb.subject()?.toString()
|
||
self.creator = rb.creator()?.toString()
|
||
self.keywords = rb.keywords()?.toString()
|
||
self.description = rb.description()?.toString()
|
||
self.lastModifiedBy = rb.lastModifiedBy()?.toString()
|
||
self.revision = rb.revision()?.toString()
|
||
self.created = rb.created()?.toString()
|
||
self.modified = rb.modified()?.toString()
|
||
self.category = rb.category()?.toString()
|
||
self.contentStatus = rb.contentStatus()?.toString()
|
||
self.language = rb.language()?.toString()
|
||
self.identifier = rb.identifier()?.toString()
|
||
self.version = rb.version()?.toString()
|
||
self.lastPrinted = rb.lastPrinted()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.CoreProperties {
|
||
return RustBridge.CoreProperties(self.title.map(RustString.init), self.subject.map(RustString.init), self.creator.map(RustString.init), self.keywords.map(RustString.init), self.description.map(RustString.init), self.lastModifiedBy.map(RustString.init), self.revision.map(RustString.init), self.created.map(RustString.init), self.modified.map(RustString.init), self.category.map(RustString.init), self.contentStatus.map(RustString.init), self.language.map(RustString.init), self.identifier.map(RustString.init), self.version.map(RustString.init), self.lastPrinted.map(RustString.init))
|
||
}
|
||
}
|
||
|
||
/// Configuration for security limits across extractors.
|
||
///
|
||
/// All limits are intentionally conservative to prevent DoS attacks
|
||
/// while still supporting legitimate documents.
|
||
public struct SecurityLimits: Codable, Sendable, Hashable {
|
||
/// Maximum uncompressed size for archives (500 MB)
|
||
public let maxArchiveSize: UInt
|
||
/// Maximum compression ratio before flagging as potential bomb (100:1)
|
||
public let maxCompressionRatio: UInt
|
||
/// Maximum number of files in archive (10,000)
|
||
public let maxFilesInArchive: UInt
|
||
/// Maximum nesting depth for structures (100)
|
||
public let maxNestingDepth: UInt
|
||
/// Maximum length of any single XML entity / attribute / token (1 MiB).
|
||
/// This is a per-token cap, NOT a total cap — billion-laughs class
|
||
/// attacks where a single entity expands to hundreds of MB are caught
|
||
/// here, while normal long text content (a paragraph, a CDATA block) is
|
||
/// caught by `max_content_size` instead.
|
||
public let maxEntityLength: UInt
|
||
/// Maximum string growth per document (100 MB)
|
||
public let maxContentSize: UInt
|
||
/// Maximum iterations per operation
|
||
public let maxIterations: UInt
|
||
/// Maximum XML depth (100 levels)
|
||
public let maxXmlDepth: UInt
|
||
/// Maximum cells per table (100,000)
|
||
public let maxTableCells: UInt
|
||
public init(maxArchiveSize: UInt, maxCompressionRatio: UInt, maxFilesInArchive: UInt, maxNestingDepth: UInt, maxEntityLength: UInt, maxContentSize: UInt, maxIterations: UInt, maxXmlDepth: UInt, maxTableCells: UInt) {
|
||
self.maxArchiveSize = maxArchiveSize
|
||
self.maxCompressionRatio = maxCompressionRatio
|
||
self.maxFilesInArchive = maxFilesInArchive
|
||
self.maxNestingDepth = maxNestingDepth
|
||
self.maxEntityLength = maxEntityLength
|
||
self.maxContentSize = maxContentSize
|
||
self.maxIterations = maxIterations
|
||
self.maxXmlDepth = maxXmlDepth
|
||
self.maxTableCells = maxTableCells
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case maxArchiveSize = "max_archive_size"
|
||
case maxCompressionRatio = "max_compression_ratio"
|
||
case maxFilesInArchive = "max_files_in_archive"
|
||
case maxNestingDepth = "max_nesting_depth"
|
||
case maxEntityLength = "max_entity_length"
|
||
case maxContentSize = "max_content_size"
|
||
case maxIterations = "max_iterations"
|
||
case maxXmlDepth = "max_xml_depth"
|
||
case maxTableCells = "max_table_cells"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.maxArchiveSize = try container.decodeIfPresent(UInt.self, forKey: .maxArchiveSize) ?? 524288000
|
||
self.maxCompressionRatio = try container.decodeIfPresent(UInt.self, forKey: .maxCompressionRatio) ?? 100
|
||
self.maxFilesInArchive = try container.decodeIfPresent(UInt.self, forKey: .maxFilesInArchive) ?? 10000
|
||
self.maxNestingDepth = try container.decodeIfPresent(UInt.self, forKey: .maxNestingDepth) ?? 1024
|
||
self.maxEntityLength = try container.decodeIfPresent(UInt.self, forKey: .maxEntityLength) ?? 1048576
|
||
self.maxContentSize = try container.decodeIfPresent(UInt.self, forKey: .maxContentSize) ?? 104857600
|
||
self.maxIterations = try container.decodeIfPresent(UInt.self, forKey: .maxIterations) ?? 10000000
|
||
self.maxXmlDepth = try container.decodeIfPresent(UInt.self, forKey: .maxXmlDepth) ?? 1024
|
||
self.maxTableCells = try container.decodeIfPresent(UInt.self, forKey: .maxTableCells) ?? 100000
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for SecurityLimits
|
||
internal extension SecurityLimits {
|
||
init(_ rb: RustBridge.SecurityLimitsRef) throws {
|
||
self.maxArchiveSize = rb.maxArchiveSize()
|
||
self.maxCompressionRatio = rb.maxCompressionRatio()
|
||
self.maxFilesInArchive = rb.maxFilesInArchive()
|
||
self.maxNestingDepth = rb.maxNestingDepth()
|
||
self.maxEntityLength = rb.maxEntityLength()
|
||
self.maxContentSize = rb.maxContentSize()
|
||
self.maxIterations = rb.maxIterations()
|
||
self.maxXmlDepth = rb.maxXmlDepth()
|
||
self.maxTableCells = rb.maxTableCells()
|
||
}
|
||
func intoRust() throws -> RustBridge.SecurityLimits {
|
||
return RustBridge.SecurityLimits(self.maxArchiveSize, self.maxCompressionRatio, self.maxFilesInArchive, self.maxNestingDepth, self.maxEntityLength, self.maxContentSize, self.maxIterations, self.maxXmlDepth, self.maxTableCells)
|
||
}
|
||
}
|
||
|
||
public typealias TokenReductionConfig = RustBridge.TokenReductionConfig
|
||
|
||
/// A PDF annotation extracted from a document page.
|
||
public struct PdfAnnotation: Codable, Sendable, Hashable {
|
||
/// The type of annotation.
|
||
public let annotationType: PdfAnnotationType
|
||
/// Text content of the annotation (e.g., comment text, link URL).
|
||
public let content: String?
|
||
/// Page number where the annotation appears (1-indexed).
|
||
public let pageNumber: UInt32
|
||
/// Bounding box of the annotation on the page.
|
||
public let boundingBox: BoundingBox?
|
||
public init(annotationType: PdfAnnotationType, content: String? = nil, pageNumber: UInt32, boundingBox: BoundingBox? = nil) {
|
||
self.annotationType = annotationType
|
||
self.content = content
|
||
self.pageNumber = pageNumber
|
||
self.boundingBox = boundingBox
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case annotationType = "annotation_type"
|
||
case content = "content"
|
||
case pageNumber = "page_number"
|
||
case boundingBox = "bounding_box"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PdfAnnotation
|
||
internal extension PdfAnnotation {
|
||
init(_ rb: RustBridge.PdfAnnotationRef) throws {
|
||
self.annotationType = PdfAnnotationType(rawValue: rb.annotationType().toString()) ?? { fatalError("Unknown PdfAnnotationType: \(rb.annotationType().toString())") }()
|
||
self.content = rb.content()?.toString()
|
||
self.pageNumber = rb.pageNumber()
|
||
self.boundingBox = try rb.boundingBox().map { try BoundingBox($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.PdfAnnotation {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.pdfAnnotationFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Comprehensive Djot document structure with semantic preservation.
|
||
///
|
||
/// This type captures the full richness of Djot markup, including:
|
||
/// - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
|
||
/// - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
|
||
/// - Attributes (classes, IDs, key-value pairs)
|
||
/// - Links, images, footnotes
|
||
/// - Math expressions (inline and display)
|
||
/// - Tables with full structure
|
||
///
|
||
/// Available when the `djot` feature is enabled.
|
||
public typealias DjotContent = RustBridge.DjotContent
|
||
|
||
/// Block-level element in a Djot document.
|
||
///
|
||
/// Represents structural elements like headings, paragraphs, lists, code blocks, etc.
|
||
public typealias FormattedBlock = RustBridge.FormattedBlock
|
||
|
||
/// Inline element within a block.
|
||
///
|
||
/// Represents text with formatting, links, images, etc.
|
||
public typealias InlineElement = RustBridge.InlineElement
|
||
|
||
/// Image element in Djot.
|
||
public struct DjotImage: Codable, Sendable, Hashable {
|
||
/// Image source URL or path
|
||
public let src: String
|
||
/// Alternative text
|
||
public let alt: String
|
||
/// Optional title
|
||
public let title: String?
|
||
/// Element attributes
|
||
public let attributes: String?
|
||
public init(src: String, alt: String, title: String? = nil, attributes: String? = nil) {
|
||
self.src = src
|
||
self.alt = alt
|
||
self.title = title
|
||
self.attributes = attributes
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DjotImage
|
||
internal extension DjotImage {
|
||
init(_ rb: RustBridge.DjotImageRef) throws {
|
||
self.src = rb.src().toString()
|
||
self.alt = rb.alt().toString()
|
||
self.title = rb.title()?.toString()
|
||
self.attributes = rb.attributes()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.DjotImage {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.djotImageFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Link element in Djot.
|
||
public struct DjotLink: Codable, Sendable, Hashable {
|
||
/// Link URL
|
||
public let url: String
|
||
/// Link text content
|
||
public let text: String
|
||
/// Optional title
|
||
public let title: String?
|
||
/// Element attributes
|
||
public let attributes: String?
|
||
public init(url: String, text: String, title: String? = nil, attributes: String? = nil) {
|
||
self.url = url
|
||
self.text = text
|
||
self.title = title
|
||
self.attributes = attributes
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DjotLink
|
||
internal extension DjotLink {
|
||
init(_ rb: RustBridge.DjotLinkRef) throws {
|
||
self.url = rb.url().toString()
|
||
self.text = rb.text().toString()
|
||
self.title = rb.title()?.toString()
|
||
self.attributes = rb.attributes()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.DjotLink {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.djotLinkFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Footnote in Djot.
|
||
public typealias Footnote = RustBridge.Footnote
|
||
|
||
/// Top-level structured document representation.
|
||
///
|
||
/// A flat array of nodes with index-based parent/child references forming a tree.
|
||
/// Root-level nodes have `parent: None`. Use `body_roots()` and `furniture_roots()`
|
||
/// to iterate over top-level content by layer.
|
||
///
|
||
/// # Validation
|
||
///
|
||
/// Call `validate()` after construction to verify all node indices are in bounds
|
||
/// and parent-child relationships are bidirectionally consistent.
|
||
public typealias DocumentStructure = RustBridge.DocumentStructure
|
||
|
||
/// A resolved relationship between two nodes in the document tree.
|
||
public struct DocumentRelationship: Codable, Sendable, Hashable {
|
||
/// Source node index (the referencing node).
|
||
public let source: UInt32
|
||
/// Target node index (the referenced node).
|
||
public let target: UInt32
|
||
/// Semantic kind of the relationship.
|
||
public let kind: RelationshipKind
|
||
public init(source: UInt32, target: UInt32, kind: RelationshipKind) {
|
||
self.source = source
|
||
self.target = target
|
||
self.kind = kind
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DocumentRelationship
|
||
internal extension DocumentRelationship {
|
||
init(_ rb: RustBridge.DocumentRelationshipRef) throws {
|
||
self.source = rb.source()
|
||
self.target = rb.target()
|
||
self.kind = RelationshipKind(rawValue: rb.kind().toString()) ?? { fatalError("Unknown RelationshipKind: \(rb.kind().toString())") }()
|
||
}
|
||
func intoRust() throws -> RustBridge.DocumentRelationship {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.documentRelationshipFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// A single node in the document tree.
|
||
///
|
||
/// Each node has deterministic `id`, typed `content`, optional `parent`/`children`
|
||
/// for tree structure, and metadata like page number, bounding box, and content layer.
|
||
public typealias DocumentNode = RustBridge.DocumentNode
|
||
|
||
/// Structured table grid with cell-level metadata.
|
||
///
|
||
/// Stores row/column dimensions and a flat list of cells with position info.
|
||
public struct TableGrid: Codable, Sendable, Hashable {
|
||
/// Number of rows in the table.
|
||
public let rows: UInt32
|
||
/// Number of columns in the table.
|
||
public let cols: UInt32
|
||
/// All cells in row-major order.
|
||
public let cells: [GridCell]
|
||
public init(rows: UInt32, cols: UInt32, cells: [GridCell]) {
|
||
self.rows = rows
|
||
self.cols = cols
|
||
self.cells = cells
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case rows = "rows"
|
||
case cols = "cols"
|
||
case cells = "cells"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.rows = try container.decodeIfPresent(UInt32.self, forKey: .rows) ?? 0
|
||
self.cols = try container.decodeIfPresent(UInt32.self, forKey: .cols) ?? 0
|
||
self.cells = try container.decodeIfPresent([GridCell].self, forKey: .cells) ?? []
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for TableGrid
|
||
internal extension TableGrid {
|
||
init(_ rb: RustBridge.TableGridRef) throws {
|
||
self.rows = rb.rows()
|
||
self.cols = rb.cols()
|
||
self.cells = try rb.cells().map { try GridCell($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.TableGrid {
|
||
let __cells = RustVec<RustBridge.GridCell>()
|
||
for __elem in self.cells { __cells.push(value: try __elem.intoRust()) }
|
||
return RustBridge.TableGrid(self.rows, self.cols, __cells)
|
||
}
|
||
}
|
||
|
||
/// Individual grid cell with position and span metadata.
|
||
public struct GridCell: Codable, Sendable, Hashable {
|
||
/// Cell text content.
|
||
public let content: String
|
||
/// Zero-indexed row position.
|
||
public let row: UInt32
|
||
/// Zero-indexed column position.
|
||
public let col: UInt32
|
||
/// Number of rows this cell spans.
|
||
public let rowSpan: UInt32
|
||
/// Number of columns this cell spans.
|
||
public let colSpan: UInt32
|
||
/// Whether this is a header cell.
|
||
public let isHeader: Bool
|
||
/// Bounding box for this cell (if available).
|
||
public let bbox: BoundingBox?
|
||
public init(content: String, row: UInt32, col: UInt32, rowSpan: UInt32, colSpan: UInt32, isHeader: Bool, bbox: BoundingBox? = nil) {
|
||
self.content = content
|
||
self.row = row
|
||
self.col = col
|
||
self.rowSpan = rowSpan
|
||
self.colSpan = colSpan
|
||
self.isHeader = isHeader
|
||
self.bbox = bbox
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case content = "content"
|
||
case row = "row"
|
||
case col = "col"
|
||
case rowSpan = "row_span"
|
||
case colSpan = "col_span"
|
||
case isHeader = "is_header"
|
||
case bbox = "bbox"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for GridCell
|
||
internal extension GridCell {
|
||
init(_ rb: RustBridge.GridCellRef) throws {
|
||
self.content = rb.content().toString()
|
||
self.row = rb.row()
|
||
self.col = rb.col()
|
||
self.rowSpan = rb.rowSpan()
|
||
self.colSpan = rb.colSpan()
|
||
self.isHeader = rb.isHeader()
|
||
self.bbox = try rb.bbox().map { try BoundingBox($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.GridCell {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.gridCellFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Inline text annotation — byte-range based formatting and links.
|
||
///
|
||
/// Annotations reference byte offsets into the node's text content,
|
||
/// enabling precise identification of formatted regions.
|
||
public struct TextAnnotation: Codable, Sendable, Hashable {
|
||
/// Start byte offset in the node's text content (inclusive).
|
||
public let start: UInt32
|
||
/// End byte offset in the node's text content (exclusive).
|
||
public let end: UInt32
|
||
/// Annotation type.
|
||
public let kind: AnnotationKind
|
||
public init(start: UInt32, end: UInt32, kind: AnnotationKind) {
|
||
self.start = start
|
||
self.end = end
|
||
self.kind = kind
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for TextAnnotation
|
||
internal extension TextAnnotation {
|
||
init(_ rb: RustBridge.TextAnnotationRef) throws {
|
||
self.start = rb.start()
|
||
self.end = rb.end()
|
||
self.kind = try JSONDecoder().decode(AnnotationKind.self, from: ((rb.kind().toString()).data(using: .utf8) ?? Data("null".utf8)))
|
||
}
|
||
func intoRust() throws -> RustBridge.TextAnnotation {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.textAnnotationFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// General extraction result used by the core extraction API.
|
||
///
|
||
/// This is the main result type returned by all extraction functions.
|
||
public typealias ExtractionResult = RustBridge.ExtractionResult
|
||
|
||
/// A single file extracted from an archive.
|
||
///
|
||
/// When archives (ZIP, TAR, 7Z, GZIP) are extracted with recursive extraction
|
||
/// enabled, each processable file produces its own full `ExtractionResult`.
|
||
public typealias ArchiveEntry = RustBridge.ArchiveEntry
|
||
|
||
/// A non-fatal warning from a processing pipeline stage.
|
||
///
|
||
/// Captures errors from optional features that don't prevent extraction
|
||
/// but may indicate degraded results.
|
||
public struct ProcessingWarning: Codable, Sendable, Hashable {
|
||
/// The pipeline stage or feature that produced this warning
|
||
/// (e.g., "embedding", "chunking", "language_detection", "output_format").
|
||
public let source: String
|
||
/// Human-readable description of what went wrong.
|
||
public let message: String
|
||
public init(source: String, message: String) {
|
||
self.source = source
|
||
self.message = message
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ProcessingWarning
|
||
internal extension ProcessingWarning {
|
||
init(_ rb: RustBridge.ProcessingWarningRef) throws {
|
||
self.source = rb.source().toString()
|
||
self.message = rb.message().toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.ProcessingWarning {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.processingWarningFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Token usage and cost data for a single LLM call made during extraction.
|
||
///
|
||
/// Populated when VLM OCR, structured extraction, or LLM-based embeddings
|
||
/// are used. Multiple entries may be present when multiple LLM calls occur
|
||
/// within one extraction (e.g. VLM OCR + structured extraction).
|
||
public struct LlmUsage: Codable, Sendable, Hashable {
|
||
/// The LLM model identifier (e.g. "openai/gpt-4o", "anthropic/claude-sonnet-4-20250514").
|
||
public let model: String
|
||
/// The pipeline stage that triggered this LLM call
|
||
/// (e.g. "vlm_ocr", "structured_extraction", "embeddings").
|
||
public let source: String
|
||
/// Number of input/prompt tokens consumed.
|
||
public let inputTokens: UInt64?
|
||
/// Number of output/completion tokens generated.
|
||
public let outputTokens: UInt64?
|
||
/// Total tokens (input + output).
|
||
public let totalTokens: UInt64?
|
||
/// Estimated cost in USD based on the provider's published pricing.
|
||
public let estimatedCost: Double?
|
||
/// Why the model stopped generating (e.g. "stop", "length", "content_filter").
|
||
public let finishReason: String?
|
||
public init(model: String, source: String, inputTokens: UInt64? = nil, outputTokens: UInt64? = nil, totalTokens: UInt64? = nil, estimatedCost: Double? = nil, finishReason: String? = nil) {
|
||
self.model = model
|
||
self.source = source
|
||
self.inputTokens = inputTokens
|
||
self.outputTokens = outputTokens
|
||
self.totalTokens = totalTokens
|
||
self.estimatedCost = estimatedCost
|
||
self.finishReason = finishReason
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case model = "model"
|
||
case source = "source"
|
||
case inputTokens = "input_tokens"
|
||
case outputTokens = "output_tokens"
|
||
case totalTokens = "total_tokens"
|
||
case estimatedCost = "estimated_cost"
|
||
case finishReason = "finish_reason"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.model = try container.decodeIfPresent(String.self, forKey: .model) ?? ""
|
||
self.source = try container.decodeIfPresent(String.self, forKey: .source) ?? ""
|
||
self.inputTokens = try container.decodeIfPresent(UInt64.self, forKey: .inputTokens) ?? nil
|
||
self.outputTokens = try container.decodeIfPresent(UInt64.self, forKey: .outputTokens) ?? nil
|
||
self.totalTokens = try container.decodeIfPresent(UInt64.self, forKey: .totalTokens) ?? nil
|
||
self.estimatedCost = try container.decodeIfPresent(Double.self, forKey: .estimatedCost) ?? nil
|
||
self.finishReason = try container.decodeIfPresent(String.self, forKey: .finishReason) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for LlmUsage
|
||
internal extension LlmUsage {
|
||
init(_ rb: RustBridge.LlmUsageRef) throws {
|
||
self.model = rb.model().toString()
|
||
self.source = rb.source().toString()
|
||
self.inputTokens = rb.inputTokens()
|
||
self.outputTokens = rb.outputTokens()
|
||
self.totalTokens = rb.totalTokens()
|
||
self.estimatedCost = rb.estimatedCost()
|
||
self.finishReason = rb.finishReason()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.LlmUsage {
|
||
return RustBridge.LlmUsage(RustString(self.model), RustString(self.source), self.inputTokens, self.outputTokens, self.totalTokens, self.estimatedCost, self.finishReason.map(RustString.init))
|
||
}
|
||
}
|
||
|
||
/// A text chunk with optional embedding and metadata.
|
||
///
|
||
/// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
|
||
/// contains the text content, optional embedding vector (if embedding generation
|
||
/// is configured), and metadata about its position in the document.
|
||
public struct Chunk: Codable, Sendable, Hashable {
|
||
/// The text content of this chunk.
|
||
public let content: String
|
||
/// Semantic structural classification of this chunk.
|
||
///
|
||
/// Assigned by the heuristic classifier based on content patterns and
|
||
/// heading context. Defaults to `ChunkType::Unknown` when no rule matches.
|
||
public let chunkType: ChunkType
|
||
/// Optional embedding vector for this chunk.
|
||
///
|
||
/// Only populated when `EmbeddingConfig` is provided in chunking configuration.
|
||
/// The dimensionality depends on the chosen embedding model.
|
||
public let embedding: [Float]?
|
||
/// Metadata about this chunk's position and properties.
|
||
public let metadata: ChunkMetadata
|
||
public init(content: String, chunkType: ChunkType, embedding: [Float]? = nil, metadata: ChunkMetadata) {
|
||
self.content = content
|
||
self.chunkType = chunkType
|
||
self.embedding = embedding
|
||
self.metadata = metadata
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case content = "content"
|
||
case chunkType = "chunk_type"
|
||
case embedding = "embedding"
|
||
case metadata = "metadata"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for Chunk
|
||
internal extension Chunk {
|
||
init(_ rb: RustBridge.ChunkRef) throws {
|
||
self.content = rb.content().toString()
|
||
self.chunkType = ChunkType(rawValue: rb.chunkType().toString()) ?? { fatalError("Unknown ChunkType: \(rb.chunkType().toString())") }()
|
||
self.embedding = rb.embedding().map { Array($0) }
|
||
self.metadata = try ChunkMetadata(rb.metadata())
|
||
}
|
||
func intoRust() throws -> RustBridge.Chunk {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.chunkFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Heading context for a chunk within a Markdown document.
|
||
///
|
||
/// Contains the heading hierarchy from document root to this chunk's section.
|
||
public struct HeadingContext: Codable, Sendable, Hashable {
|
||
/// The heading hierarchy from document root to this chunk's section.
|
||
/// Index 0 is the outermost (h1), last element is the most specific.
|
||
public let headings: [HeadingLevel]
|
||
public init(headings: [HeadingLevel]) {
|
||
self.headings = headings
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for HeadingContext
|
||
internal extension HeadingContext {
|
||
init(_ rb: RustBridge.HeadingContextRef) throws {
|
||
self.headings = try rb.headings().map { try HeadingLevel($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.HeadingContext {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.headingContextFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// A single heading in the hierarchy.
|
||
public struct HeadingLevel: Codable, Sendable, Hashable {
|
||
/// Heading depth (1 = h1, 2 = h2, etc.)
|
||
public let level: UInt8
|
||
/// The text content of the heading.
|
||
public let text: String
|
||
public init(level: UInt8, text: String) {
|
||
self.level = level
|
||
self.text = text
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for HeadingLevel
|
||
internal extension HeadingLevel {
|
||
init(_ rb: RustBridge.HeadingLevelRef) throws {
|
||
self.level = rb.level()
|
||
self.text = rb.text().toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.HeadingLevel {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.headingLevelFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Metadata about a chunk's position in the original document.
|
||
public struct ChunkMetadata: Codable, Sendable, Hashable {
|
||
/// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
|
||
public let byteStart: UInt
|
||
/// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
|
||
public let byteEnd: UInt
|
||
/// Number of tokens in this chunk (if available).
|
||
///
|
||
/// This is calculated by the embedding model's tokenizer if embeddings are enabled.
|
||
public let tokenCount: UInt?
|
||
/// Zero-based index of this chunk in the document.
|
||
public let chunkIndex: UInt
|
||
/// Total number of chunks in the document.
|
||
public let totalChunks: UInt
|
||
/// First page number this chunk spans (1-indexed).
|
||
///
|
||
/// Only populated when page tracking is enabled in extraction configuration.
|
||
public let firstPage: UInt32?
|
||
/// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
|
||
///
|
||
/// Only populated when page tracking is enabled in extraction configuration.
|
||
public let lastPage: UInt32?
|
||
/// Heading context when using Markdown chunker.
|
||
///
|
||
/// Contains the heading hierarchy this chunk falls under.
|
||
/// Only populated when `ChunkerType::Markdown` is used.
|
||
public let headingContext: HeadingContext?
|
||
/// Indices into `ExtractionResult.images` for images on pages covered by this chunk.
|
||
///
|
||
/// Contains zero-based indices into the top-level `images` collection for every
|
||
/// image whose `page_number` falls within `[first_page, last_page]`.
|
||
/// Empty when image extraction is disabled or the chunk spans no pages with images.
|
||
public let imageIndices: [UInt32]
|
||
public init(byteStart: UInt, byteEnd: UInt, tokenCount: UInt? = nil, chunkIndex: UInt, totalChunks: UInt, firstPage: UInt32? = nil, lastPage: UInt32? = nil, headingContext: HeadingContext? = nil, imageIndices: [UInt32]) {
|
||
self.byteStart = byteStart
|
||
self.byteEnd = byteEnd
|
||
self.tokenCount = tokenCount
|
||
self.chunkIndex = chunkIndex
|
||
self.totalChunks = totalChunks
|
||
self.firstPage = firstPage
|
||
self.lastPage = lastPage
|
||
self.headingContext = headingContext
|
||
self.imageIndices = imageIndices
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case byteStart = "byte_start"
|
||
case byteEnd = "byte_end"
|
||
case tokenCount = "token_count"
|
||
case chunkIndex = "chunk_index"
|
||
case totalChunks = "total_chunks"
|
||
case firstPage = "first_page"
|
||
case lastPage = "last_page"
|
||
case headingContext = "heading_context"
|
||
case imageIndices = "image_indices"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ChunkMetadata
|
||
internal extension ChunkMetadata {
|
||
init(_ rb: RustBridge.ChunkMetadataRef) throws {
|
||
self.byteStart = rb.byteStart()
|
||
self.byteEnd = rb.byteEnd()
|
||
self.tokenCount = rb.tokenCount()
|
||
self.chunkIndex = rb.chunkIndex()
|
||
self.totalChunks = rb.totalChunks()
|
||
self.firstPage = rb.firstPage()
|
||
self.lastPage = rb.lastPage()
|
||
self.headingContext = try rb.headingContext().map { try HeadingContext($0) }
|
||
self.imageIndices = Array(rb.imageIndices())
|
||
}
|
||
func intoRust() throws -> RustBridge.ChunkMetadata {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.chunkMetadataFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Extracted image from a document.
|
||
///
|
||
/// Contains raw image data, metadata, and optional nested OCR results.
|
||
/// Raw bytes allow cross-language compatibility - users can convert to
|
||
/// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
|
||
public typealias ExtractedImage = RustBridge.ExtractedImage
|
||
|
||
/// Bounding box coordinates for element positioning.
|
||
public struct BoundingBox: Codable, Sendable, Hashable {
|
||
/// Left x-coordinate
|
||
public let x0: Double
|
||
/// Bottom y-coordinate
|
||
public let y0: Double
|
||
/// Right x-coordinate
|
||
public let x1: Double
|
||
/// Top y-coordinate
|
||
public let y1: Double
|
||
public init(x0: Double, y0: Double, x1: Double, y1: Double) {
|
||
self.x0 = x0
|
||
self.y0 = y0
|
||
self.x1 = x1
|
||
self.y1 = y1
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case x0 = "x0"
|
||
case y0 = "y0"
|
||
case x1 = "x1"
|
||
case y1 = "y1"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.x0 = try container.decodeIfPresent(Double.self, forKey: .x0) ?? 0
|
||
self.y0 = try container.decodeIfPresent(Double.self, forKey: .y0) ?? 0
|
||
self.x1 = try container.decodeIfPresent(Double.self, forKey: .x1) ?? 0
|
||
self.y1 = try container.decodeIfPresent(Double.self, forKey: .y1) ?? 0
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for BoundingBox
|
||
internal extension BoundingBox {
|
||
init(_ rb: RustBridge.BoundingBoxRef) throws {
|
||
self.x0 = rb.x0()
|
||
self.y0 = rb.y0()
|
||
self.x1 = rb.x1()
|
||
self.y1 = rb.y1()
|
||
}
|
||
func intoRust() throws -> RustBridge.BoundingBox {
|
||
return RustBridge.BoundingBox(self.x0, self.y0, self.x1, self.y1)
|
||
}
|
||
}
|
||
|
||
/// Metadata for a semantic element.
|
||
public typealias ElementMetadata = RustBridge.ElementMetadata
|
||
|
||
/// Semantic element extracted from document.
|
||
///
|
||
/// Represents a logical unit of content with semantic classification,
|
||
/// unique identifier, and metadata for tracking origin and position.
|
||
public typealias Element = RustBridge.Element
|
||
|
||
/// Excel workbook representation.
|
||
///
|
||
/// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
|
||
/// extracted content and metadata.
|
||
public typealias ExcelWorkbook = RustBridge.ExcelWorkbook
|
||
|
||
/// Single Excel worksheet.
|
||
///
|
||
/// Represents one sheet from an Excel workbook with its content
|
||
/// converted to Markdown format and dimensional statistics.
|
||
public struct ExcelSheet: Codable, Sendable, Hashable {
|
||
/// Sheet name as it appears in Excel
|
||
public let name: String
|
||
/// Sheet content converted to Markdown tables
|
||
public let markdown: String
|
||
/// Number of rows
|
||
public let rowCount: UInt
|
||
/// Number of columns
|
||
public let colCount: UInt
|
||
/// Total number of non-empty cells
|
||
public let cellCount: UInt
|
||
/// Pre-extracted table cells (2D vector of cell values)
|
||
/// Populated during markdown generation to avoid re-parsing markdown.
|
||
/// None for empty sheets.
|
||
public let tableCells: [[String]]?
|
||
public init(name: String, markdown: String, rowCount: UInt, colCount: UInt, cellCount: UInt, tableCells: [[String]]? = nil) {
|
||
self.name = name
|
||
self.markdown = markdown
|
||
self.rowCount = rowCount
|
||
self.colCount = colCount
|
||
self.cellCount = cellCount
|
||
self.tableCells = tableCells
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case name = "name"
|
||
case markdown = "markdown"
|
||
case rowCount = "row_count"
|
||
case colCount = "col_count"
|
||
case cellCount = "cell_count"
|
||
case tableCells = "table_cells"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ExcelSheet
|
||
internal extension ExcelSheet {
|
||
init(_ rb: RustBridge.ExcelSheetRef) throws {
|
||
self.name = rb.name().toString()
|
||
self.markdown = rb.markdown().toString()
|
||
self.rowCount = rb.rowCount()
|
||
self.colCount = rb.colCount()
|
||
self.cellCount = rb.cellCount()
|
||
self.tableCells = try JSONDecoder().decode([[String]]?.self, from: ((rb.tableCells()?.toString() ?? "null").data(using: .utf8) ?? Data("null".utf8)))
|
||
}
|
||
func intoRust() throws -> RustBridge.ExcelSheet {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.excelSheetFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// XML extraction result.
|
||
///
|
||
/// Contains extracted text content from XML files along with
|
||
/// structural statistics about the XML document.
|
||
public struct XmlExtractionResult: Codable, Sendable, Hashable {
|
||
/// Extracted text content (XML structure filtered out)
|
||
public let content: String
|
||
/// Total number of XML elements processed
|
||
public let elementCount: UInt
|
||
/// List of unique element names found (sorted)
|
||
public let uniqueElements: [String]
|
||
public init(content: String, elementCount: UInt, uniqueElements: [String]) {
|
||
self.content = content
|
||
self.elementCount = elementCount
|
||
self.uniqueElements = uniqueElements
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case content = "content"
|
||
case elementCount = "element_count"
|
||
case uniqueElements = "unique_elements"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for XmlExtractionResult
|
||
internal extension XmlExtractionResult {
|
||
init(_ rb: RustBridge.XmlExtractionResultRef) throws {
|
||
self.content = rb.content().toString()
|
||
self.elementCount = rb.elementCount()
|
||
self.uniqueElements = rb.uniqueElements().map { $0.as_str().toString() }
|
||
}
|
||
func intoRust() throws -> RustBridge.XmlExtractionResult {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.xmlExtractionResultFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Plain text and Markdown extraction result.
|
||
///
|
||
/// Contains the extracted text along with statistics and,
|
||
/// for Markdown files, structural elements like headers and links.
|
||
public struct TextExtractionResult: Codable, Sendable, Hashable {
|
||
/// Extracted text content
|
||
public let content: String
|
||
/// Number of lines
|
||
public let lineCount: UInt
|
||
/// Number of words
|
||
public let wordCount: UInt
|
||
/// Number of characters
|
||
public let characterCount: UInt
|
||
/// Markdown headers (text only, Markdown files only)
|
||
public let headers: [String]?
|
||
/// Markdown links as (text, URL) tuples (Markdown files only)
|
||
public let links: [[String]]?
|
||
/// Code blocks as (language, code) tuples (Markdown files only)
|
||
public let codeBlocks: [[String]]?
|
||
public init(content: String, lineCount: UInt, wordCount: UInt, characterCount: UInt, headers: [String]? = nil, links: [[String]]? = nil, codeBlocks: [[String]]? = nil) {
|
||
self.content = content
|
||
self.lineCount = lineCount
|
||
self.wordCount = wordCount
|
||
self.characterCount = characterCount
|
||
self.headers = headers
|
||
self.links = links
|
||
self.codeBlocks = codeBlocks
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case content = "content"
|
||
case lineCount = "line_count"
|
||
case wordCount = "word_count"
|
||
case characterCount = "character_count"
|
||
case headers = "headers"
|
||
case links = "links"
|
||
case codeBlocks = "code_blocks"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for TextExtractionResult
|
||
internal extension TextExtractionResult {
|
||
init(_ rb: RustBridge.TextExtractionResultRef) throws {
|
||
self.content = rb.content().toString()
|
||
self.lineCount = rb.lineCount()
|
||
self.wordCount = rb.wordCount()
|
||
self.characterCount = rb.characterCount()
|
||
self.headers = rb.headers()?.map { $0.as_str().toString() }
|
||
self.links = nil
|
||
self.codeBlocks = nil
|
||
}
|
||
func intoRust() throws -> RustBridge.TextExtractionResult {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.textExtractionResultFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// PowerPoint (PPTX) extraction result.
|
||
///
|
||
/// Contains extracted slide content, metadata, and embedded images/tables.
|
||
public typealias PptxExtractionResult = RustBridge.PptxExtractionResult
|
||
|
||
/// Email extraction result.
|
||
///
|
||
/// Complete representation of an extracted email message (.eml or .msg)
|
||
/// including headers, body content, and attachments.
|
||
public typealias EmailExtractionResult = RustBridge.EmailExtractionResult
|
||
|
||
/// Email attachment representation.
|
||
///
|
||
/// Contains metadata and optionally the content of an email attachment.
|
||
public typealias EmailAttachment = RustBridge.EmailAttachment
|
||
|
||
/// OCR extraction result.
|
||
///
|
||
/// Result of performing OCR on an image or scanned document,
|
||
/// including recognized text and detected tables.
|
||
public typealias OcrExtractionResult = RustBridge.OcrExtractionResult
|
||
|
||
/// Table detected via OCR.
|
||
///
|
||
/// Represents a table structure recognized during OCR processing.
|
||
public struct OcrTable: Codable, Sendable, Hashable {
|
||
/// Table cells as a 2D vector (rows × columns)
|
||
public let cells: [[String]]
|
||
/// Markdown representation of the table
|
||
public let markdown: String
|
||
/// Page number where the table was found (1-indexed)
|
||
public let pageNumber: UInt32
|
||
/// Bounding box of the table in pixel coordinates (from OCR word positions).
|
||
public let boundingBox: OcrTableBoundingBox?
|
||
public init(cells: [[String]], markdown: String, pageNumber: UInt32, boundingBox: OcrTableBoundingBox? = nil) {
|
||
self.cells = cells
|
||
self.markdown = markdown
|
||
self.pageNumber = pageNumber
|
||
self.boundingBox = boundingBox
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case cells = "cells"
|
||
case markdown = "markdown"
|
||
case pageNumber = "page_number"
|
||
case boundingBox = "bounding_box"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for OcrTable
|
||
internal extension OcrTable {
|
||
init(_ rb: RustBridge.OcrTableRef) throws {
|
||
self.cells = try JSONDecoder().decode([[String]].self, from: ((rb.cells().toString()).data(using: .utf8) ?? Data("null".utf8)))
|
||
self.markdown = rb.markdown().toString()
|
||
self.pageNumber = rb.pageNumber()
|
||
self.boundingBox = try rb.boundingBox().map { try OcrTableBoundingBox($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.OcrTable {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.ocrTableFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Bounding box for an OCR-detected table in pixel coordinates.
|
||
public struct OcrTableBoundingBox: Codable, Sendable, Hashable {
|
||
/// Left x-coordinate (pixels)
|
||
public let left: UInt32
|
||
/// Top y-coordinate (pixels)
|
||
public let top: UInt32
|
||
/// Right x-coordinate (pixels)
|
||
public let right: UInt32
|
||
/// Bottom y-coordinate (pixels)
|
||
public let bottom: UInt32
|
||
public init(left: UInt32, top: UInt32, right: UInt32, bottom: UInt32) {
|
||
self.left = left
|
||
self.top = top
|
||
self.right = right
|
||
self.bottom = bottom
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for OcrTableBoundingBox
|
||
internal extension OcrTableBoundingBox {
|
||
init(_ rb: RustBridge.OcrTableBoundingBoxRef) throws {
|
||
self.left = rb.left()
|
||
self.top = rb.top()
|
||
self.right = rb.right()
|
||
self.bottom = rb.bottom()
|
||
}
|
||
func intoRust() throws -> RustBridge.OcrTableBoundingBox {
|
||
return RustBridge.OcrTableBoundingBox(self.left, self.top, self.right, self.bottom)
|
||
}
|
||
}
|
||
|
||
/// Image preprocessing configuration for OCR.
|
||
///
|
||
/// These settings control how images are preprocessed before OCR to improve
|
||
/// text recognition quality. Different preprocessing strategies work better
|
||
/// for different document types.
|
||
public struct ImagePreprocessingConfig: Codable, Sendable, Hashable {
|
||
/// Target DPI for the image (300 is standard, 600 for small text).
|
||
public let targetDpi: Int32
|
||
/// Auto-detect and correct image rotation.
|
||
public let autoRotate: Bool
|
||
/// Correct skew (tilted images).
|
||
public let deskew: Bool
|
||
/// Remove noise from the image.
|
||
public let denoise: Bool
|
||
/// Enhance contrast for better text visibility.
|
||
public let contrastEnhance: Bool
|
||
/// Binarization method: "otsu", "sauvola", "adaptive".
|
||
public let binarizationMethod: String
|
||
/// Invert colors (white text on black → black on white).
|
||
public let invertColors: Bool
|
||
public init(targetDpi: Int32, autoRotate: Bool, deskew: Bool, denoise: Bool, contrastEnhance: Bool, binarizationMethod: String, invertColors: Bool) {
|
||
self.targetDpi = targetDpi
|
||
self.autoRotate = autoRotate
|
||
self.deskew = deskew
|
||
self.denoise = denoise
|
||
self.contrastEnhance = contrastEnhance
|
||
self.binarizationMethod = binarizationMethod
|
||
self.invertColors = invertColors
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case targetDpi = "target_dpi"
|
||
case autoRotate = "auto_rotate"
|
||
case deskew = "deskew"
|
||
case denoise = "denoise"
|
||
case contrastEnhance = "contrast_enhance"
|
||
case binarizationMethod = "binarization_method"
|
||
case invertColors = "invert_colors"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.targetDpi = try container.decodeIfPresent(Int32.self, forKey: .targetDpi) ?? 300
|
||
self.autoRotate = try container.decodeIfPresent(Bool.self, forKey: .autoRotate) ?? true
|
||
self.deskew = try container.decodeIfPresent(Bool.self, forKey: .deskew) ?? true
|
||
self.denoise = try container.decodeIfPresent(Bool.self, forKey: .denoise) ?? false
|
||
self.contrastEnhance = try container.decodeIfPresent(Bool.self, forKey: .contrastEnhance) ?? false
|
||
self.binarizationMethod = try container.decodeIfPresent(String.self, forKey: .binarizationMethod) ?? "otsu"
|
||
self.invertColors = try container.decodeIfPresent(Bool.self, forKey: .invertColors) ?? false
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ImagePreprocessingConfig
|
||
internal extension ImagePreprocessingConfig {
|
||
init(_ rb: RustBridge.ImagePreprocessingConfigRef) throws {
|
||
self.targetDpi = rb.targetDpi()
|
||
self.autoRotate = rb.autoRotate()
|
||
self.deskew = rb.deskew()
|
||
self.denoise = rb.denoise()
|
||
self.contrastEnhance = rb.contrastEnhance()
|
||
self.binarizationMethod = rb.binarizationMethod().toString()
|
||
self.invertColors = rb.invertColors()
|
||
}
|
||
func intoRust() throws -> RustBridge.ImagePreprocessingConfig {
|
||
return RustBridge.ImagePreprocessingConfig(self.targetDpi, self.autoRotate, self.deskew, self.denoise, self.contrastEnhance, RustString(self.binarizationMethod), self.invertColors)
|
||
}
|
||
}
|
||
|
||
/// Tesseract OCR configuration.
|
||
///
|
||
/// Provides fine-grained control over Tesseract OCR engine parameters.
|
||
/// Most users can use the defaults, but these settings allow optimization
|
||
/// for specific document types (invoices, handwriting, etc.).
|
||
public struct TesseractConfig: Codable, Sendable, Hashable {
|
||
/// Language code (e.g., "eng", "deu", "fra")
|
||
public let language: String
|
||
/// Page Segmentation Mode (0-13).
|
||
///
|
||
/// Common values:
|
||
/// - 3: Fully automatic page segmentation (native default)
|
||
/// - 6: Assume a single uniform block of text (WASM default — avoids layout-analysis hang)
|
||
/// - 11: Sparse text with no particular order
|
||
public let psm: Int32
|
||
/// Output format ("text" or "markdown")
|
||
public let outputFormat: String
|
||
/// OCR Engine Mode (0-3).
|
||
///
|
||
/// - 0: Legacy engine only
|
||
/// - 1: Neural nets (LSTM) only (usually best)
|
||
/// - 2: Legacy + LSTM
|
||
/// - 3: Default (based on what's available)
|
||
public let oem: Int32
|
||
/// Minimum confidence threshold (0.0-100.0).
|
||
///
|
||
/// Words with confidence below this threshold may be rejected or flagged.
|
||
public let minConfidence: Double
|
||
/// Image preprocessing configuration.
|
||
///
|
||
/// Controls how images are preprocessed before OCR. Can significantly
|
||
/// improve quality for scanned documents or low-quality images.
|
||
public let preprocessing: ImagePreprocessingConfig?
|
||
/// Enable automatic table detection and reconstruction
|
||
public let enableTableDetection: Bool
|
||
/// Minimum confidence threshold for table detection (0.0-1.0)
|
||
public let tableMinConfidence: Double
|
||
/// Column threshold for table detection (pixels)
|
||
public let tableColumnThreshold: Int32
|
||
/// Row threshold ratio for table detection (0.0-1.0)
|
||
public let tableRowThresholdRatio: Double
|
||
/// Enable OCR result caching
|
||
public let useCache: Bool
|
||
/// Use pre-adapted templates for character classification
|
||
public let classifyUsePreAdaptedTemplates: Bool
|
||
/// Enable N-gram language model
|
||
public let languageModelNgramOn: Bool
|
||
/// Don't reject good words during block-level processing
|
||
public let tesseditDontBlkrejGoodWds: Bool
|
||
/// Don't reject good words during row-level processing
|
||
public let tesseditDontRowrejGoodWds: Bool
|
||
/// Enable dictionary correction
|
||
public let tesseditEnableDictCorrection: Bool
|
||
/// Whitelist of allowed characters (empty = all allowed)
|
||
public let tesseditCharWhitelist: String
|
||
/// Blacklist of forbidden characters (empty = none forbidden)
|
||
public let tesseditCharBlacklist: String
|
||
/// Use primary language params model
|
||
public let tesseditUsePrimaryParamsModel: Bool
|
||
/// Variable-width space detection
|
||
public let textordSpaceSizeIsVariable: Bool
|
||
/// Use adaptive thresholding method
|
||
public let thresholdingMethod: Bool
|
||
public init(language: String, psm: Int32, outputFormat: String, oem: Int32, minConfidence: Double, preprocessing: ImagePreprocessingConfig? = nil, enableTableDetection: Bool, tableMinConfidence: Double, tableColumnThreshold: Int32, tableRowThresholdRatio: Double, useCache: Bool, classifyUsePreAdaptedTemplates: Bool, languageModelNgramOn: Bool, tesseditDontBlkrejGoodWds: Bool, tesseditDontRowrejGoodWds: Bool, tesseditEnableDictCorrection: Bool, tesseditCharWhitelist: String, tesseditCharBlacklist: String, tesseditUsePrimaryParamsModel: Bool, textordSpaceSizeIsVariable: Bool, thresholdingMethod: Bool) {
|
||
self.language = language
|
||
self.psm = psm
|
||
self.outputFormat = outputFormat
|
||
self.oem = oem
|
||
self.minConfidence = minConfidence
|
||
self.preprocessing = preprocessing
|
||
self.enableTableDetection = enableTableDetection
|
||
self.tableMinConfidence = tableMinConfidence
|
||
self.tableColumnThreshold = tableColumnThreshold
|
||
self.tableRowThresholdRatio = tableRowThresholdRatio
|
||
self.useCache = useCache
|
||
self.classifyUsePreAdaptedTemplates = classifyUsePreAdaptedTemplates
|
||
self.languageModelNgramOn = languageModelNgramOn
|
||
self.tesseditDontBlkrejGoodWds = tesseditDontBlkrejGoodWds
|
||
self.tesseditDontRowrejGoodWds = tesseditDontRowrejGoodWds
|
||
self.tesseditEnableDictCorrection = tesseditEnableDictCorrection
|
||
self.tesseditCharWhitelist = tesseditCharWhitelist
|
||
self.tesseditCharBlacklist = tesseditCharBlacklist
|
||
self.tesseditUsePrimaryParamsModel = tesseditUsePrimaryParamsModel
|
||
self.textordSpaceSizeIsVariable = textordSpaceSizeIsVariable
|
||
self.thresholdingMethod = thresholdingMethod
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case language = "language"
|
||
case psm = "psm"
|
||
case outputFormat = "output_format"
|
||
case oem = "oem"
|
||
case minConfidence = "min_confidence"
|
||
case preprocessing = "preprocessing"
|
||
case enableTableDetection = "enable_table_detection"
|
||
case tableMinConfidence = "table_min_confidence"
|
||
case tableColumnThreshold = "table_column_threshold"
|
||
case tableRowThresholdRatio = "table_row_threshold_ratio"
|
||
case useCache = "use_cache"
|
||
case classifyUsePreAdaptedTemplates = "classify_use_pre_adapted_templates"
|
||
case languageModelNgramOn = "language_model_ngram_on"
|
||
case tesseditDontBlkrejGoodWds = "tessedit_dont_blkrej_good_wds"
|
||
case tesseditDontRowrejGoodWds = "tessedit_dont_rowrej_good_wds"
|
||
case tesseditEnableDictCorrection = "tessedit_enable_dict_correction"
|
||
case tesseditCharWhitelist = "tessedit_char_whitelist"
|
||
case tesseditCharBlacklist = "tessedit_char_blacklist"
|
||
case tesseditUsePrimaryParamsModel = "tessedit_use_primary_params_model"
|
||
case textordSpaceSizeIsVariable = "textord_space_size_is_variable"
|
||
case thresholdingMethod = "thresholding_method"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.language = try container.decodeIfPresent(String.self, forKey: .language) ?? "eng"
|
||
self.psm = try container.decodeIfPresent(Int32.self, forKey: .psm) ?? 3
|
||
self.outputFormat = try container.decodeIfPresent(String.self, forKey: .outputFormat) ?? "markdown"
|
||
self.oem = try container.decodeIfPresent(Int32.self, forKey: .oem) ?? 3
|
||
self.minConfidence = try container.decodeIfPresent(Double.self, forKey: .minConfidence) ?? 0.0
|
||
self.preprocessing = try container.decodeIfPresent(ImagePreprocessingConfig.self, forKey: .preprocessing) ?? nil
|
||
self.enableTableDetection = try container.decodeIfPresent(Bool.self, forKey: .enableTableDetection) ?? true
|
||
self.tableMinConfidence = try container.decodeIfPresent(Double.self, forKey: .tableMinConfidence) ?? 0.0
|
||
self.tableColumnThreshold = try container.decodeIfPresent(Int32.self, forKey: .tableColumnThreshold) ?? 50
|
||
self.tableRowThresholdRatio = try container.decodeIfPresent(Double.self, forKey: .tableRowThresholdRatio) ?? 0.5
|
||
self.useCache = try container.decodeIfPresent(Bool.self, forKey: .useCache) ?? true
|
||
self.classifyUsePreAdaptedTemplates = try container.decodeIfPresent(Bool.self, forKey: .classifyUsePreAdaptedTemplates) ?? true
|
||
self.languageModelNgramOn = try container.decodeIfPresent(Bool.self, forKey: .languageModelNgramOn) ?? false
|
||
self.tesseditDontBlkrejGoodWds = try container.decodeIfPresent(Bool.self, forKey: .tesseditDontBlkrejGoodWds) ?? true
|
||
self.tesseditDontRowrejGoodWds = try container.decodeIfPresent(Bool.self, forKey: .tesseditDontRowrejGoodWds) ?? true
|
||
self.tesseditEnableDictCorrection = try container.decodeIfPresent(Bool.self, forKey: .tesseditEnableDictCorrection) ?? true
|
||
self.tesseditCharWhitelist = try container.decodeIfPresent(String.self, forKey: .tesseditCharWhitelist) ?? ""
|
||
self.tesseditCharBlacklist = try container.decodeIfPresent(String.self, forKey: .tesseditCharBlacklist) ?? ""
|
||
self.tesseditUsePrimaryParamsModel = try container.decodeIfPresent(Bool.self, forKey: .tesseditUsePrimaryParamsModel) ?? true
|
||
self.textordSpaceSizeIsVariable = try container.decodeIfPresent(Bool.self, forKey: .textordSpaceSizeIsVariable) ?? true
|
||
self.thresholdingMethod = try container.decodeIfPresent(Bool.self, forKey: .thresholdingMethod) ?? false
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for TesseractConfig
|
||
internal extension TesseractConfig {
|
||
init(_ rb: RustBridge.TesseractConfigRef) throws {
|
||
self.language = rb.language().toString()
|
||
self.psm = rb.psm()
|
||
self.outputFormat = rb.outputFormat().toString()
|
||
self.oem = rb.oem()
|
||
self.minConfidence = rb.minConfidence()
|
||
self.preprocessing = try rb.preprocessing().map { try ImagePreprocessingConfig($0) }
|
||
self.enableTableDetection = rb.enableTableDetection()
|
||
self.tableMinConfidence = rb.tableMinConfidence()
|
||
self.tableColumnThreshold = rb.tableColumnThreshold()
|
||
self.tableRowThresholdRatio = rb.tableRowThresholdRatio()
|
||
self.useCache = rb.useCache()
|
||
self.classifyUsePreAdaptedTemplates = rb.classifyUsePreAdaptedTemplates()
|
||
self.languageModelNgramOn = rb.languageModelNgramOn()
|
||
self.tesseditDontBlkrejGoodWds = rb.tesseditDontBlkrejGoodWds()
|
||
self.tesseditDontRowrejGoodWds = rb.tesseditDontRowrejGoodWds()
|
||
self.tesseditEnableDictCorrection = rb.tesseditEnableDictCorrection()
|
||
self.tesseditCharWhitelist = rb.tesseditCharWhitelist().toString()
|
||
self.tesseditCharBlacklist = rb.tesseditCharBlacklist().toString()
|
||
self.tesseditUsePrimaryParamsModel = rb.tesseditUsePrimaryParamsModel()
|
||
self.textordSpaceSizeIsVariable = rb.textordSpaceSizeIsVariable()
|
||
self.thresholdingMethod = rb.thresholdingMethod()
|
||
}
|
||
func intoRust() throws -> RustBridge.TesseractConfig {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.tesseractConfigFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Image preprocessing metadata.
|
||
///
|
||
/// Tracks the transformations applied to an image during OCR preprocessing,
|
||
/// including DPI normalization, resizing, and resampling.
|
||
public struct ImagePreprocessingMetadata: Codable, Sendable, Hashable {
|
||
/// Original image dimensions (width, height) in pixels
|
||
public let originalDimensions: [UInt]
|
||
/// Original image DPI (horizontal, vertical)
|
||
public let originalDpi: [Double]
|
||
/// Target DPI from configuration
|
||
public let targetDpi: Int32
|
||
/// Scaling factor applied to the image
|
||
public let scaleFactor: Double
|
||
/// Whether DPI was auto-adjusted based on content
|
||
public let autoAdjusted: Bool
|
||
/// Final DPI after processing
|
||
public let finalDpi: Int32
|
||
/// New dimensions after resizing (if resized)
|
||
public let newDimensions: [UInt]?
|
||
/// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
|
||
public let resampleMethod: String
|
||
/// Whether dimensions were clamped to max_image_dimension
|
||
public let dimensionClamped: Bool
|
||
/// Calculated optimal DPI (if auto_adjust_dpi enabled)
|
||
public let calculatedDpi: Int32?
|
||
/// Whether resize was skipped (dimensions already optimal)
|
||
public let skippedResize: Bool
|
||
/// Error message if resize failed
|
||
public let resizeError: String?
|
||
public init(originalDimensions: [UInt], originalDpi: [Double], targetDpi: Int32, scaleFactor: Double, autoAdjusted: Bool, finalDpi: Int32, newDimensions: [UInt]? = nil, resampleMethod: String, dimensionClamped: Bool, calculatedDpi: Int32? = nil, skippedResize: Bool, resizeError: String? = nil) {
|
||
self.originalDimensions = originalDimensions
|
||
self.originalDpi = originalDpi
|
||
self.targetDpi = targetDpi
|
||
self.scaleFactor = scaleFactor
|
||
self.autoAdjusted = autoAdjusted
|
||
self.finalDpi = finalDpi
|
||
self.newDimensions = newDimensions
|
||
self.resampleMethod = resampleMethod
|
||
self.dimensionClamped = dimensionClamped
|
||
self.calculatedDpi = calculatedDpi
|
||
self.skippedResize = skippedResize
|
||
self.resizeError = resizeError
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case originalDimensions = "original_dimensions"
|
||
case originalDpi = "original_dpi"
|
||
case targetDpi = "target_dpi"
|
||
case scaleFactor = "scale_factor"
|
||
case autoAdjusted = "auto_adjusted"
|
||
case finalDpi = "final_dpi"
|
||
case newDimensions = "new_dimensions"
|
||
case resampleMethod = "resample_method"
|
||
case dimensionClamped = "dimension_clamped"
|
||
case calculatedDpi = "calculated_dpi"
|
||
case skippedResize = "skipped_resize"
|
||
case resizeError = "resize_error"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ImagePreprocessingMetadata
|
||
internal extension ImagePreprocessingMetadata {
|
||
init(_ rb: RustBridge.ImagePreprocessingMetadataRef) throws {
|
||
self.originalDimensions = Array(rb.originalDimensions())
|
||
self.originalDpi = Array(rb.originalDpi())
|
||
self.targetDpi = rb.targetDpi()
|
||
self.scaleFactor = rb.scaleFactor()
|
||
self.autoAdjusted = rb.autoAdjusted()
|
||
self.finalDpi = rb.finalDpi()
|
||
self.newDimensions = rb.newDimensions().map { Array($0) }
|
||
self.resampleMethod = rb.resampleMethod().toString()
|
||
self.dimensionClamped = rb.dimensionClamped()
|
||
self.calculatedDpi = rb.calculatedDpi()
|
||
self.skippedResize = rb.skippedResize()
|
||
self.resizeError = rb.resizeError()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.ImagePreprocessingMetadata {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.imagePreprocessingMetadataFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Extraction result metadata.
|
||
///
|
||
/// Contains common fields applicable to all formats, format-specific metadata
|
||
/// via a discriminated union, and additional custom fields from postprocessors.
|
||
public typealias Metadata = RustBridge.Metadata
|
||
|
||
/// Excel/spreadsheet format metadata.
|
||
///
|
||
/// Identifies the document as a spreadsheet source via the `FormatMetadata::Excel`
|
||
/// discriminant. Sheet count and sheet names are stored inside this struct.
|
||
public struct ExcelMetadata: Codable, Sendable, Hashable {
|
||
/// Number of sheets in the workbook.
|
||
public let sheetCount: UInt32?
|
||
/// Names of all sheets in the workbook.
|
||
public let sheetNames: [String]?
|
||
public init(sheetCount: UInt32? = nil, sheetNames: [String]? = nil) {
|
||
self.sheetCount = sheetCount
|
||
self.sheetNames = sheetNames
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case sheetCount = "sheet_count"
|
||
case sheetNames = "sheet_names"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.sheetCount = try container.decodeIfPresent(UInt32.self, forKey: .sheetCount) ?? nil
|
||
self.sheetNames = try container.decodeIfPresent([String].self, forKey: .sheetNames) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ExcelMetadata
|
||
internal extension ExcelMetadata {
|
||
init(_ rb: RustBridge.ExcelMetadataRef) throws {
|
||
self.sheetCount = rb.sheetCount()
|
||
self.sheetNames = rb.sheetNames()?.map { $0.as_str().toString() }
|
||
}
|
||
func intoRust() throws -> RustBridge.ExcelMetadata {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.excelMetadataFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Email metadata extracted from .eml and .msg files.
|
||
///
|
||
/// Includes sender/recipient information, message ID, and attachment list.
|
||
public struct EmailMetadata: Codable, Sendable, Hashable {
|
||
/// Sender's email address
|
||
public let fromEmail: String?
|
||
/// Sender's display name
|
||
public let fromName: String?
|
||
/// Primary recipients
|
||
public let toEmails: [String]
|
||
/// CC recipients
|
||
public let ccEmails: [String]
|
||
/// BCC recipients
|
||
public let bccEmails: [String]
|
||
/// Message-ID header value
|
||
public let messageId: String?
|
||
/// List of attachment filenames
|
||
public let attachments: [String]
|
||
public init(fromEmail: String? = nil, fromName: String? = nil, toEmails: [String], ccEmails: [String], bccEmails: [String], messageId: String? = nil, attachments: [String]) {
|
||
self.fromEmail = fromEmail
|
||
self.fromName = fromName
|
||
self.toEmails = toEmails
|
||
self.ccEmails = ccEmails
|
||
self.bccEmails = bccEmails
|
||
self.messageId = messageId
|
||
self.attachments = attachments
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case fromEmail = "from_email"
|
||
case fromName = "from_name"
|
||
case toEmails = "to_emails"
|
||
case ccEmails = "cc_emails"
|
||
case bccEmails = "bcc_emails"
|
||
case messageId = "message_id"
|
||
case attachments = "attachments"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.fromEmail = try container.decodeIfPresent(String.self, forKey: .fromEmail) ?? nil
|
||
self.fromName = try container.decodeIfPresent(String.self, forKey: .fromName) ?? nil
|
||
self.toEmails = try container.decodeIfPresent([String].self, forKey: .toEmails) ?? []
|
||
self.ccEmails = try container.decodeIfPresent([String].self, forKey: .ccEmails) ?? []
|
||
self.bccEmails = try container.decodeIfPresent([String].self, forKey: .bccEmails) ?? []
|
||
self.messageId = try container.decodeIfPresent(String.self, forKey: .messageId) ?? nil
|
||
self.attachments = try container.decodeIfPresent([String].self, forKey: .attachments) ?? []
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for EmailMetadata
|
||
internal extension EmailMetadata {
|
||
init(_ rb: RustBridge.EmailMetadataRef) throws {
|
||
self.fromEmail = rb.fromEmail()?.toString()
|
||
self.fromName = rb.fromName()?.toString()
|
||
self.toEmails = rb.toEmails().map { $0.as_str().toString() }
|
||
self.ccEmails = rb.ccEmails().map { $0.as_str().toString() }
|
||
self.bccEmails = rb.bccEmails().map { $0.as_str().toString() }
|
||
self.messageId = rb.messageId()?.toString()
|
||
self.attachments = rb.attachments().map { $0.as_str().toString() }
|
||
}
|
||
func intoRust() throws -> RustBridge.EmailMetadata {
|
||
let __toEmails = RustVec<RustString>()
|
||
for __elem in self.toEmails { __toEmails.push(value: RustString(__elem)) }
|
||
let __ccEmails = RustVec<RustString>()
|
||
for __elem in self.ccEmails { __ccEmails.push(value: RustString(__elem)) }
|
||
let __bccEmails = RustVec<RustString>()
|
||
for __elem in self.bccEmails { __bccEmails.push(value: RustString(__elem)) }
|
||
let __attachments = RustVec<RustString>()
|
||
for __elem in self.attachments { __attachments.push(value: RustString(__elem)) }
|
||
return RustBridge.EmailMetadata(self.fromEmail.map(RustString.init), self.fromName.map(RustString.init), __toEmails, __ccEmails, __bccEmails, self.messageId.map(RustString.init), __attachments)
|
||
}
|
||
}
|
||
|
||
/// Archive (ZIP/TAR/7Z) metadata.
|
||
///
|
||
/// Extracted from compressed archive files containing file lists and size information.
|
||
public struct ArchiveMetadata: Codable, Sendable, Hashable {
|
||
/// Archive format ("ZIP", "TAR", "7Z", etc.)
|
||
public let format: String
|
||
/// Total number of files in the archive
|
||
public let fileCount: UInt32
|
||
/// List of file paths within the archive
|
||
public let fileList: [String]
|
||
/// Total uncompressed size in bytes
|
||
public let totalSize: UInt64
|
||
/// Compressed size in bytes (if available)
|
||
public let compressedSize: UInt64?
|
||
public init(format: String, fileCount: UInt32, fileList: [String], totalSize: UInt64, compressedSize: UInt64? = nil) {
|
||
self.format = format
|
||
self.fileCount = fileCount
|
||
self.fileList = fileList
|
||
self.totalSize = totalSize
|
||
self.compressedSize = compressedSize
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case format = "format"
|
||
case fileCount = "file_count"
|
||
case fileList = "file_list"
|
||
case totalSize = "total_size"
|
||
case compressedSize = "compressed_size"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.format = try container.decodeIfPresent(String.self, forKey: .format) ?? ""
|
||
self.fileCount = try container.decodeIfPresent(UInt32.self, forKey: .fileCount) ?? 0
|
||
self.fileList = try container.decodeIfPresent([String].self, forKey: .fileList) ?? []
|
||
self.totalSize = try container.decodeIfPresent(UInt64.self, forKey: .totalSize) ?? 0
|
||
self.compressedSize = try container.decodeIfPresent(UInt64.self, forKey: .compressedSize) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ArchiveMetadata
|
||
internal extension ArchiveMetadata {
|
||
init(_ rb: RustBridge.ArchiveMetadataRef) throws {
|
||
self.format = rb.format().toString()
|
||
self.fileCount = rb.fileCount()
|
||
self.fileList = rb.fileList().map { $0.as_str().toString() }
|
||
self.totalSize = rb.totalSize()
|
||
self.compressedSize = rb.compressedSize()
|
||
}
|
||
func intoRust() throws -> RustBridge.ArchiveMetadata {
|
||
let __fileList = RustVec<RustString>()
|
||
for __elem in self.fileList { __fileList.push(value: RustString(__elem)) }
|
||
return RustBridge.ArchiveMetadata(RustString(self.format), self.fileCount, __fileList, self.totalSize, self.compressedSize)
|
||
}
|
||
}
|
||
|
||
/// Image metadata extracted from image files.
|
||
///
|
||
/// Includes dimensions, format, and EXIF data.
|
||
public typealias ImageMetadata = RustBridge.ImageMetadata
|
||
|
||
/// XML metadata extracted during XML parsing.
|
||
///
|
||
/// Provides statistics about XML document structure.
|
||
public struct XmlMetadata: Codable, Sendable, Hashable {
|
||
/// Total number of XML elements processed
|
||
public let elementCount: UInt32
|
||
/// List of unique element tag names (sorted)
|
||
public let uniqueElements: [String]
|
||
public init(elementCount: UInt32, uniqueElements: [String]) {
|
||
self.elementCount = elementCount
|
||
self.uniqueElements = uniqueElements
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case elementCount = "element_count"
|
||
case uniqueElements = "unique_elements"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.elementCount = try container.decodeIfPresent(UInt32.self, forKey: .elementCount) ?? 0
|
||
self.uniqueElements = try container.decodeIfPresent([String].self, forKey: .uniqueElements) ?? []
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for XmlMetadata
|
||
internal extension XmlMetadata {
|
||
init(_ rb: RustBridge.XmlMetadataRef) throws {
|
||
self.elementCount = rb.elementCount()
|
||
self.uniqueElements = rb.uniqueElements().map { $0.as_str().toString() }
|
||
}
|
||
func intoRust() throws -> RustBridge.XmlMetadata {
|
||
let __uniqueElements = RustVec<RustString>()
|
||
for __elem in self.uniqueElements { __uniqueElements.push(value: RustString(__elem)) }
|
||
return RustBridge.XmlMetadata(self.elementCount, __uniqueElements)
|
||
}
|
||
}
|
||
|
||
/// Text/Markdown metadata.
|
||
///
|
||
/// Extracted from plain text and Markdown files. Includes word counts and,
|
||
/// for Markdown, structural elements like headers and links.
|
||
public struct TextMetadata: Codable, Sendable, Hashable {
|
||
/// Number of lines in the document
|
||
public let lineCount: UInt32
|
||
/// Number of words
|
||
public let wordCount: UInt32
|
||
/// Number of characters
|
||
public let characterCount: UInt32
|
||
/// Markdown headers (headings text only, for Markdown files)
|
||
public let headers: [String]?
|
||
/// Markdown links as (text, url) tuples (for Markdown files)
|
||
public let links: [[String]]?
|
||
/// Code blocks as (language, code) tuples (for Markdown files)
|
||
public let codeBlocks: [[String]]?
|
||
public init(lineCount: UInt32, wordCount: UInt32, characterCount: UInt32, headers: [String]? = nil, links: [[String]]? = nil, codeBlocks: [[String]]? = nil) {
|
||
self.lineCount = lineCount
|
||
self.wordCount = wordCount
|
||
self.characterCount = characterCount
|
||
self.headers = headers
|
||
self.links = links
|
||
self.codeBlocks = codeBlocks
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case lineCount = "line_count"
|
||
case wordCount = "word_count"
|
||
case characterCount = "character_count"
|
||
case headers = "headers"
|
||
case links = "links"
|
||
case codeBlocks = "code_blocks"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.lineCount = try container.decodeIfPresent(UInt32.self, forKey: .lineCount) ?? 0
|
||
self.wordCount = try container.decodeIfPresent(UInt32.self, forKey: .wordCount) ?? 0
|
||
self.characterCount = try container.decodeIfPresent(UInt32.self, forKey: .characterCount) ?? 0
|
||
self.headers = try container.decodeIfPresent([String].self, forKey: .headers) ?? nil
|
||
self.links = try container.decodeIfPresent([[String]].self, forKey: .links) ?? nil
|
||
self.codeBlocks = try container.decodeIfPresent([[String]].self, forKey: .codeBlocks) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for TextMetadata
|
||
internal extension TextMetadata {
|
||
init(_ rb: RustBridge.TextMetadataRef) throws {
|
||
self.lineCount = rb.lineCount()
|
||
self.wordCount = rb.wordCount()
|
||
self.characterCount = rb.characterCount()
|
||
self.headers = rb.headers()?.map { $0.as_str().toString() }
|
||
self.links = nil
|
||
self.codeBlocks = nil
|
||
}
|
||
func intoRust() throws -> RustBridge.TextMetadata {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.textMetadataFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Header/heading element metadata.
|
||
public struct HeaderMetadata: Codable, Sendable, Hashable {
|
||
/// Header level: 1 (h1) through 6 (h6)
|
||
public let level: UInt8
|
||
/// Normalized text content of the header
|
||
public let text: String
|
||
/// HTML id attribute if present
|
||
public let id: String?
|
||
/// Document tree depth at the header element
|
||
public let depth: UInt32
|
||
/// Byte offset in original HTML document
|
||
public let htmlOffset: UInt32
|
||
public init(level: UInt8, text: String, id: String? = nil, depth: UInt32, htmlOffset: UInt32) {
|
||
self.level = level
|
||
self.text = text
|
||
self.id = id
|
||
self.depth = depth
|
||
self.htmlOffset = htmlOffset
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case level = "level"
|
||
case text = "text"
|
||
case id = "id"
|
||
case depth = "depth"
|
||
case htmlOffset = "html_offset"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for HeaderMetadata
|
||
internal extension HeaderMetadata {
|
||
init(_ rb: RustBridge.HeaderMetadataRef) throws {
|
||
self.level = rb.level()
|
||
self.text = rb.text().toString()
|
||
self.id = rb.id()?.toString()
|
||
self.depth = rb.depth()
|
||
self.htmlOffset = rb.htmlOffset()
|
||
}
|
||
func intoRust() throws -> RustBridge.HeaderMetadata {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.headerMetadataFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Link element metadata.
|
||
public struct LinkMetadata: Codable, Sendable, Hashable {
|
||
/// The href URL value
|
||
public let href: String
|
||
/// Link text content (normalized)
|
||
public let text: String
|
||
/// Optional title attribute
|
||
public let title: String?
|
||
/// Link type classification
|
||
public let linkType: LinkType
|
||
/// Rel attribute values
|
||
public let rel: [String]
|
||
/// Additional attributes as key-value pairs
|
||
public let attributes: [[String]]
|
||
public init(href: String, text: String, title: String? = nil, linkType: LinkType, rel: [String], attributes: [[String]]) {
|
||
self.href = href
|
||
self.text = text
|
||
self.title = title
|
||
self.linkType = linkType
|
||
self.rel = rel
|
||
self.attributes = attributes
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case href = "href"
|
||
case text = "text"
|
||
case title = "title"
|
||
case linkType = "link_type"
|
||
case rel = "rel"
|
||
case attributes = "attributes"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for LinkMetadata
|
||
internal extension LinkMetadata {
|
||
init(_ rb: RustBridge.LinkMetadataRef) throws {
|
||
self.href = rb.href().toString()
|
||
self.text = rb.text().toString()
|
||
self.title = rb.title()?.toString()
|
||
self.linkType = LinkType(rawValue: rb.linkType().toString()) ?? { fatalError("Unknown LinkType: \(rb.linkType().toString())") }()
|
||
self.rel = rb.rel().map { $0.as_str().toString() }
|
||
self.attributes = try JSONDecoder().decode([[String]].self, from: Data("null".utf8))
|
||
}
|
||
func intoRust() throws -> RustBridge.LinkMetadata {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.linkMetadataFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Image element metadata.
|
||
public struct ImageMetadataType: Codable, Sendable, Hashable {
|
||
/// Image source (URL, data URI, or SVG content)
|
||
public let src: String
|
||
/// Alternative text from alt attribute
|
||
public let alt: String?
|
||
/// Title attribute
|
||
public let title: String?
|
||
/// Image dimensions as (width, height) if available
|
||
public let dimensions: [UInt32]?
|
||
/// Image type classification
|
||
public let imageType: ImageType
|
||
/// Additional attributes as key-value pairs
|
||
public let attributes: [[String]]
|
||
public init(src: String, alt: String? = nil, title: String? = nil, dimensions: [UInt32]? = nil, imageType: ImageType, attributes: [[String]]) {
|
||
self.src = src
|
||
self.alt = alt
|
||
self.title = title
|
||
self.dimensions = dimensions
|
||
self.imageType = imageType
|
||
self.attributes = attributes
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case src = "src"
|
||
case alt = "alt"
|
||
case title = "title"
|
||
case dimensions = "dimensions"
|
||
case imageType = "image_type"
|
||
case attributes = "attributes"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ImageMetadataType
|
||
internal extension ImageMetadataType {
|
||
init(_ rb: RustBridge.ImageMetadataTypeRef) throws {
|
||
self.src = rb.src().toString()
|
||
self.alt = rb.alt()?.toString()
|
||
self.title = rb.title()?.toString()
|
||
self.dimensions = rb.dimensions().map { Array($0) }
|
||
self.imageType = ImageType(rawValue: rb.imageType().toString()) ?? { fatalError("Unknown ImageType: \(rb.imageType().toString())") }()
|
||
self.attributes = try JSONDecoder().decode([[String]].self, from: Data("null".utf8))
|
||
}
|
||
func intoRust() throws -> RustBridge.ImageMetadataType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.imageMetadataTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Structured data (Schema.org, microdata, RDFa) block.
|
||
public struct StructuredData: Codable, Sendable, Hashable {
|
||
/// Type of structured data
|
||
public let dataType: StructuredDataType
|
||
/// Raw JSON string representation
|
||
public let rawJson: String
|
||
/// Schema type if detectable (e.g., "Article", "Event", "Product")
|
||
public let schemaType: String?
|
||
public init(dataType: StructuredDataType, rawJson: String, schemaType: String? = nil) {
|
||
self.dataType = dataType
|
||
self.rawJson = rawJson
|
||
self.schemaType = schemaType
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case dataType = "data_type"
|
||
case rawJson = "raw_json"
|
||
case schemaType = "schema_type"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for StructuredData
|
||
internal extension StructuredData {
|
||
init(_ rb: RustBridge.StructuredDataRef) throws {
|
||
self.dataType = StructuredDataType(rawValue: rb.dataType().toString()) ?? { fatalError("Unknown StructuredDataType: \(rb.dataType().toString())") }()
|
||
self.rawJson = rb.rawJson().toString()
|
||
self.schemaType = rb.schemaType()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.StructuredData {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.structuredDataFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// HTML metadata extracted from HTML documents.
|
||
///
|
||
/// Includes document-level metadata, Open Graph data, Twitter Card metadata,
|
||
/// and extracted structural elements (headers, links, images, structured data).
|
||
public typealias HtmlMetadata = RustBridge.HtmlMetadata
|
||
|
||
/// OCR processing metadata.
|
||
///
|
||
/// Captures information about OCR processing configuration and results.
|
||
public struct OcrMetadata: Codable, Sendable, Hashable {
|
||
/// OCR language code(s) used
|
||
public let language: String
|
||
/// Tesseract Page Segmentation Mode (PSM)
|
||
public let psm: Int32
|
||
/// Output format (e.g., "text", "hocr")
|
||
public let outputFormat: String
|
||
/// Number of tables detected
|
||
public let tableCount: UInt32
|
||
public let tableRows: UInt32?
|
||
public let tableCols: UInt32?
|
||
public init(language: String, psm: Int32, outputFormat: String, tableCount: UInt32, tableRows: UInt32? = nil, tableCols: UInt32? = nil) {
|
||
self.language = language
|
||
self.psm = psm
|
||
self.outputFormat = outputFormat
|
||
self.tableCount = tableCount
|
||
self.tableRows = tableRows
|
||
self.tableCols = tableCols
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case language = "language"
|
||
case psm = "psm"
|
||
case outputFormat = "output_format"
|
||
case tableCount = "table_count"
|
||
case tableRows = "table_rows"
|
||
case tableCols = "table_cols"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.language = try container.decodeIfPresent(String.self, forKey: .language) ?? ""
|
||
self.psm = try container.decodeIfPresent(Int32.self, forKey: .psm) ?? 0
|
||
self.outputFormat = try container.decodeIfPresent(String.self, forKey: .outputFormat) ?? ""
|
||
self.tableCount = try container.decodeIfPresent(UInt32.self, forKey: .tableCount) ?? 0
|
||
self.tableRows = try container.decodeIfPresent(UInt32.self, forKey: .tableRows) ?? nil
|
||
self.tableCols = try container.decodeIfPresent(UInt32.self, forKey: .tableCols) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for OcrMetadata
|
||
internal extension OcrMetadata {
|
||
init(_ rb: RustBridge.OcrMetadataRef) throws {
|
||
self.language = rb.language().toString()
|
||
self.psm = rb.psm()
|
||
self.outputFormat = rb.outputFormat().toString()
|
||
self.tableCount = rb.tableCount()
|
||
self.tableRows = rb.tableRows()
|
||
self.tableCols = rb.tableCols()
|
||
}
|
||
func intoRust() throws -> RustBridge.OcrMetadata {
|
||
return RustBridge.OcrMetadata(RustString(self.language), self.psm, RustString(self.outputFormat), self.tableCount, self.tableRows, self.tableCols)
|
||
}
|
||
}
|
||
|
||
/// Error metadata (for batch operations).
|
||
public struct ErrorMetadata: Codable, Sendable, Hashable {
|
||
public let errorType: String
|
||
public let message: String
|
||
public init(errorType: String, message: String) {
|
||
self.errorType = errorType
|
||
self.message = message
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case errorType = "error_type"
|
||
case message = "message"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ErrorMetadata
|
||
internal extension ErrorMetadata {
|
||
init(_ rb: RustBridge.ErrorMetadataRef) throws {
|
||
self.errorType = rb.errorType().toString()
|
||
self.message = rb.message().toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.ErrorMetadata {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.errorMetadataFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// PowerPoint presentation metadata.
|
||
///
|
||
/// Extracted from PPTX files containing slide counts and presentation details.
|
||
public struct PptxMetadata: Codable, Sendable, Hashable {
|
||
/// Total number of slides in the presentation
|
||
public let slideCount: UInt32
|
||
/// Names of slides (if available)
|
||
public let slideNames: [String]
|
||
/// Number of embedded images
|
||
public let imageCount: UInt32?
|
||
/// Number of tables
|
||
public let tableCount: UInt32?
|
||
public init(slideCount: UInt32, slideNames: [String], imageCount: UInt32? = nil, tableCount: UInt32? = nil) {
|
||
self.slideCount = slideCount
|
||
self.slideNames = slideNames
|
||
self.imageCount = imageCount
|
||
self.tableCount = tableCount
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case slideCount = "slide_count"
|
||
case slideNames = "slide_names"
|
||
case imageCount = "image_count"
|
||
case tableCount = "table_count"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.slideCount = try container.decodeIfPresent(UInt32.self, forKey: .slideCount) ?? 0
|
||
self.slideNames = try container.decodeIfPresent([String].self, forKey: .slideNames) ?? []
|
||
self.imageCount = try container.decodeIfPresent(UInt32.self, forKey: .imageCount) ?? nil
|
||
self.tableCount = try container.decodeIfPresent(UInt32.self, forKey: .tableCount) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PptxMetadata
|
||
internal extension PptxMetadata {
|
||
init(_ rb: RustBridge.PptxMetadataRef) throws {
|
||
self.slideCount = rb.slideCount()
|
||
self.slideNames = rb.slideNames().map { $0.as_str().toString() }
|
||
self.imageCount = rb.imageCount()
|
||
self.tableCount = rb.tableCount()
|
||
}
|
||
func intoRust() throws -> RustBridge.PptxMetadata {
|
||
let __slideNames = RustVec<RustString>()
|
||
for __elem in self.slideNames { __slideNames.push(value: RustString(__elem)) }
|
||
return RustBridge.PptxMetadata(self.slideCount, __slideNames, self.imageCount, self.tableCount)
|
||
}
|
||
}
|
||
|
||
/// Word document metadata.
|
||
///
|
||
/// Extracted from DOCX files using shared Office Open XML metadata extraction.
|
||
/// Integrates with `office_metadata` module for core/app/custom properties.
|
||
public typealias DocxMetadata = RustBridge.DocxMetadata
|
||
|
||
/// CSV/TSV file metadata.
|
||
public struct CsvMetadata: Codable, Sendable, Hashable {
|
||
public let rowCount: UInt32
|
||
public let columnCount: UInt32
|
||
public let delimiter: String?
|
||
public let hasHeader: Bool
|
||
public let columnTypes: [String]?
|
||
public init(rowCount: UInt32, columnCount: UInt32, delimiter: String? = nil, hasHeader: Bool, columnTypes: [String]? = nil) {
|
||
self.rowCount = rowCount
|
||
self.columnCount = columnCount
|
||
self.delimiter = delimiter
|
||
self.hasHeader = hasHeader
|
||
self.columnTypes = columnTypes
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case rowCount = "row_count"
|
||
case columnCount = "column_count"
|
||
case delimiter = "delimiter"
|
||
case hasHeader = "has_header"
|
||
case columnTypes = "column_types"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.rowCount = try container.decodeIfPresent(UInt32.self, forKey: .rowCount) ?? 0
|
||
self.columnCount = try container.decodeIfPresent(UInt32.self, forKey: .columnCount) ?? 0
|
||
self.delimiter = try container.decodeIfPresent(String.self, forKey: .delimiter) ?? nil
|
||
self.hasHeader = try container.decodeIfPresent(Bool.self, forKey: .hasHeader) ?? false
|
||
self.columnTypes = try container.decodeIfPresent([String].self, forKey: .columnTypes) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for CsvMetadata
|
||
internal extension CsvMetadata {
|
||
init(_ rb: RustBridge.CsvMetadataRef) throws {
|
||
self.rowCount = rb.rowCount()
|
||
self.columnCount = rb.columnCount()
|
||
self.delimiter = rb.delimiter()?.toString()
|
||
self.hasHeader = rb.hasHeader()
|
||
self.columnTypes = rb.columnTypes()?.map { $0.as_str().toString() }
|
||
}
|
||
func intoRust() throws -> RustBridge.CsvMetadata {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.csvMetadataFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// BibTeX bibliography metadata.
|
||
public typealias BibtexMetadata = RustBridge.BibtexMetadata
|
||
|
||
/// Citation file metadata (RIS, PubMed, EndNote).
|
||
public struct CitationMetadata: Codable, Sendable, Hashable {
|
||
public let citationCount: UInt
|
||
public let format: String?
|
||
public let authors: [String]
|
||
public let yearRange: YearRange?
|
||
public let dois: [String]
|
||
public let keywords: [String]
|
||
public init(citationCount: UInt, format: String? = nil, authors: [String], yearRange: YearRange? = nil, dois: [String], keywords: [String]) {
|
||
self.citationCount = citationCount
|
||
self.format = format
|
||
self.authors = authors
|
||
self.yearRange = yearRange
|
||
self.dois = dois
|
||
self.keywords = keywords
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case citationCount = "citation_count"
|
||
case format = "format"
|
||
case authors = "authors"
|
||
case yearRange = "year_range"
|
||
case dois = "dois"
|
||
case keywords = "keywords"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.citationCount = try container.decodeIfPresent(UInt.self, forKey: .citationCount) ?? 0
|
||
self.format = try container.decodeIfPresent(String.self, forKey: .format) ?? nil
|
||
self.authors = try container.decodeIfPresent([String].self, forKey: .authors) ?? []
|
||
self.yearRange = try container.decodeIfPresent(YearRange.self, forKey: .yearRange) ?? nil
|
||
self.dois = try container.decodeIfPresent([String].self, forKey: .dois) ?? []
|
||
self.keywords = try container.decodeIfPresent([String].self, forKey: .keywords) ?? []
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for CitationMetadata
|
||
internal extension CitationMetadata {
|
||
init(_ rb: RustBridge.CitationMetadataRef) throws {
|
||
self.citationCount = rb.citationCount()
|
||
self.format = rb.format()?.toString()
|
||
self.authors = rb.authors().map { $0.as_str().toString() }
|
||
self.yearRange = try rb.yearRange().map { try YearRange($0) }
|
||
self.dois = rb.dois().map { $0.as_str().toString() }
|
||
self.keywords = rb.keywords().map { $0.as_str().toString() }
|
||
}
|
||
func intoRust() throws -> RustBridge.CitationMetadata {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.citationMetadataFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Year range for bibliographic metadata.
|
||
public struct YearRange: Codable, Sendable, Hashable {
|
||
public let min: UInt32?
|
||
public let max: UInt32?
|
||
public let years: [UInt32]
|
||
public init(min: UInt32? = nil, max: UInt32? = nil, years: [UInt32]) {
|
||
self.min = min
|
||
self.max = max
|
||
self.years = years
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for YearRange
|
||
internal extension YearRange {
|
||
init(_ rb: RustBridge.YearRangeRef) throws {
|
||
self.min = rb.min()
|
||
self.max = rb.max()
|
||
self.years = Array(rb.years())
|
||
}
|
||
func intoRust() throws -> RustBridge.YearRange {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.yearRangeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// FictionBook (FB2) metadata.
|
||
public struct FictionBookMetadata: Codable, Sendable, Hashable {
|
||
public let genres: [String]
|
||
public let sequences: [String]
|
||
public let annotation: String?
|
||
public init(genres: [String], sequences: [String], annotation: String? = nil) {
|
||
self.genres = genres
|
||
self.sequences = sequences
|
||
self.annotation = annotation
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case genres = "genres"
|
||
case sequences = "sequences"
|
||
case annotation = "annotation"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.genres = try container.decodeIfPresent([String].self, forKey: .genres) ?? []
|
||
self.sequences = try container.decodeIfPresent([String].self, forKey: .sequences) ?? []
|
||
self.annotation = try container.decodeIfPresent(String.self, forKey: .annotation) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for FictionBookMetadata
|
||
internal extension FictionBookMetadata {
|
||
init(_ rb: RustBridge.FictionBookMetadataRef) throws {
|
||
self.genres = rb.genres().map { $0.as_str().toString() }
|
||
self.sequences = rb.sequences().map { $0.as_str().toString() }
|
||
self.annotation = rb.annotation()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.FictionBookMetadata {
|
||
let __genres = RustVec<RustString>()
|
||
for __elem in self.genres { __genres.push(value: RustString(__elem)) }
|
||
let __sequences = RustVec<RustString>()
|
||
for __elem in self.sequences { __sequences.push(value: RustString(__elem)) }
|
||
return RustBridge.FictionBookMetadata(__genres, __sequences, self.annotation.map(RustString.init))
|
||
}
|
||
}
|
||
|
||
/// dBASE (DBF) file metadata.
|
||
public struct DbfMetadata: Codable, Sendable, Hashable {
|
||
public let recordCount: UInt
|
||
public let fieldCount: UInt
|
||
public let fields: [DbfFieldInfo]
|
||
public init(recordCount: UInt, fieldCount: UInt, fields: [DbfFieldInfo]) {
|
||
self.recordCount = recordCount
|
||
self.fieldCount = fieldCount
|
||
self.fields = fields
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case recordCount = "record_count"
|
||
case fieldCount = "field_count"
|
||
case fields = "fields"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.recordCount = try container.decodeIfPresent(UInt.self, forKey: .recordCount) ?? 0
|
||
self.fieldCount = try container.decodeIfPresent(UInt.self, forKey: .fieldCount) ?? 0
|
||
self.fields = try container.decodeIfPresent([DbfFieldInfo].self, forKey: .fields) ?? []
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DbfMetadata
|
||
internal extension DbfMetadata {
|
||
init(_ rb: RustBridge.DbfMetadataRef) throws {
|
||
self.recordCount = rb.recordCount()
|
||
self.fieldCount = rb.fieldCount()
|
||
self.fields = try rb.fields().map { try DbfFieldInfo($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.DbfMetadata {
|
||
let __fields = RustVec<RustBridge.DbfFieldInfo>()
|
||
for __elem in self.fields { __fields.push(value: try __elem.intoRust()) }
|
||
return RustBridge.DbfMetadata(self.recordCount, self.fieldCount, __fields)
|
||
}
|
||
}
|
||
|
||
/// dBASE field information.
|
||
public struct DbfFieldInfo: Codable, Sendable, Hashable {
|
||
public let name: String
|
||
public let fieldType: String
|
||
public init(name: String, fieldType: String) {
|
||
self.name = name
|
||
self.fieldType = fieldType
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case name = "name"
|
||
case fieldType = "field_type"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DbfFieldInfo
|
||
internal extension DbfFieldInfo {
|
||
init(_ rb: RustBridge.DbfFieldInfoRef) throws {
|
||
self.name = rb.name().toString()
|
||
self.fieldType = rb.fieldType().toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.DbfFieldInfo {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.dbfFieldInfoFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// JATS (Journal Article Tag Suite) metadata.
|
||
public typealias JatsMetadata = RustBridge.JatsMetadata
|
||
|
||
/// JATS contributor with role.
|
||
public struct ContributorRole: Codable, Sendable, Hashable {
|
||
public let name: String
|
||
public let role: String?
|
||
public init(name: String, role: String? = nil) {
|
||
self.name = name
|
||
self.role = role
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ContributorRole
|
||
internal extension ContributorRole {
|
||
init(_ rb: RustBridge.ContributorRoleRef) throws {
|
||
self.name = rb.name().toString()
|
||
self.role = rb.role()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.ContributorRole {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.contributorRoleFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// EPUB metadata (Dublin Core extensions).
|
||
public struct EpubMetadata: Codable, Sendable, Hashable {
|
||
public let coverage: String?
|
||
public let dcFormat: String?
|
||
public let relation: String?
|
||
public let source: String?
|
||
public let dcType: String?
|
||
public let coverImage: String?
|
||
public init(coverage: String? = nil, dcFormat: String? = nil, relation: String? = nil, source: String? = nil, dcType: String? = nil, coverImage: String? = nil) {
|
||
self.coverage = coverage
|
||
self.dcFormat = dcFormat
|
||
self.relation = relation
|
||
self.source = source
|
||
self.dcType = dcType
|
||
self.coverImage = coverImage
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case coverage = "coverage"
|
||
case dcFormat = "dc_format"
|
||
case relation = "relation"
|
||
case source = "source"
|
||
case dcType = "dc_type"
|
||
case coverImage = "cover_image"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.coverage = try container.decodeIfPresent(String.self, forKey: .coverage) ?? nil
|
||
self.dcFormat = try container.decodeIfPresent(String.self, forKey: .dcFormat) ?? nil
|
||
self.relation = try container.decodeIfPresent(String.self, forKey: .relation) ?? nil
|
||
self.source = try container.decodeIfPresent(String.self, forKey: .source) ?? nil
|
||
self.dcType = try container.decodeIfPresent(String.self, forKey: .dcType) ?? nil
|
||
self.coverImage = try container.decodeIfPresent(String.self, forKey: .coverImage) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for EpubMetadata
|
||
internal extension EpubMetadata {
|
||
init(_ rb: RustBridge.EpubMetadataRef) throws {
|
||
self.coverage = rb.coverage()?.toString()
|
||
self.dcFormat = rb.dcFormat()?.toString()
|
||
self.relation = rb.relation()?.toString()
|
||
self.source = rb.source()?.toString()
|
||
self.dcType = rb.dcType()?.toString()
|
||
self.coverImage = rb.coverImage()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.EpubMetadata {
|
||
return RustBridge.EpubMetadata(self.coverage.map(RustString.init), self.dcFormat.map(RustString.init), self.relation.map(RustString.init), self.source.map(RustString.init), self.dcType.map(RustString.init), self.coverImage.map(RustString.init))
|
||
}
|
||
}
|
||
|
||
/// Outlook PST archive metadata.
|
||
public struct PstMetadata: Codable, Sendable, Hashable {
|
||
public let messageCount: UInt
|
||
public init(messageCount: UInt) {
|
||
self.messageCount = messageCount
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case messageCount = "message_count"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.messageCount = try container.decodeIfPresent(UInt.self, forKey: .messageCount) ?? 0
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PstMetadata
|
||
internal extension PstMetadata {
|
||
init(_ rb: RustBridge.PstMetadataRef) throws {
|
||
self.messageCount = rb.messageCount()
|
||
}
|
||
func intoRust() throws -> RustBridge.PstMetadata {
|
||
return RustBridge.PstMetadata(self.messageCount)
|
||
}
|
||
}
|
||
|
||
/// Confidence scores for an OCR element.
|
||
///
|
||
/// Separates detection confidence (how confident that text exists at this location)
|
||
/// from recognition confidence (how confident about the actual text content).
|
||
public struct OcrConfidence: Codable, Sendable, Hashable {
|
||
/// Detection confidence: how confident the OCR engine is that text exists here.
|
||
///
|
||
/// PaddleOCR provides this as `box_score`, Tesseract doesn't have a direct equivalent.
|
||
/// Range: 0.0 to 1.0 (or None if not available).
|
||
public let detection: Double?
|
||
/// Recognition confidence: how confident about the text content.
|
||
///
|
||
/// Range: 0.0 to 1.0.
|
||
public let recognition: Double
|
||
public init(detection: Double? = nil, recognition: Double) {
|
||
self.detection = detection
|
||
self.recognition = recognition
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case detection = "detection"
|
||
case recognition = "recognition"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.detection = try container.decodeIfPresent(Double.self, forKey: .detection) ?? nil
|
||
self.recognition = try container.decodeIfPresent(Double.self, forKey: .recognition) ?? 0
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for OcrConfidence
|
||
internal extension OcrConfidence {
|
||
init(_ rb: RustBridge.OcrConfidenceRef) throws {
|
||
self.detection = rb.detection()
|
||
self.recognition = rb.recognition()
|
||
}
|
||
func intoRust() throws -> RustBridge.OcrConfidence {
|
||
return RustBridge.OcrConfidence(self.detection, self.recognition)
|
||
}
|
||
}
|
||
|
||
/// Rotation information for an OCR element.
|
||
public struct OcrRotation: Codable, Sendable, Hashable {
|
||
/// Rotation angle in degrees (0, 90, 180, 270 for PaddleOCR).
|
||
public let angleDegrees: Double
|
||
/// Confidence score for the rotation detection.
|
||
public let confidence: Double?
|
||
public init(angleDegrees: Double, confidence: Double? = nil) {
|
||
self.angleDegrees = angleDegrees
|
||
self.confidence = confidence
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case angleDegrees = "angle_degrees"
|
||
case confidence = "confidence"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for OcrRotation
|
||
internal extension OcrRotation {
|
||
init(_ rb: RustBridge.OcrRotationRef) throws {
|
||
self.angleDegrees = rb.angleDegrees()
|
||
self.confidence = rb.confidence()
|
||
}
|
||
func intoRust() throws -> RustBridge.OcrRotation {
|
||
return RustBridge.OcrRotation(self.angleDegrees, self.confidence)
|
||
}
|
||
}
|
||
|
||
/// A unified OCR element representing detected text with full metadata.
|
||
///
|
||
/// This is the primary type for structured OCR output, preserving all information
|
||
/// from both Tesseract and PaddleOCR backends.
|
||
public typealias OcrElement = RustBridge.OcrElement
|
||
|
||
/// Configuration for OCR element extraction.
|
||
///
|
||
/// Controls how OCR elements are extracted and filtered.
|
||
public struct OcrElementConfig: Codable, Sendable, Hashable {
|
||
/// Whether to include OCR elements in the extraction result.
|
||
///
|
||
/// When true, the `ocr_elements` field in `ExtractionResult` will be populated.
|
||
public let includeElements: Bool
|
||
/// Minimum hierarchical level to include.
|
||
///
|
||
/// Elements below this level (e.g., words when min_level is Line) will be excluded.
|
||
public let minLevel: OcrElementLevel
|
||
/// Minimum recognition confidence threshold (0.0-1.0).
|
||
///
|
||
/// Elements with confidence below this threshold will be filtered out.
|
||
public let minConfidence: Double
|
||
/// Whether to build hierarchical relationships between elements.
|
||
///
|
||
/// When true, `parent_id` fields will be populated based on spatial containment.
|
||
/// Only meaningful for Tesseract output.
|
||
public let buildHierarchy: Bool
|
||
public init(includeElements: Bool, minLevel: OcrElementLevel, minConfidence: Double, buildHierarchy: Bool) {
|
||
self.includeElements = includeElements
|
||
self.minLevel = minLevel
|
||
self.minConfidence = minConfidence
|
||
self.buildHierarchy = buildHierarchy
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case includeElements = "include_elements"
|
||
case minLevel = "min_level"
|
||
case minConfidence = "min_confidence"
|
||
case buildHierarchy = "build_hierarchy"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.includeElements = try container.decodeIfPresent(Bool.self, forKey: .includeElements) ?? false
|
||
self.minLevel = try container.decode(OcrElementLevel.self, forKey: .minLevel)
|
||
self.minConfidence = try container.decodeIfPresent(Double.self, forKey: .minConfidence) ?? 0
|
||
self.buildHierarchy = try container.decodeIfPresent(Bool.self, forKey: .buildHierarchy) ?? false
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for OcrElementConfig
|
||
internal extension OcrElementConfig {
|
||
init(_ rb: RustBridge.OcrElementConfigRef) throws {
|
||
self.includeElements = rb.includeElements()
|
||
self.minLevel = OcrElementLevel(rawValue: rb.minLevel().toString()) ?? { fatalError("Unknown OcrElementLevel: \(rb.minLevel().toString())") }()
|
||
self.minConfidence = rb.minConfidence()
|
||
self.buildHierarchy = rb.buildHierarchy()
|
||
}
|
||
func intoRust() throws -> RustBridge.OcrElementConfig {
|
||
return RustBridge.OcrElementConfig(self.includeElements, try self.minLevel.intoRust(), self.minConfidence, self.buildHierarchy)
|
||
}
|
||
}
|
||
|
||
/// Unified page structure for documents.
|
||
///
|
||
/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
|
||
/// with character offset boundaries for chunk-to-page mapping.
|
||
public struct PageStructure: Codable, Sendable, Hashable {
|
||
/// Total number of pages/slides/sheets
|
||
public let totalCount: UInt32
|
||
/// Type of paginated unit
|
||
public let unitType: PageUnitType
|
||
/// Character offset boundaries for each page
|
||
///
|
||
/// Maps character ranges in the extracted content to page numbers.
|
||
/// Used for chunk page range calculation.
|
||
public let boundaries: [PageBoundary]?
|
||
/// Detailed per-page metadata (optional, only when needed)
|
||
public let pages: [PageInfo]?
|
||
public init(totalCount: UInt32, unitType: PageUnitType, boundaries: [PageBoundary]? = nil, pages: [PageInfo]? = nil) {
|
||
self.totalCount = totalCount
|
||
self.unitType = unitType
|
||
self.boundaries = boundaries
|
||
self.pages = pages
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case totalCount = "total_count"
|
||
case unitType = "unit_type"
|
||
case boundaries = "boundaries"
|
||
case pages = "pages"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PageStructure
|
||
internal extension PageStructure {
|
||
init(_ rb: RustBridge.PageStructureRef) throws {
|
||
self.totalCount = rb.totalCount()
|
||
self.unitType = PageUnitType(rawValue: rb.unitType().toString()) ?? { fatalError("Unknown PageUnitType: \(rb.unitType().toString())") }()
|
||
self.boundaries = try rb.boundaries()?.map { try PageBoundary($0) }
|
||
self.pages = try rb.pages()?.map { try PageInfo($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.PageStructure {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.pageStructureFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Byte offset boundary for a page.
|
||
///
|
||
/// Tracks where a specific page's content starts and ends in the main content string,
|
||
/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
|
||
/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
|
||
public struct PageBoundary: Codable, Sendable, Hashable {
|
||
/// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
|
||
public let byteStart: UInt
|
||
/// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
|
||
public let byteEnd: UInt
|
||
/// Page number (1-indexed)
|
||
public let pageNumber: UInt32
|
||
public init(byteStart: UInt, byteEnd: UInt, pageNumber: UInt32) {
|
||
self.byteStart = byteStart
|
||
self.byteEnd = byteEnd
|
||
self.pageNumber = pageNumber
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case byteStart = "byte_start"
|
||
case byteEnd = "byte_end"
|
||
case pageNumber = "page_number"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PageBoundary
|
||
internal extension PageBoundary {
|
||
init(_ rb: RustBridge.PageBoundaryRef) throws {
|
||
self.byteStart = rb.byteStart()
|
||
self.byteEnd = rb.byteEnd()
|
||
self.pageNumber = rb.pageNumber()
|
||
}
|
||
func intoRust() throws -> RustBridge.PageBoundary {
|
||
return RustBridge.PageBoundary(self.byteStart, self.byteEnd, self.pageNumber)
|
||
}
|
||
}
|
||
|
||
/// Metadata for individual page/slide/sheet.
|
||
///
|
||
/// Captures per-page information including dimensions, content counts,
|
||
/// and visibility state (for presentations).
|
||
public struct PageInfo: Codable, Sendable, Hashable {
|
||
/// Page number (1-indexed)
|
||
public let number: UInt32
|
||
/// Page title (usually for presentations)
|
||
public let title: String?
|
||
/// Dimensions in points (PDF) or pixels (images): (width, height)
|
||
public let dimensions: [Double]?
|
||
/// Number of images on this page
|
||
public let imageCount: UInt32?
|
||
/// Number of tables on this page
|
||
public let tableCount: UInt32?
|
||
/// Whether this page is hidden (e.g., in presentations)
|
||
public let hidden: Bool?
|
||
/// Whether this page is blank (no meaningful text, no images, no tables)
|
||
///
|
||
/// A page is considered blank if it has fewer than 3 non-whitespace characters
|
||
/// and contains no tables or images. This is useful for filtering out empty pages
|
||
/// in scanned documents or PDFs with blank separator pages.
|
||
public let isBlank: Bool?
|
||
/// Whether this page contains non-trivial vector graphics (paths, shapes, curves)
|
||
///
|
||
/// Indicates the presence of vector-drawn content such as charts, diagrams,
|
||
/// or geometric shapes (e.g., from Adobe InDesign, LaTeX TikZ). These are
|
||
/// invisible to `ExtractionResult.images` since they are not embedded as raster
|
||
/// XObjects. Set to `true` when path count exceeds a heuristic threshold,
|
||
/// signaling that downstream consumers may want to rasterize the page to
|
||
/// capture this content.
|
||
///
|
||
/// Only populated for PDFs; `None` for other document types.
|
||
public let hasVectorGraphics: Bool
|
||
public init(number: UInt32, title: String? = nil, dimensions: [Double]? = nil, imageCount: UInt32? = nil, tableCount: UInt32? = nil, hidden: Bool? = nil, isBlank: Bool? = nil, hasVectorGraphics: Bool) {
|
||
self.number = number
|
||
self.title = title
|
||
self.dimensions = dimensions
|
||
self.imageCount = imageCount
|
||
self.tableCount = tableCount
|
||
self.hidden = hidden
|
||
self.isBlank = isBlank
|
||
self.hasVectorGraphics = hasVectorGraphics
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case number = "number"
|
||
case title = "title"
|
||
case dimensions = "dimensions"
|
||
case imageCount = "image_count"
|
||
case tableCount = "table_count"
|
||
case hidden = "hidden"
|
||
case isBlank = "is_blank"
|
||
case hasVectorGraphics = "has_vector_graphics"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PageInfo
|
||
internal extension PageInfo {
|
||
init(_ rb: RustBridge.PageInfoRef) throws {
|
||
self.number = rb.number()
|
||
self.title = rb.title()?.toString()
|
||
self.dimensions = rb.dimensions().map { Array($0) }
|
||
self.imageCount = rb.imageCount()
|
||
self.tableCount = rb.tableCount()
|
||
self.hidden = rb.hidden()
|
||
self.isBlank = rb.isBlank()
|
||
self.hasVectorGraphics = rb.hasVectorGraphics()
|
||
}
|
||
func intoRust() throws -> RustBridge.PageInfo {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.pageInfoFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Content for a single page/slide.
|
||
///
|
||
/// When page extraction is enabled, documents are split into per-page content
|
||
/// with associated tables and images mapped to each page.
|
||
///
|
||
/// # Performance
|
||
///
|
||
/// Uses Arc-wrapped tables and images for memory efficiency:
|
||
/// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
|
||
/// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
|
||
/// - Maintains exact JSON compatibility via custom Serialize/Deserialize
|
||
///
|
||
/// This reduces memory overhead for documents with shared tables/images
|
||
/// by avoiding redundant copies during serialization.
|
||
public struct PageContent: Codable, Sendable, Hashable {
|
||
/// Page number (1-indexed)
|
||
public let pageNumber: UInt32
|
||
/// Text content for this page
|
||
public let content: String
|
||
/// Tables found on this page (uses Arc for memory efficiency)
|
||
///
|
||
/// Serializes as Vec<Table> for JSON compatibility while maintaining
|
||
/// Arc semantics in-memory for zero-copy sharing.
|
||
public let tables: [Table]
|
||
/// Indices into `ExtractionResult.images` for images found on this page.
|
||
///
|
||
/// Each value is a zero-based index into the top-level `images` collection.
|
||
/// Only populated when `extract_images = true` in the extraction config.
|
||
public let imageIndices: [UInt32]
|
||
/// Hierarchy information for the page (when hierarchy extraction is enabled)
|
||
///
|
||
/// Contains text hierarchy levels (H1-H6) extracted from the page content.
|
||
public let hierarchy: PageHierarchy?
|
||
/// Whether this page is blank (no meaningful text content)
|
||
///
|
||
/// Determined during extraction based on text content analysis.
|
||
/// A page is blank if it has fewer than 3 non-whitespace characters
|
||
/// and contains no tables or images.
|
||
public let isBlank: Bool?
|
||
/// Layout detection regions for this page (when layout detection is enabled).
|
||
///
|
||
/// Contains detected layout regions with class, confidence, bounding box,
|
||
/// and area fraction. Only populated when layout detection is configured.
|
||
public let layoutRegions: [LayoutRegion]?
|
||
/// Speaker notes for this slide (PPTX only).
|
||
///
|
||
/// Contains the text from the slide's notes pane (`ppt/notesSlides/notesSlide{N}.xml`).
|
||
/// Only populated when the source is a PPTX file and notes are present.
|
||
public let speakerNotes: String?
|
||
/// Section name this slide belongs to (PPTX only).
|
||
///
|
||
/// PowerPoint sections group slides into logical chapters (`<p:sectionLst>` in
|
||
/// `ppt/presentation.xml`). Only populated when the source is a PPTX file and
|
||
/// the slide belongs to a named section.
|
||
public let sectionName: String?
|
||
/// Sheet name for this page (XLSX/ODS only).
|
||
///
|
||
/// Each spreadsheet sheet maps to one `PageContent` entry. This field carries the
|
||
/// sheet's display name as it appears in the workbook. `None` for all non-spreadsheet
|
||
/// formats and for sheets with an empty name.
|
||
public let sheetName: String?
|
||
public init(pageNumber: UInt32, content: String, tables: [Table], imageIndices: [UInt32], hierarchy: PageHierarchy? = nil, isBlank: Bool? = nil, layoutRegions: [LayoutRegion]? = nil, speakerNotes: String? = nil, sectionName: String? = nil, sheetName: String? = nil) {
|
||
self.pageNumber = pageNumber
|
||
self.content = content
|
||
self.tables = tables
|
||
self.imageIndices = imageIndices
|
||
self.hierarchy = hierarchy
|
||
self.isBlank = isBlank
|
||
self.layoutRegions = layoutRegions
|
||
self.speakerNotes = speakerNotes
|
||
self.sectionName = sectionName
|
||
self.sheetName = sheetName
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case pageNumber = "page_number"
|
||
case content = "content"
|
||
case tables = "tables"
|
||
case imageIndices = "image_indices"
|
||
case hierarchy = "hierarchy"
|
||
case isBlank = "is_blank"
|
||
case layoutRegions = "layout_regions"
|
||
case speakerNotes = "speaker_notes"
|
||
case sectionName = "section_name"
|
||
case sheetName = "sheet_name"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PageContent
|
||
internal extension PageContent {
|
||
init(_ rb: RustBridge.PageContentRef) throws {
|
||
self.pageNumber = rb.pageNumber()
|
||
self.content = rb.content().toString()
|
||
self.tables = try rb.tables().map { try Table($0) }
|
||
self.imageIndices = Array(rb.imageIndices())
|
||
self.hierarchy = try rb.hierarchy().map { try PageHierarchy($0) }
|
||
self.isBlank = rb.isBlank()
|
||
self.layoutRegions = try rb.layoutRegions()?.map { try LayoutRegion($0) }
|
||
self.speakerNotes = rb.speakerNotes()?.toString()
|
||
self.sectionName = rb.sectionName()?.toString()
|
||
self.sheetName = rb.sheetName()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.PageContent {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.pageContentFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// A detected layout region on a page.
|
||
///
|
||
/// When layout detection is enabled, each page may have layout regions
|
||
/// identifying different content types (text, pictures, tables, etc.)
|
||
/// with confidence scores and spatial positions.
|
||
public struct LayoutRegion: Codable, Sendable, Hashable {
|
||
/// Layout class name (e.g. "picture", "table", "text", "section_header").
|
||
public let className: String
|
||
/// Confidence score from the layout detection model (0.0 to 1.0).
|
||
public let confidence: Double
|
||
/// Bounding box in document coordinate space.
|
||
public let boundingBox: BoundingBox
|
||
/// Fraction of the page area covered by this region (0.0 to 1.0).
|
||
public let areaFraction: Double
|
||
public init(className: String, confidence: Double, boundingBox: BoundingBox, areaFraction: Double) {
|
||
self.className = className
|
||
self.confidence = confidence
|
||
self.boundingBox = boundingBox
|
||
self.areaFraction = areaFraction
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case className = "class_name"
|
||
case confidence = "confidence"
|
||
case boundingBox = "bounding_box"
|
||
case areaFraction = "area_fraction"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.className = try container.decodeIfPresent(String.self, forKey: .className) ?? ""
|
||
self.confidence = try container.decodeIfPresent(Double.self, forKey: .confidence) ?? 0
|
||
self.boundingBox = try container.decode(BoundingBox.self, forKey: .boundingBox)
|
||
self.areaFraction = try container.decodeIfPresent(Double.self, forKey: .areaFraction) ?? 0
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for LayoutRegion
|
||
internal extension LayoutRegion {
|
||
init(_ rb: RustBridge.LayoutRegionRef) throws {
|
||
self.className = rb.className().toString()
|
||
self.confidence = rb.confidence()
|
||
self.boundingBox = try BoundingBox(rb.boundingBox())
|
||
self.areaFraction = rb.areaFraction()
|
||
}
|
||
func intoRust() throws -> RustBridge.LayoutRegion {
|
||
return RustBridge.LayoutRegion(RustString(self.className), self.confidence, try self.boundingBox.intoRust(), self.areaFraction)
|
||
}
|
||
}
|
||
|
||
/// Page hierarchy structure containing heading levels and block information.
|
||
///
|
||
/// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
|
||
/// blocks with heading levels (H1-H6) for semantic document structure.
|
||
public struct PageHierarchy: Codable, Sendable, Hashable {
|
||
/// Number of hierarchy blocks on this page
|
||
public let blockCount: UInt32
|
||
/// Hierarchical blocks with heading levels
|
||
public let blocks: [HierarchicalBlock]
|
||
public init(blockCount: UInt32, blocks: [HierarchicalBlock]) {
|
||
self.blockCount = blockCount
|
||
self.blocks = blocks
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case blockCount = "block_count"
|
||
case blocks = "blocks"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PageHierarchy
|
||
internal extension PageHierarchy {
|
||
init(_ rb: RustBridge.PageHierarchyRef) throws {
|
||
self.blockCount = rb.blockCount()
|
||
self.blocks = try rb.blocks().map { try HierarchicalBlock($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.PageHierarchy {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.pageHierarchyFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// A text block with hierarchy level assignment.
|
||
///
|
||
/// Represents a block of text with semantic heading information extracted from
|
||
/// font size clustering and hierarchical analysis.
|
||
public struct HierarchicalBlock: Codable, Sendable, Hashable {
|
||
/// The text content of this block
|
||
public let text: String
|
||
/// The font size of the text in this block
|
||
public let fontSize: Float
|
||
/// The hierarchy level of this block (H1-H6 or Body)
|
||
///
|
||
/// Levels correspond to HTML heading tags:
|
||
/// - "h1": Top-level heading
|
||
/// - "h2": Secondary heading
|
||
/// - "h3": Tertiary heading
|
||
/// - "h4": Quaternary heading
|
||
/// - "h5": Quinary heading
|
||
/// - "h6": Senary heading
|
||
/// - "body": Body text (no heading level)
|
||
public let level: String
|
||
/// Bounding box information for the block
|
||
///
|
||
/// Contains coordinates as (left, top, right, bottom) in PDF units.
|
||
public let bbox: [Float]?
|
||
public init(text: String, fontSize: Float, level: String, bbox: [Float]? = nil) {
|
||
self.text = text
|
||
self.fontSize = fontSize
|
||
self.level = level
|
||
self.bbox = bbox
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case text = "text"
|
||
case fontSize = "font_size"
|
||
case level = "level"
|
||
case bbox = "bbox"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for HierarchicalBlock
|
||
internal extension HierarchicalBlock {
|
||
init(_ rb: RustBridge.HierarchicalBlockRef) throws {
|
||
self.text = rb.text().toString()
|
||
self.fontSize = rb.fontSize()
|
||
self.level = rb.level().toString()
|
||
self.bbox = rb.bbox().map { Array($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.HierarchicalBlock {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.hierarchicalBlockFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// A single changed cell within a table.
|
||
///
|
||
/// Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
|
||
/// reference it unconditionally, without requiring the `diff` Cargo feature.
|
||
/// `crate::diff` re-exports this type verbatim.
|
||
public struct CellChange: Codable, Sendable, Hashable {
|
||
/// Zero-based row index.
|
||
public let row: UInt
|
||
/// Zero-based column index.
|
||
public let col: UInt
|
||
/// Value before the change.
|
||
public let from: String
|
||
/// Value after the change.
|
||
public let to: String
|
||
public init(row: UInt, col: UInt, from: String, to: String) {
|
||
self.row = row
|
||
self.col = col
|
||
self.from = from
|
||
self.to = to
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for CellChange
|
||
internal extension CellChange {
|
||
init(_ rb: RustBridge.CellChangeRef) throws {
|
||
self.row = rb.row()
|
||
self.col = rb.col()
|
||
self.from = rb.from().toString()
|
||
self.to = rb.to().toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.CellChange {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.cellChangeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// A single tracked change embedded in a document.
|
||
///
|
||
/// Populated by per-format extractors that understand change-tracking metadata
|
||
/// (DOCX `w:ins`/`w:del`/`w:rPrChange`, ODT `text:change-*`, …). Every
|
||
/// extractor defaults to `ExtractionResult.revisions = None` until a
|
||
/// format-specific implementation is added.
|
||
public struct DocumentRevision: Codable, Sendable, Hashable {
|
||
/// Format-specific revision identifier.
|
||
///
|
||
/// For DOCX this is the `w:id` attribute value on the change element
|
||
/// (e.g. `"42"`). When the attribute is absent a synthetic fallback is
|
||
/// generated (`"docx-ins-0"`, `"docx-del-3"`, …).
|
||
public let revisionId: String
|
||
/// Display name of the author who made this change, when available.
|
||
public let author: String?
|
||
/// ISO-8601 timestamp of the change, when available.
|
||
///
|
||
/// Stored as a plain string so this type remains FFI-friendly and
|
||
/// unconditionally available without the `chrono` optional dep.
|
||
/// DOCX populates this from the `w:date` attribute (e.g.
|
||
/// `"2024-03-15T10:30:00Z"`).
|
||
public let timestamp: String?
|
||
/// Semantic kind of this revision.
|
||
public let kind: RevisionKind
|
||
/// Best-effort document location for this revision.
|
||
///
|
||
/// Resolution is format-dependent and may be `None` when the location
|
||
/// cannot be determined (e.g. changes inside table cells before
|
||
/// table-cell anchor support is added).
|
||
public let anchor: RevisionAnchor?
|
||
/// The content changes that make up this revision.
|
||
public let delta: RevisionDelta
|
||
public init(revisionId: String, author: String? = nil, timestamp: String? = nil, kind: RevisionKind, anchor: RevisionAnchor? = nil, delta: RevisionDelta) {
|
||
self.revisionId = revisionId
|
||
self.author = author
|
||
self.timestamp = timestamp
|
||
self.kind = kind
|
||
self.anchor = anchor
|
||
self.delta = delta
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case revisionId = "revision_id"
|
||
case author = "author"
|
||
case timestamp = "timestamp"
|
||
case kind = "kind"
|
||
case anchor = "anchor"
|
||
case delta = "delta"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DocumentRevision
|
||
internal extension DocumentRevision {
|
||
init(_ rb: RustBridge.DocumentRevisionRef) throws {
|
||
self.revisionId = rb.revisionId().toString()
|
||
self.author = rb.author()?.toString()
|
||
self.timestamp = rb.timestamp()?.toString()
|
||
self.kind = RevisionKind(rawValue: rb.kind().toString()) ?? { fatalError("Unknown RevisionKind: \(rb.kind().toString())") }()
|
||
self.anchor = try JSONDecoder().decode(RevisionAnchor?.self, from: ((rb.anchor()?.toString() ?? "null").data(using: .utf8) ?? Data("null".utf8)))
|
||
self.delta = try RevisionDelta(rb.delta())
|
||
}
|
||
func intoRust() throws -> RustBridge.DocumentRevision {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.documentRevisionFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// The content changes that make up a single revision.
|
||
///
|
||
/// For insertions and deletions the `content` field carries the added/removed
|
||
/// lines as `DiffLine::Added` / `DiffLine::Removed` entries. For format
|
||
/// changes, `content` is empty — the property diff is left as a TODO for a
|
||
/// later enrichment pass.
|
||
public struct RevisionDelta: Codable, Sendable, Hashable {
|
||
/// Line-level content changes for this revision.
|
||
public let content: [DiffLine]
|
||
/// Cell-level table changes for this revision.
|
||
public let tableChanges: [CellChange]
|
||
public init(content: [DiffLine], tableChanges: [CellChange]) {
|
||
self.content = content
|
||
self.tableChanges = tableChanges
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case content = "content"
|
||
case tableChanges = "table_changes"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.content = try container.decodeIfPresent([DiffLine].self, forKey: .content) ?? []
|
||
self.tableChanges = try container.decodeIfPresent([CellChange].self, forKey: .tableChanges) ?? []
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for RevisionDelta
|
||
internal extension RevisionDelta {
|
||
init(_ rb: RustBridge.RevisionDeltaRef) throws {
|
||
self.content = try rb.content().map { (s: RustStringRef) -> DiffLine in let d = s.as_str().toString().data(using: .utf8) ?? Data(); return try JSONDecoder().decode(DiffLine.self, from: d) }
|
||
self.tableChanges = try rb.tableChanges().map { try CellChange($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.RevisionDelta {
|
||
let __content = RustVec<RustBridge.DiffLine>()
|
||
for __elem in self.content { __content.push(value: try __elem.intoRust()) }
|
||
let __tableChanges = RustVec<RustBridge.CellChange>()
|
||
for __elem in self.tableChanges { __tableChanges.push(value: try __elem.intoRust()) }
|
||
return RustBridge.RevisionDelta(__content, __tableChanges)
|
||
}
|
||
}
|
||
|
||
/// Extracted table structure.
|
||
///
|
||
/// Represents a table detected and extracted from a document (PDF, image, etc.).
|
||
/// Tables are converted to both structured cell data and Markdown format.
|
||
public struct Table: Codable, Sendable, Hashable {
|
||
/// Table cells as a 2D vector (rows × columns)
|
||
public let cells: [[String]]
|
||
/// Markdown representation of the table
|
||
public let markdown: String
|
||
/// Page number where the table was found (1-indexed)
|
||
public let pageNumber: UInt32
|
||
/// Bounding box of the table on the page (PDF coordinates: x0=left, y0=bottom, x1=right, y1=top).
|
||
/// Only populated for PDF-extracted tables when position data is available.
|
||
public let boundingBox: BoundingBox?
|
||
public init(cells: [[String]], markdown: String, pageNumber: UInt32, boundingBox: BoundingBox? = nil) {
|
||
self.cells = cells
|
||
self.markdown = markdown
|
||
self.pageNumber = pageNumber
|
||
self.boundingBox = boundingBox
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case cells = "cells"
|
||
case markdown = "markdown"
|
||
case pageNumber = "page_number"
|
||
case boundingBox = "bounding_box"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.cells = try container.decodeIfPresent([[String]].self, forKey: .cells) ?? []
|
||
self.markdown = try container.decodeIfPresent(String.self, forKey: .markdown) ?? ""
|
||
self.pageNumber = try container.decodeIfPresent(UInt32.self, forKey: .pageNumber) ?? 0
|
||
self.boundingBox = try container.decodeIfPresent(BoundingBox.self, forKey: .boundingBox) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for Table
|
||
internal extension Table {
|
||
init(_ rb: RustBridge.TableRef) throws {
|
||
self.cells = try JSONDecoder().decode([[String]].self, from: ((rb.cells().toString()).data(using: .utf8) ?? Data("null".utf8)))
|
||
self.markdown = rb.markdown().toString()
|
||
self.pageNumber = rb.pageNumber()
|
||
self.boundingBox = try rb.boundingBox().map { try BoundingBox($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.Table {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.tableFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Individual table cell with content and optional styling.
|
||
///
|
||
/// Future extension point for rich table support with cell-level metadata.
|
||
public struct TableCell: Codable, Sendable, Hashable {
|
||
/// Cell content as text
|
||
public let content: String
|
||
/// Row span (number of rows this cell spans)
|
||
public let rowSpan: UInt32
|
||
/// Column span (number of columns this cell spans)
|
||
public let colSpan: UInt32
|
||
/// Whether this is a header cell
|
||
public let isHeader: Bool
|
||
public init(content: String, rowSpan: UInt32, colSpan: UInt32, isHeader: Bool) {
|
||
self.content = content
|
||
self.rowSpan = rowSpan
|
||
self.colSpan = colSpan
|
||
self.isHeader = isHeader
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case content = "content"
|
||
case rowSpan = "row_span"
|
||
case colSpan = "col_span"
|
||
case isHeader = "is_header"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.content = try container.decodeIfPresent(String.self, forKey: .content) ?? ""
|
||
self.rowSpan = try container.decodeIfPresent(UInt32.self, forKey: .rowSpan) ?? 0
|
||
self.colSpan = try container.decodeIfPresent(UInt32.self, forKey: .colSpan) ?? 0
|
||
self.isHeader = try container.decodeIfPresent(Bool.self, forKey: .isHeader) ?? false
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for TableCell
|
||
internal extension TableCell {
|
||
init(_ rb: RustBridge.TableCellRef) throws {
|
||
self.content = rb.content().toString()
|
||
self.rowSpan = rb.rowSpan()
|
||
self.colSpan = rb.colSpan()
|
||
self.isHeader = rb.isHeader()
|
||
}
|
||
func intoRust() throws -> RustBridge.TableCell {
|
||
return RustBridge.TableCell(RustString(self.content), self.rowSpan, self.colSpan, self.isHeader)
|
||
}
|
||
}
|
||
|
||
/// A URI extracted from a document.
|
||
///
|
||
/// Represents any link, reference, or resource pointer found during extraction.
|
||
/// The `kind` field classifies the URI semantically, while `label` carries
|
||
/// optional human-readable display text.
|
||
public struct ExtractedUri: Codable, Sendable, Hashable {
|
||
/// The URL or path string.
|
||
public let url: String
|
||
/// Optional display text / label for the link.
|
||
public let label: String?
|
||
/// Optional page number where the URI was found (1-indexed).
|
||
public let page: UInt32?
|
||
/// Semantic classification of the URI.
|
||
public let kind: UriKind
|
||
public init(url: String, label: String? = nil, page: UInt32? = nil, kind: UriKind) {
|
||
self.url = url
|
||
self.label = label
|
||
self.page = page
|
||
self.kind = kind
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for ExtractedUri
|
||
internal extension ExtractedUri {
|
||
init(_ rb: RustBridge.ExtractedUriRef) throws {
|
||
self.url = rb.url().toString()
|
||
self.label = rb.label()?.toString()
|
||
self.page = rb.page()
|
||
self.kind = UriKind(rawValue: rb.kind().toString()) ?? { fatalError("Unknown UriKind: \(rb.kind().toString())") }()
|
||
}
|
||
func intoRust() throws -> RustBridge.ExtractedUri {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.extractedUriFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// MIME type detection response.
|
||
public struct DetectResponse: Codable, Sendable, Hashable {
|
||
/// Detected MIME type
|
||
public let mimeType: String
|
||
/// Original filename (if provided)
|
||
public let filename: String?
|
||
public init(mimeType: String, filename: String? = nil) {
|
||
self.mimeType = mimeType
|
||
self.filename = filename
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case mimeType = "mime_type"
|
||
case filename = "filename"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DetectResponse
|
||
internal extension DetectResponse {
|
||
init(_ rb: RustBridge.DetectResponseRef) throws {
|
||
self.mimeType = rb.mimeType().toString()
|
||
self.filename = rb.filename()?.toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.DetectResponse {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.detectResponseFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Options controlling how two `ExtractionResult` values are compared.
|
||
public struct DiffOptions: Codable, Sendable, Hashable {
|
||
/// Include metadata changes in the diff. Default: `true`.
|
||
public let includeMetadata: Bool
|
||
/// Include embedded-children changes in the diff. Default: `true`.
|
||
public let includeEmbedded: Bool
|
||
/// Truncate content to this many characters before diffing.
|
||
///
|
||
/// Useful for very large documents where only the first N characters matter.
|
||
/// `None` means no truncation.
|
||
public let maxContentChars: UInt?
|
||
public init(includeMetadata: Bool, includeEmbedded: Bool, maxContentChars: UInt? = nil) {
|
||
self.includeMetadata = includeMetadata
|
||
self.includeEmbedded = includeEmbedded
|
||
self.maxContentChars = maxContentChars
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case includeMetadata = "include_metadata"
|
||
case includeEmbedded = "include_embedded"
|
||
case maxContentChars = "max_content_chars"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.includeMetadata = try container.decodeIfPresent(Bool.self, forKey: .includeMetadata) ?? true
|
||
self.includeEmbedded = try container.decodeIfPresent(Bool.self, forKey: .includeEmbedded) ?? true
|
||
self.maxContentChars = try container.decodeIfPresent(UInt.self, forKey: .maxContentChars) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DiffOptions
|
||
internal extension DiffOptions {
|
||
init(_ rb: RustBridge.DiffOptionsRef) throws {
|
||
self.includeMetadata = rb.includeMetadata()
|
||
self.includeEmbedded = rb.includeEmbedded()
|
||
self.maxContentChars = rb.maxContentChars()
|
||
}
|
||
func intoRust() throws -> RustBridge.DiffOptions {
|
||
return RustBridge.DiffOptions(self.includeMetadata, self.includeEmbedded, self.maxContentChars)
|
||
}
|
||
}
|
||
|
||
/// The complete diff between two `ExtractionResult` values.
|
||
public typealias ExtractionDiff = RustBridge.ExtractionDiff
|
||
|
||
/// A single contiguous hunk in a unified diff.
|
||
public struct DiffHunk: Codable, Sendable, Hashable {
|
||
/// Starting line number in the old content (0-indexed).
|
||
public let fromLine: UInt
|
||
/// Number of lines from the old content in this hunk.
|
||
public let fromCount: UInt
|
||
/// Starting line number in the new content (0-indexed).
|
||
public let toLine: UInt
|
||
/// Number of lines from the new content in this hunk.
|
||
public let toCount: UInt
|
||
/// Lines that make up this hunk.
|
||
public let lines: [DiffLine]
|
||
public init(fromLine: UInt, fromCount: UInt, toLine: UInt, toCount: UInt, lines: [DiffLine]) {
|
||
self.fromLine = fromLine
|
||
self.fromCount = fromCount
|
||
self.toLine = toLine
|
||
self.toCount = toCount
|
||
self.lines = lines
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case fromLine = "from_line"
|
||
case fromCount = "from_count"
|
||
case toLine = "to_line"
|
||
case toCount = "to_count"
|
||
case lines = "lines"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DiffHunk
|
||
internal extension DiffHunk {
|
||
init(_ rb: RustBridge.DiffHunkRef) throws {
|
||
self.fromLine = rb.fromLine()
|
||
self.fromCount = rb.fromCount()
|
||
self.toLine = rb.toLine()
|
||
self.toCount = rb.toCount()
|
||
self.lines = try rb.lines().map { (s: RustStringRef) -> DiffLine in let d = s.as_str().toString().data(using: .utf8) ?? Data(); return try JSONDecoder().decode(DiffLine.self, from: d) }
|
||
}
|
||
func intoRust() throws -> RustBridge.DiffHunk {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.diffHunkFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Cell-level changes for a pair of tables that share the same index.
|
||
public struct TableDiff: Codable, Sendable, Hashable {
|
||
/// Zero-based index of the table in both `a.tables` and `b.tables`.
|
||
public let fromIndex: UInt
|
||
/// Zero-based index in `b.tables` (equal to `from_index` for same-dimension tables).
|
||
public let toIndex: UInt
|
||
/// Cell-level changes within the table.
|
||
public let cellChanges: [CellChange]
|
||
public init(fromIndex: UInt, toIndex: UInt, cellChanges: [CellChange]) {
|
||
self.fromIndex = fromIndex
|
||
self.toIndex = toIndex
|
||
self.cellChanges = cellChanges
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case fromIndex = "from_index"
|
||
case toIndex = "to_index"
|
||
case cellChanges = "cell_changes"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for TableDiff
|
||
internal extension TableDiff {
|
||
init(_ rb: RustBridge.TableDiffRef) throws {
|
||
self.fromIndex = rb.fromIndex()
|
||
self.toIndex = rb.toIndex()
|
||
self.cellChanges = try rb.cellChanges().map { try CellChange($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.TableDiff {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.tableDiffFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Changes to embedded archive children between two results.
|
||
public typealias EmbeddedChanges = RustBridge.EmbeddedChanges
|
||
|
||
/// Diff for a single embedded archive entry that appears in both results.
|
||
public typealias EmbeddedDiff = RustBridge.EmbeddedDiff
|
||
|
||
/// Preset configurations for common RAG use cases.
|
||
///
|
||
/// Each preset combines chunk size, overlap, and embedding model
|
||
/// to provide an optimized configuration for specific scenarios.
|
||
///
|
||
/// All string fields are owned `String` for FFI compatibility — instances
|
||
/// are safe to clone and pass across language boundaries.
|
||
public struct EmbeddingPreset: Codable, Sendable, Hashable {
|
||
public let name: String
|
||
public let chunkSize: UInt
|
||
public let overlap: UInt
|
||
/// HuggingFace repository name for the model.
|
||
public let modelRepo: String
|
||
/// Pooling strategy: "cls" or "mean".
|
||
public let pooling: String
|
||
/// Path to the ONNX model file within the repo.
|
||
public let modelFile: String
|
||
public let dimensions: UInt
|
||
public let description: String
|
||
public init(name: String, chunkSize: UInt, overlap: UInt, modelRepo: String, pooling: String, modelFile: String, dimensions: UInt, description: String) {
|
||
self.name = name
|
||
self.chunkSize = chunkSize
|
||
self.overlap = overlap
|
||
self.modelRepo = modelRepo
|
||
self.pooling = pooling
|
||
self.modelFile = modelFile
|
||
self.dimensions = dimensions
|
||
self.description = description
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case name = "name"
|
||
case chunkSize = "chunk_size"
|
||
case overlap = "overlap"
|
||
case modelRepo = "model_repo"
|
||
case pooling = "pooling"
|
||
case modelFile = "model_file"
|
||
case dimensions = "dimensions"
|
||
case description = "description"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for EmbeddingPreset
|
||
internal extension EmbeddingPreset {
|
||
init(_ rb: RustBridge.EmbeddingPresetRef) throws {
|
||
self.name = rb.name().toString()
|
||
self.chunkSize = rb.chunkSize()
|
||
self.overlap = rb.overlap()
|
||
self.modelRepo = rb.modelRepo().toString()
|
||
self.pooling = rb.pooling().toString()
|
||
self.modelFile = rb.modelFile().toString()
|
||
self.dimensions = rb.dimensions()
|
||
self.description = rb.description().toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.EmbeddingPreset {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.embeddingPresetFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// YAKE-specific parameters.
|
||
public struct YakeParams: Codable, Sendable, Hashable {
|
||
/// Window size for co-occurrence analysis (default: 2).
|
||
///
|
||
/// Controls the context window for computing co-occurrence statistics.
|
||
public let windowSize: UInt
|
||
public init(windowSize: UInt) {
|
||
self.windowSize = windowSize
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case windowSize = "window_size"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.windowSize = try container.decodeIfPresent(UInt.self, forKey: .windowSize) ?? 2
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for YakeParams
|
||
internal extension YakeParams {
|
||
init(_ rb: RustBridge.YakeParamsRef) throws {
|
||
self.windowSize = rb.windowSize()
|
||
}
|
||
func intoRust() throws -> RustBridge.YakeParams {
|
||
return RustBridge.YakeParams(self.windowSize)
|
||
}
|
||
}
|
||
|
||
/// RAKE-specific parameters.
|
||
public struct RakeParams: Codable, Sendable, Hashable {
|
||
/// Minimum word length to consider (default: 1).
|
||
public let minWordLength: UInt
|
||
/// Maximum words in a keyword phrase (default: 3).
|
||
public let maxWordsPerPhrase: UInt
|
||
public init(minWordLength: UInt, maxWordsPerPhrase: UInt) {
|
||
self.minWordLength = minWordLength
|
||
self.maxWordsPerPhrase = maxWordsPerPhrase
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case minWordLength = "min_word_length"
|
||
case maxWordsPerPhrase = "max_words_per_phrase"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.minWordLength = try container.decodeIfPresent(UInt.self, forKey: .minWordLength) ?? 1
|
||
self.maxWordsPerPhrase = try container.decodeIfPresent(UInt.self, forKey: .maxWordsPerPhrase) ?? 3
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for RakeParams
|
||
internal extension RakeParams {
|
||
init(_ rb: RustBridge.RakeParamsRef) throws {
|
||
self.minWordLength = rb.minWordLength()
|
||
self.maxWordsPerPhrase = rb.maxWordsPerPhrase()
|
||
}
|
||
func intoRust() throws -> RustBridge.RakeParams {
|
||
return RustBridge.RakeParams(self.minWordLength, self.maxWordsPerPhrase)
|
||
}
|
||
}
|
||
|
||
/// Keyword extraction configuration.
|
||
public struct KeywordConfig: Codable, Sendable, Hashable {
|
||
/// Algorithm to use for extraction.
|
||
public let algorithm: KeywordAlgorithm
|
||
/// Maximum number of keywords to extract (default: 10).
|
||
public let maxKeywords: UInt
|
||
/// Minimum score threshold (0.0-1.0, default: 0.0).
|
||
///
|
||
/// Keywords with scores below this threshold are filtered out.
|
||
/// Note: Score ranges differ between algorithms.
|
||
public let minScore: Float
|
||
/// N-gram range for keyword extraction (min, max).
|
||
///
|
||
/// (1, 1) = unigrams only
|
||
/// (1, 2) = unigrams and bigrams
|
||
/// (1, 3) = unigrams, bigrams, and trigrams (default)
|
||
public let ngramRange: [UInt]
|
||
/// Language code for stopword filtering (e.g., "en", "de", "fr").
|
||
///
|
||
/// If None, no stopword filtering is applied.
|
||
public let language: String?
|
||
/// YAKE-specific tuning parameters.
|
||
public let yakeParams: YakeParams?
|
||
/// RAKE-specific tuning parameters.
|
||
public let rakeParams: RakeParams?
|
||
public init(algorithm: KeywordAlgorithm, maxKeywords: UInt, minScore: Float, ngramRange: [UInt], language: String? = nil, yakeParams: YakeParams? = nil, rakeParams: RakeParams? = nil) {
|
||
self.algorithm = algorithm
|
||
self.maxKeywords = maxKeywords
|
||
self.minScore = minScore
|
||
self.ngramRange = ngramRange
|
||
self.language = language
|
||
self.yakeParams = yakeParams
|
||
self.rakeParams = rakeParams
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case algorithm = "algorithm"
|
||
case maxKeywords = "max_keywords"
|
||
case minScore = "min_score"
|
||
case ngramRange = "ngram_range"
|
||
case language = "language"
|
||
case yakeParams = "yake_params"
|
||
case rakeParams = "rake_params"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.algorithm = try container.decode(KeywordAlgorithm.self, forKey: .algorithm)
|
||
self.maxKeywords = try container.decodeIfPresent(UInt.self, forKey: .maxKeywords) ?? 10
|
||
self.minScore = try container.decodeIfPresent(Float.self, forKey: .minScore) ?? 0.0
|
||
self.ngramRange = try container.decodeIfPresent([UInt].self, forKey: .ngramRange) ?? []
|
||
self.language = try container.decodeIfPresent(String.self, forKey: .language) ?? nil
|
||
self.yakeParams = try container.decodeIfPresent(YakeParams.self, forKey: .yakeParams) ?? nil
|
||
self.rakeParams = try container.decodeIfPresent(RakeParams.self, forKey: .rakeParams) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for KeywordConfig
|
||
internal extension KeywordConfig {
|
||
init(_ rb: RustBridge.KeywordConfigRef) throws {
|
||
self.algorithm = KeywordAlgorithm(rawValue: rb.algorithm().toString()) ?? { fatalError("Unknown KeywordAlgorithm: \(rb.algorithm().toString())") }()
|
||
self.maxKeywords = rb.maxKeywords()
|
||
self.minScore = rb.minScore()
|
||
self.ngramRange = Array(rb.ngramRange())
|
||
self.language = rb.language()?.toString()
|
||
self.yakeParams = try rb.yakeParams().map { try YakeParams($0) }
|
||
self.rakeParams = try rb.rakeParams().map { try RakeParams($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.KeywordConfig {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.keywordConfigFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Extracted keyword with metadata.
|
||
public struct Keyword: Codable, Sendable, Hashable {
|
||
/// The keyword text.
|
||
public let text: String
|
||
/// Relevance score (higher is better, algorithm-specific range).
|
||
public let score: Float
|
||
/// Algorithm that extracted this keyword.
|
||
public let algorithm: KeywordAlgorithm
|
||
/// Optional positions where keyword appears in text (character offsets).
|
||
public let positions: [UInt]?
|
||
public init(text: String, score: Float, algorithm: KeywordAlgorithm, positions: [UInt]? = nil) {
|
||
self.text = text
|
||
self.score = score
|
||
self.algorithm = algorithm
|
||
self.positions = positions
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for Keyword
|
||
internal extension Keyword {
|
||
init(_ rb: RustBridge.KeywordRef) throws {
|
||
self.text = rb.text().toString()
|
||
self.score = rb.score()
|
||
self.algorithm = KeywordAlgorithm(rawValue: rb.algorithm().toString()) ?? { fatalError("Unknown KeywordAlgorithm: \(rb.algorithm().toString())") }()
|
||
self.positions = rb.positions().map { Array($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.Keyword {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.keywordFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Configuration for PaddleOCR backend.
|
||
///
|
||
/// Configures PaddleOCR text detection and recognition with multi-language support.
|
||
/// Uses a builder pattern for convenient configuration.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// ```no_run
|
||
/// use kreuzberg::PaddleOcrConfig;
|
||
///
|
||
/// // Create with default English configuration
|
||
/// let config = PaddleOcrConfig::new("en");
|
||
///
|
||
/// // Create with custom cache directory
|
||
/// let config = PaddleOcrConfig::new("ch")
|
||
/// .with_cache_dir("/path/to/cache".into());
|
||
///
|
||
/// // Enable table detection
|
||
/// let config = PaddleOcrConfig::new("en")
|
||
/// .with_table_detection(true);
|
||
/// ```
|
||
public typealias PaddleOcrConfig = RustBridge.PaddleOcrConfig
|
||
|
||
/// Combined paths to all models needed for OCR (backward compatibility).
|
||
public typealias ModelPaths = RustBridge.ModelPaths
|
||
|
||
/// Document orientation detection result.
|
||
public struct OrientationResult: Codable, Sendable, Hashable {
|
||
/// Detected orientation in degrees (0, 90, 180, or 270).
|
||
public let degrees: UInt32
|
||
/// Confidence score (0.0-1.0).
|
||
public let confidence: Float
|
||
public init(degrees: UInt32, confidence: Float) {
|
||
self.degrees = degrees
|
||
self.confidence = confidence
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for OrientationResult
|
||
internal extension OrientationResult {
|
||
init(_ rb: RustBridge.OrientationResultRef) throws {
|
||
self.degrees = rb.degrees()
|
||
self.confidence = rb.confidence()
|
||
}
|
||
func intoRust() throws -> RustBridge.OrientationResult {
|
||
return RustBridge.OrientationResult(self.degrees, self.confidence)
|
||
}
|
||
}
|
||
|
||
/// Bounding box in original image coordinates (x1, y1) top-left, (x2, y2) bottom-right.
|
||
public struct BBox: Codable, Sendable, Hashable {
|
||
public let x1: Float
|
||
public let y1: Float
|
||
public let x2: Float
|
||
public let y2: Float
|
||
public init(x1: Float, y1: Float, x2: Float, y2: Float) {
|
||
self.x1 = x1
|
||
self.y1 = y1
|
||
self.x2 = x2
|
||
self.y2 = y2
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for BBox
|
||
internal extension BBox {
|
||
init(_ rb: RustBridge.BBoxRef) throws {
|
||
self.x1 = rb.x1()
|
||
self.y1 = rb.y1()
|
||
self.x2 = rb.x2()
|
||
self.y2 = rb.y2()
|
||
}
|
||
func intoRust() throws -> RustBridge.BBox {
|
||
return RustBridge.BBox(self.x1, self.y1, self.x2, self.y2)
|
||
}
|
||
}
|
||
|
||
/// A single layout detection result.
|
||
public struct LayoutDetection: Codable, Sendable, Hashable {
|
||
public let className: LayoutClass
|
||
public let confidence: Float
|
||
public let bbox: BBox
|
||
public init(className: LayoutClass, confidence: Float, bbox: BBox) {
|
||
self.className = className
|
||
self.confidence = confidence
|
||
self.bbox = bbox
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case className = "class_name"
|
||
case confidence = "confidence"
|
||
case bbox = "bbox"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for LayoutDetection
|
||
internal extension LayoutDetection {
|
||
init(_ rb: RustBridge.LayoutDetectionRef) throws {
|
||
self.className = LayoutClass(rawValue: rb.className().toString()) ?? { fatalError("Unknown LayoutClass: \(rb.className().toString())") }()
|
||
self.confidence = rb.confidence()
|
||
self.bbox = try BBox(rb.bbox())
|
||
}
|
||
func intoRust() throws -> RustBridge.LayoutDetection {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.layoutDetectionFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Pre-computed table markdown for a table detection region.
|
||
///
|
||
/// Produced by the TATR-based table structure recognizer and surfaced as part of
|
||
/// layout-aware OCR results. The struct lives here (under `layout-types`, pure-Rust)
|
||
/// so that consumers who do not enable `layout-detection` (ORT) can still reference
|
||
/// the type in their own code.
|
||
public struct RecognizedTable: Codable, Sendable, Hashable {
|
||
/// Detection bbox that this table corresponds to (for matching).
|
||
public let detectionBbox: BBox
|
||
/// Table cells as a 2D vector (rows × columns).
|
||
public let cells: [[String]]
|
||
/// Rendered markdown table.
|
||
public let markdown: String
|
||
public init(detectionBbox: BBox, cells: [[String]], markdown: String) {
|
||
self.detectionBbox = detectionBbox
|
||
self.cells = cells
|
||
self.markdown = markdown
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case detectionBbox = "detection_bbox"
|
||
case cells = "cells"
|
||
case markdown = "markdown"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for RecognizedTable
|
||
internal extension RecognizedTable {
|
||
init(_ rb: RustBridge.RecognizedTableRef) throws {
|
||
self.detectionBbox = try BBox(rb.detectionBbox())
|
||
self.cells = try JSONDecoder().decode([[String]].self, from: ((rb.cells().toString()).data(using: .utf8) ?? Data("null".utf8)))
|
||
self.markdown = rb.markdown().toString()
|
||
}
|
||
func intoRust() throws -> RustBridge.RecognizedTable {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.recognizedTableFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Page-level detection result containing all detections and page metadata.
|
||
public struct DetectionResult: Codable, Sendable, Hashable {
|
||
public let pageWidth: UInt32
|
||
public let pageHeight: UInt32
|
||
public let detections: [LayoutDetection]
|
||
public init(pageWidth: UInt32, pageHeight: UInt32, detections: [LayoutDetection]) {
|
||
self.pageWidth = pageWidth
|
||
self.pageHeight = pageHeight
|
||
self.detections = detections
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case pageWidth = "page_width"
|
||
case pageHeight = "page_height"
|
||
case detections = "detections"
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for DetectionResult
|
||
internal extension DetectionResult {
|
||
init(_ rb: RustBridge.DetectionResultRef) throws {
|
||
self.pageWidth = rb.pageWidth()
|
||
self.pageHeight = rb.pageHeight()
|
||
self.detections = try rb.detections().map { try LayoutDetection($0) }
|
||
}
|
||
func intoRust() throws -> RustBridge.DetectionResult {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "{}"
|
||
return try RustBridge.detectionResultFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Embedded file descriptor extracted from the PDF name tree.
|
||
public typealias EmbeddedFile = RustBridge.EmbeddedFile
|
||
|
||
/// PDF-specific metadata.
|
||
///
|
||
/// Contains metadata fields specific to PDF documents that are not in the common
|
||
/// `Metadata` structure. Common fields like title, authors, keywords, and dates
|
||
/// are at the `Metadata` level.
|
||
public struct PdfMetadata: Codable, Sendable, Hashable {
|
||
/// PDF version (e.g., "1.7", "2.0")
|
||
public let pdfVersion: String?
|
||
/// PDF producer (application that created the PDF)
|
||
public let producer: String?
|
||
/// Whether the PDF is encrypted/password-protected
|
||
public let isEncrypted: Bool?
|
||
/// First page width in points (1/72 inch)
|
||
public let width: Int64?
|
||
/// First page height in points (1/72 inch)
|
||
public let height: Int64?
|
||
/// Total number of pages in the PDF document
|
||
public let pageCount: UInt32?
|
||
public init(pdfVersion: String? = nil, producer: String? = nil, isEncrypted: Bool? = nil, width: Int64? = nil, height: Int64? = nil, pageCount: UInt32? = nil) {
|
||
self.pdfVersion = pdfVersion
|
||
self.producer = producer
|
||
self.isEncrypted = isEncrypted
|
||
self.width = width
|
||
self.height = height
|
||
self.pageCount = pageCount
|
||
}
|
||
private enum CodingKeys: String, CodingKey {
|
||
case pdfVersion = "pdf_version"
|
||
case producer = "producer"
|
||
case isEncrypted = "is_encrypted"
|
||
case width = "width"
|
||
case height = "height"
|
||
case pageCount = "page_count"
|
||
}
|
||
public init(from decoder: any Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
self.pdfVersion = try container.decodeIfPresent(String.self, forKey: .pdfVersion) ?? nil
|
||
self.producer = try container.decodeIfPresent(String.self, forKey: .producer) ?? nil
|
||
self.isEncrypted = try container.decodeIfPresent(Bool.self, forKey: .isEncrypted) ?? nil
|
||
self.width = try container.decodeIfPresent(Int64.self, forKey: .width) ?? nil
|
||
self.height = try container.decodeIfPresent(Int64.self, forKey: .height) ?? nil
|
||
self.pageCount = try container.decodeIfPresent(UInt32.self, forKey: .pageCount) ?? nil
|
||
}
|
||
}
|
||
|
||
// MARK: - Internal FFI conversions for PdfMetadata
|
||
internal extension PdfMetadata {
|
||
init(_ rb: RustBridge.PdfMetadataRef) throws {
|
||
self.pdfVersion = rb.pdfVersion()?.toString()
|
||
self.producer = rb.producer()?.toString()
|
||
self.isEncrypted = rb.isEncrypted()
|
||
self.width = rb.width()
|
||
self.height = rb.height()
|
||
self.pageCount = rb.pageCount()
|
||
}
|
||
func intoRust() throws -> RustBridge.PdfMetadata {
|
||
return RustBridge.PdfMetadata(self.pdfVersion.map(RustString.init), self.producer.map(RustString.init), self.isEncrypted, self.width, self.height, self.pageCount)
|
||
}
|
||
}
|
||
|
||
/// ONNX Runtime execution provider type.
|
||
///
|
||
/// Determines which hardware backend is used for model inference.
|
||
/// `Auto` (default) selects the best available provider per platform.
|
||
public enum ExecutionProviderType: String, Codable, Sendable, Hashable {
|
||
/// Auto-select: CoreML on macOS, CUDA on Linux, CPU elsewhere.
|
||
case auto
|
||
/// CPU execution provider (always available).
|
||
case cpu
|
||
/// Apple CoreML (macOS/iOS Neural Engine + GPU).
|
||
case coreMl = "coreml"
|
||
/// NVIDIA CUDA GPU acceleration.
|
||
case cuda
|
||
/// NVIDIA TensorRT (optimized CUDA inference).
|
||
case tensorRt = "tensorrt"
|
||
}
|
||
extension ExecutionProviderType {
|
||
func intoRust() throws -> RustBridge.ExecutionProviderType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.executionProviderTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Output format for extraction results.
|
||
///
|
||
/// Controls the format of the `content` field in `ExtractionResult`.
|
||
/// When set to `Markdown`, `Djot`, or `Html`, the output uses that format.
|
||
/// `Plain` returns the raw extracted text.
|
||
/// `Structured` returns JSON with full OCR element data including bounding
|
||
/// boxes and confidence scores.
|
||
public enum OutputFormat: Codable, Sendable, Hashable {
|
||
/// Plain text content only (default)
|
||
case plain
|
||
/// Markdown format
|
||
case markdown
|
||
/// Djot markup format
|
||
case djot
|
||
/// HTML format
|
||
case html
|
||
/// JSON tree format with heading-driven sections.
|
||
case json
|
||
/// Structured JSON format with full OCR element metadata.
|
||
case structured
|
||
/// Custom renderer registered via the RendererRegistry.
|
||
/// The string is the renderer name (e.g., "docx", "latex").
|
||
case custom(field0: String)
|
||
}
|
||
extension OutputFormat {
|
||
func intoRust() throws -> RustBridge.OutputFormat {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.outputFormatFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Built-in HTML theme selection.
|
||
public enum HtmlTheme: String, Codable, Sendable, Hashable {
|
||
/// Sensible defaults: system font stack, neutral colours, readable line
|
||
/// measure. CSS custom properties (`--kb-*`) are all defined so user CSS
|
||
/// can override individual values.
|
||
case `default`
|
||
/// GitHub Markdown-inspired palette and spacing.
|
||
case gitHub = "github"
|
||
/// Dark background, light text.
|
||
case dark
|
||
/// Minimal light theme with generous whitespace.
|
||
case light
|
||
/// No built-in stylesheet emitted. CSS custom properties are still defined
|
||
/// on `:root` so user stylesheets can reference `var(--kb-*)` tokens.
|
||
case unstyled
|
||
}
|
||
extension HtmlTheme {
|
||
func intoRust() throws -> RustBridge.HtmlTheme {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.htmlThemeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Which table structure recognition model to use.
|
||
///
|
||
/// Controls the model used for table cell detection within layout-detected
|
||
/// table regions. Wire format is snake_case in all serializers (JSON, TOML,
|
||
/// YAML).
|
||
public enum TableModel: String, Codable, Sendable, Hashable {
|
||
/// TATR (Table Transformer) -- default, 30MB, DETR-based row/column detection.
|
||
case tatr
|
||
/// SLANeXT wired variant -- 365MB, optimized for bordered tables.
|
||
case slanetWired = "slanet_wired"
|
||
/// SLANeXT wireless variant -- 365MB, optimized for borderless tables.
|
||
case slanetWireless = "slanet_wireless"
|
||
/// SLANet-plus -- 7.78MB, lightweight general-purpose.
|
||
case slanetPlus = "slanet_plus"
|
||
/// Classifier-routed SLANeXT: auto-select wired/wireless per table.
|
||
/// Uses PP-LCNet classifier (6.78MB) + both SLANeXT variants (730MB total).
|
||
case slanetAuto = "slanet_auto"
|
||
/// Disable table structure model inference entirely; use heuristic path only.
|
||
case disabled
|
||
}
|
||
extension TableModel {
|
||
func intoRust() throws -> RustBridge.TableModel {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.tableModelFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Type of text chunker to use.
|
||
///
|
||
/// # Variants
|
||
///
|
||
/// * `Text` - Generic text splitter, splits on whitespace and punctuation
|
||
/// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
|
||
/// * `Yaml` - YAML-aware splitter, creates one chunk per top-level key
|
||
/// * `Semantic` - Topic-aware chunker. With an `EmbeddingConfig`, splits at
|
||
/// embedding-based topic shifts tuned by `topic_threshold` (default 0.75,
|
||
/// lower = more splits). Without an embedding, falls back to a
|
||
/// structural-boundary heuristic (ALL-CAPS headers, numbered sections,
|
||
/// blank-line paragraphs) and merges groups into chunks capped at
|
||
/// `max_characters` (default 1000). `topic_threshold` has no effect in the
|
||
/// fallback path. For best results, pair with an embedding model.
|
||
public enum ChunkerType: String, Codable, Sendable, Hashable {
|
||
case text
|
||
case markdown
|
||
case yaml
|
||
case semantic
|
||
}
|
||
extension ChunkerType {
|
||
func intoRust() throws -> RustBridge.ChunkerType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.chunkerTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// How chunk size is measured.
|
||
///
|
||
/// Defaults to `Characters` (Unicode character count). When using token-based sizing,
|
||
/// chunks are sized by token count according to the specified tokenizer.
|
||
///
|
||
/// Token-based sizing uses HuggingFace tokenizers loaded at runtime. Any tokenizer
|
||
/// available on HuggingFace Hub can be used, including OpenAI-compatible tokenizers
|
||
/// (e.g., `Xenova/gpt-4o`, `Xenova/cl100k_base`).
|
||
public enum ChunkSizing: Codable, Sendable, Hashable {
|
||
/// Size measured in Unicode characters (default).
|
||
case characters
|
||
/// Size measured in tokens from a HuggingFace tokenizer.
|
||
case tokenizer(model: String, cacheDir: URL?)
|
||
|
||
private enum CodingKeys: String, CodingKey {
|
||
case type
|
||
case cacheDir = "cache_dir"
|
||
case model
|
||
}
|
||
|
||
public init(from decoder: Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
let type = try container.decode(String.self, forKey: .type)
|
||
switch type {
|
||
case "characters":
|
||
self = .characters
|
||
case "tokenizer":
|
||
self = .tokenizer(model: try container.decode(String.self, forKey: .model), cacheDir: try container.decodeIfPresent(URL.self, forKey: .cacheDir))
|
||
default:
|
||
throw DecodingError.dataCorruptedError(
|
||
forKey: .type,
|
||
in: container,
|
||
debugDescription: "Unknown ChunkSizing type: \(type)"
|
||
)
|
||
}
|
||
}
|
||
|
||
public func encode(to encoder: Encoder) throws {
|
||
var container = encoder.container(keyedBy: CodingKeys.self)
|
||
switch self {
|
||
case .characters:
|
||
try container.encode("characters", forKey: .type)
|
||
case .tokenizer(let model, let cacheDir):
|
||
try container.encode("tokenizer", forKey: .type)
|
||
try container.encode(model, forKey: .model)
|
||
try container.encodeIfPresent(cacheDir, forKey: .cacheDir)
|
||
}
|
||
}
|
||
}
|
||
extension ChunkSizing {
|
||
func intoRust() throws -> RustBridge.ChunkSizing {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.chunkSizingFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Embedding model types supported by Kreuzberg.
|
||
public enum EmbeddingModelType: Codable, Sendable, Hashable {
|
||
/// Use a preset model configuration (recommended)
|
||
case preset(name: String)
|
||
/// Use a custom ONNX model from HuggingFace
|
||
case custom(modelId: String, dimensions: UInt)
|
||
/// Provider-hosted embedding model via liter-llm.
|
||
///
|
||
/// Uses the model specified in the nested `LlmConfig` (e.g.,
|
||
/// `"openai/text-embedding-3-small"`).
|
||
case llm(llm: LlmConfig)
|
||
/// In-process embedding backend registered via the plugin system.
|
||
///
|
||
/// The caller registers an [`EmbeddingBackend`](crate::plugins::EmbeddingBackend) once
|
||
/// (e.g. a wrapper around an already-loaded `llama-cpp-python`, `sentence-transformers`,
|
||
/// or tuned ONNX model), then references it by name in config. Kreuzberg calls back
|
||
/// into the registered backend during chunking and standalone embed requests —
|
||
/// no HuggingFace download, no ONNX Runtime requirement, no HTTP sidecar.
|
||
///
|
||
/// When this variant is selected, only the following [`EmbeddingConfig`] fields
|
||
/// apply: `normalize` (post-call L2 normalization) and `max_embed_duration_secs`
|
||
/// (dispatcher timeout). Model-loading fields (`batch_size`, `cache_dir`,
|
||
/// `show_download_progress`, `acceleration`) are ignored — the host owns the
|
||
/// model lifecycle.
|
||
///
|
||
/// Semantic chunking falls back to [`ChunkingConfig::max_characters`] when this variant
|
||
/// is used, since there is no preset to look a chunk-size ceiling up against — size your
|
||
/// context window via `max_characters` directly.
|
||
///
|
||
/// See `register_embedding_backend`.
|
||
case plugin(name: String)
|
||
|
||
private enum CodingKeys: String, CodingKey {
|
||
case type
|
||
case dimensions
|
||
case llm
|
||
case modelId = "model_id"
|
||
case name
|
||
}
|
||
|
||
public init(from decoder: Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
let type = try container.decode(String.self, forKey: .type)
|
||
switch type {
|
||
case "preset":
|
||
self = .preset(name: try container.decode(String.self, forKey: .name))
|
||
case "custom":
|
||
self = .custom(modelId: try container.decode(String.self, forKey: .modelId), dimensions: try container.decode(UInt.self, forKey: .dimensions))
|
||
case "llm":
|
||
self = .llm(llm: try container.decode(LlmConfig.self, forKey: .llm))
|
||
case "plugin":
|
||
self = .plugin(name: try container.decode(String.self, forKey: .name))
|
||
default:
|
||
throw DecodingError.dataCorruptedError(
|
||
forKey: .type,
|
||
in: container,
|
||
debugDescription: "Unknown EmbeddingModelType type: \(type)"
|
||
)
|
||
}
|
||
}
|
||
|
||
public func encode(to encoder: Encoder) throws {
|
||
var container = encoder.container(keyedBy: CodingKeys.self)
|
||
switch self {
|
||
case .preset(let name):
|
||
try container.encode("preset", forKey: .type)
|
||
try container.encode(name, forKey: .name)
|
||
case .custom(let modelId, let dimensions):
|
||
try container.encode("custom", forKey: .type)
|
||
try container.encode(modelId, forKey: .modelId)
|
||
try container.encode(dimensions, forKey: .dimensions)
|
||
case .llm(let llm):
|
||
try container.encode("llm", forKey: .type)
|
||
try container.encode(llm, forKey: .llm)
|
||
case .plugin(let name):
|
||
try container.encode("plugin", forKey: .type)
|
||
try container.encode(name, forKey: .name)
|
||
}
|
||
}
|
||
}
|
||
extension EmbeddingModelType {
|
||
func intoRust() throws -> RustBridge.EmbeddingModelType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.embeddingModelTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Content rendering mode for code extraction.
|
||
///
|
||
/// Controls how extracted code content is represented in the `content` field
|
||
/// of `ExtractionResult`.
|
||
public enum CodeContentMode: String, Codable, Sendable, Hashable {
|
||
/// Use TSLP semantic chunks as content (default).
|
||
case chunks
|
||
/// Use raw source code as content.
|
||
case raw
|
||
/// Emit function/class headings + docstrings (no code bodies).
|
||
case structure
|
||
}
|
||
extension CodeContentMode {
|
||
func intoRust() throws -> RustBridge.CodeContentMode {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.codeContentModeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Type of list detection.
|
||
public typealias ListType = RustBridge.ListType
|
||
|
||
/// OCR backend types.
|
||
public enum OcrBackendType: String, Codable, Sendable, Hashable {
|
||
/// Tesseract OCR (native Rust binding)
|
||
case tesseract = "Tesseract"
|
||
/// EasyOCR (Python-based, via FFI)
|
||
case easyOcr = "EasyOCR"
|
||
/// PaddleOCR (Python-based, via FFI)
|
||
case paddleOcr = "PaddleOCR"
|
||
/// Custom/third-party OCR backend
|
||
case custom = "Custom"
|
||
}
|
||
extension OcrBackendType {
|
||
func intoRust() throws -> RustBridge.OcrBackendType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.ocrBackendTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Processing stages for post-processors.
|
||
///
|
||
/// Post-processors are executed in stage order (Early → Middle → Late).
|
||
/// Use stages to control the order of post-processing operations.
|
||
public enum ProcessingStage: String, Codable, Sendable, Hashable {
|
||
/// Early stage - foundational processing.
|
||
///
|
||
/// Use for:
|
||
/// - Language detection
|
||
/// - Character encoding normalization
|
||
/// - Entity extraction (NER)
|
||
/// - Text quality scoring
|
||
case early = "Early"
|
||
/// Middle stage - content transformation.
|
||
///
|
||
/// Use for:
|
||
/// - Keyword extraction
|
||
/// - Token reduction
|
||
/// - Text summarization
|
||
/// - Semantic analysis
|
||
case middle = "Middle"
|
||
/// Late stage - final enrichment.
|
||
///
|
||
/// Use for:
|
||
/// - Custom user hooks
|
||
/// - Analytics/logging
|
||
/// - Final validation
|
||
/// - Output formatting
|
||
case late = "Late"
|
||
}
|
||
extension ProcessingStage {
|
||
func intoRust() throws -> RustBridge.ProcessingStage {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.processingStageFromJson(json)
|
||
}
|
||
}
|
||
|
||
public enum ReductionLevel: String, Codable, Sendable, Hashable {
|
||
case off = "Off"
|
||
case light = "Light"
|
||
case moderate = "Moderate"
|
||
case aggressive = "Aggressive"
|
||
case maximum = "Maximum"
|
||
}
|
||
extension ReductionLevel {
|
||
func intoRust() throws -> RustBridge.ReductionLevel {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.reductionLevelFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Type of PDF annotation.
|
||
public enum PdfAnnotationType: String, Codable, Sendable, Hashable {
|
||
/// Sticky note / text annotation
|
||
case text
|
||
/// Highlighted text region
|
||
case highlight
|
||
/// Hyperlink annotation
|
||
case link
|
||
/// Rubber stamp annotation
|
||
case stamp
|
||
/// Underline text markup
|
||
case underline
|
||
/// Strikeout text markup
|
||
case strikeOut = "strike_out"
|
||
/// Any other annotation type
|
||
case other
|
||
}
|
||
extension PdfAnnotationType {
|
||
func intoRust() throws -> RustBridge.PdfAnnotationType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.pdfAnnotationTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Types of block-level elements in Djot.
|
||
public enum BlockType: String, Codable, Sendable, Hashable {
|
||
case paragraph
|
||
case heading
|
||
case blockquote
|
||
case codeBlock = "code_block"
|
||
case listItem = "list_item"
|
||
case orderedList = "ordered_list"
|
||
case bulletList = "bullet_list"
|
||
case taskList = "task_list"
|
||
case definitionList = "definition_list"
|
||
case definitionTerm = "definition_term"
|
||
case definitionDescription = "definition_description"
|
||
case div
|
||
case section
|
||
case thematicBreak = "thematic_break"
|
||
case rawBlock = "raw_block"
|
||
case mathDisplay = "math_display"
|
||
}
|
||
extension BlockType {
|
||
func intoRust() throws -> RustBridge.BlockType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.blockTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Types of inline elements in Djot.
|
||
public enum InlineType: String, Codable, Sendable, Hashable {
|
||
case text
|
||
case strong
|
||
case emphasis
|
||
case highlight
|
||
case `subscript`
|
||
case superscript
|
||
case insert
|
||
case delete
|
||
case code
|
||
case link
|
||
case image
|
||
case span
|
||
case math
|
||
case rawInline = "raw_inline"
|
||
case footnoteRef = "footnote_ref"
|
||
case symbol
|
||
}
|
||
extension InlineType {
|
||
func intoRust() throws -> RustBridge.InlineType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.inlineTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Semantic kind of a relationship between document elements.
|
||
public enum RelationshipKind: String, Codable, Sendable, Hashable {
|
||
/// Footnote marker -> footnote definition.
|
||
case footnoteReference = "footnote_reference"
|
||
/// Citation marker -> bibliography entry.
|
||
case citationReference = "citation_reference"
|
||
/// Internal anchor link (`#id`) -> target heading/element.
|
||
case internalLink = "internal_link"
|
||
/// Caption paragraph -> figure/table it describes.
|
||
case caption
|
||
/// Label -> labeled element (HTML `<label for>`, LaTeX `\label{}`).
|
||
case label
|
||
/// TOC entry -> target section.
|
||
case tocEntry = "toc_entry"
|
||
/// Cross-reference (LaTeX `\ref{}`, DOCX cross-reference field).
|
||
case crossReference = "cross_reference"
|
||
}
|
||
extension RelationshipKind {
|
||
func intoRust() throws -> RustBridge.RelationshipKind {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.relationshipKindFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Content layer classification for document nodes.
|
||
///
|
||
/// Replaces separate body/furniture arrays with per-node granularity.
|
||
public enum ContentLayer: String, Codable, Sendable, Hashable {
|
||
/// Main document body content.
|
||
case body
|
||
/// Page/section header (running header).
|
||
case header
|
||
/// Page/section footer (running footer).
|
||
case footer
|
||
/// Footnote content.
|
||
case footnote
|
||
}
|
||
extension ContentLayer {
|
||
func intoRust() throws -> RustBridge.ContentLayer {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.contentLayerFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Tagged enum for node content. Each variant carries only type-specific data.
|
||
///
|
||
/// Uses `#[serde(tag = "node_type")]` to avoid "type" keyword collision in
|
||
/// Go/Java/TypeScript bindings.
|
||
public enum NodeContent: Codable, Sendable, Hashable {
|
||
/// Document title.
|
||
case title(text: String)
|
||
/// Section heading with level (1-6).
|
||
case heading(level: UInt8, text: String)
|
||
/// Body text paragraph.
|
||
case paragraph(text: String)
|
||
/// List container — children are `ListItem` nodes.
|
||
case list(ordered: Bool)
|
||
/// Individual list item.
|
||
case listItem(text: String)
|
||
/// Table with structured cell grid.
|
||
case table(grid: TableGrid)
|
||
/// Image reference.
|
||
case image(description: String?, imageIndex: UInt32?, src: String?)
|
||
/// Code block.
|
||
case code(text: String, language: String?)
|
||
/// Block quote — container, children carry the quoted content.
|
||
case quote
|
||
/// Mathematical formula / equation.
|
||
case formula(text: String)
|
||
/// Footnote reference content.
|
||
case footnote(text: String)
|
||
/// Logical grouping container (section, key-value area).
|
||
///
|
||
/// `heading_level` + `heading_text` capture the section heading directly
|
||
/// rather than relying on a first-child positional convention.
|
||
case group(label: String?, headingLevel: UInt8?, headingText: String?)
|
||
/// Page break marker.
|
||
case pageBreak
|
||
/// Presentation slide container — children are the slide's content nodes.
|
||
case slide(number: UInt32, title: String?)
|
||
/// Definition list container — children are `DefinitionItem` nodes.
|
||
case definitionList
|
||
/// Individual definition list entry with term and definition.
|
||
case definitionItem(term: String, definition: String)
|
||
/// Citation or bibliographic reference.
|
||
case citation(key: String, text: String)
|
||
/// Admonition / callout container (note, warning, tip, etc.).
|
||
///
|
||
/// Children carry the admonition body content.
|
||
case admonition(kind: String, title: String?)
|
||
/// Raw block preserved verbatim from the source format.
|
||
///
|
||
/// Used for content that cannot be mapped to a semantic node type
|
||
/// (e.g. JSX in MDX, raw LaTeX in markdown, embedded HTML).
|
||
case rawBlock(format: String, content: String)
|
||
/// Structured metadata block (email headers, YAML frontmatter, etc.).
|
||
case metadataBlock(entries: [[String]])
|
||
|
||
private enum CodingKeys: String, CodingKey {
|
||
case node_type
|
||
case content
|
||
case definition
|
||
case description
|
||
case entries
|
||
case format
|
||
case grid
|
||
case headingLevel = "heading_level"
|
||
case headingText = "heading_text"
|
||
case imageIndex = "image_index"
|
||
case key
|
||
case kind
|
||
case label
|
||
case language
|
||
case level
|
||
case number
|
||
case ordered
|
||
case src
|
||
case term
|
||
case text
|
||
case title
|
||
}
|
||
|
||
public init(from decoder: Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
let type = try container.decode(String.self, forKey: .node_type)
|
||
switch type {
|
||
case "title":
|
||
self = .title(text: try container.decode(String.self, forKey: .text))
|
||
case "heading":
|
||
self = .heading(level: try container.decode(UInt8.self, forKey: .level), text: try container.decode(String.self, forKey: .text))
|
||
case "paragraph":
|
||
self = .paragraph(text: try container.decode(String.self, forKey: .text))
|
||
case "list":
|
||
self = .list(ordered: try container.decode(Bool.self, forKey: .ordered))
|
||
case "list_item":
|
||
self = .listItem(text: try container.decode(String.self, forKey: .text))
|
||
case "table":
|
||
self = .table(grid: try container.decode(TableGrid.self, forKey: .grid))
|
||
case "image":
|
||
self = .image(description: try container.decodeIfPresent(String.self, forKey: .description), imageIndex: try container.decodeIfPresent(UInt32.self, forKey: .imageIndex), src: try container.decodeIfPresent(String.self, forKey: .src))
|
||
case "code":
|
||
self = .code(text: try container.decode(String.self, forKey: .text), language: try container.decodeIfPresent(String.self, forKey: .language))
|
||
case "quote":
|
||
self = .quote
|
||
case "formula":
|
||
self = .formula(text: try container.decode(String.self, forKey: .text))
|
||
case "footnote":
|
||
self = .footnote(text: try container.decode(String.self, forKey: .text))
|
||
case "group":
|
||
self = .group(label: try container.decodeIfPresent(String.self, forKey: .label), headingLevel: try container.decodeIfPresent(UInt8.self, forKey: .headingLevel), headingText: try container.decodeIfPresent(String.self, forKey: .headingText))
|
||
case "page_break":
|
||
self = .pageBreak
|
||
case "slide":
|
||
self = .slide(number: try container.decode(UInt32.self, forKey: .number), title: try container.decodeIfPresent(String.self, forKey: .title))
|
||
case "definition_list":
|
||
self = .definitionList
|
||
case "definition_item":
|
||
self = .definitionItem(term: try container.decode(String.self, forKey: .term), definition: try container.decode(String.self, forKey: .definition))
|
||
case "citation":
|
||
self = .citation(key: try container.decode(String.self, forKey: .key), text: try container.decode(String.self, forKey: .text))
|
||
case "admonition":
|
||
self = .admonition(kind: try container.decode(String.self, forKey: .kind), title: try container.decodeIfPresent(String.self, forKey: .title))
|
||
case "raw_block":
|
||
self = .rawBlock(format: try container.decode(String.self, forKey: .format), content: try container.decode(String.self, forKey: .content))
|
||
case "metadata_block":
|
||
self = .metadataBlock(entries: try container.decode([[String]].self, forKey: .entries))
|
||
default:
|
||
throw DecodingError.dataCorruptedError(
|
||
forKey: .node_type,
|
||
in: container,
|
||
debugDescription: "Unknown NodeContent type: \(type)"
|
||
)
|
||
}
|
||
}
|
||
|
||
public func encode(to encoder: Encoder) throws {
|
||
var container = encoder.container(keyedBy: CodingKeys.self)
|
||
switch self {
|
||
case .title(let text):
|
||
try container.encode("title", forKey: .node_type)
|
||
try container.encode(text, forKey: .text)
|
||
case .heading(let level, let text):
|
||
try container.encode("heading", forKey: .node_type)
|
||
try container.encode(level, forKey: .level)
|
||
try container.encode(text, forKey: .text)
|
||
case .paragraph(let text):
|
||
try container.encode("paragraph", forKey: .node_type)
|
||
try container.encode(text, forKey: .text)
|
||
case .list(let ordered):
|
||
try container.encode("list", forKey: .node_type)
|
||
try container.encode(ordered, forKey: .ordered)
|
||
case .listItem(let text):
|
||
try container.encode("list_item", forKey: .node_type)
|
||
try container.encode(text, forKey: .text)
|
||
case .table(let grid):
|
||
try container.encode("table", forKey: .node_type)
|
||
try container.encode(grid, forKey: .grid)
|
||
case .image(let description, let imageIndex, let src):
|
||
try container.encode("image", forKey: .node_type)
|
||
try container.encodeIfPresent(description, forKey: .description)
|
||
try container.encodeIfPresent(imageIndex, forKey: .imageIndex)
|
||
try container.encodeIfPresent(src, forKey: .src)
|
||
case .code(let text, let language):
|
||
try container.encode("code", forKey: .node_type)
|
||
try container.encode(text, forKey: .text)
|
||
try container.encodeIfPresent(language, forKey: .language)
|
||
case .quote:
|
||
try container.encode("quote", forKey: .node_type)
|
||
case .formula(let text):
|
||
try container.encode("formula", forKey: .node_type)
|
||
try container.encode(text, forKey: .text)
|
||
case .footnote(let text):
|
||
try container.encode("footnote", forKey: .node_type)
|
||
try container.encode(text, forKey: .text)
|
||
case .group(let label, let headingLevel, let headingText):
|
||
try container.encode("group", forKey: .node_type)
|
||
try container.encodeIfPresent(label, forKey: .label)
|
||
try container.encodeIfPresent(headingLevel, forKey: .headingLevel)
|
||
try container.encodeIfPresent(headingText, forKey: .headingText)
|
||
case .pageBreak:
|
||
try container.encode("page_break", forKey: .node_type)
|
||
case .slide(let number, let title):
|
||
try container.encode("slide", forKey: .node_type)
|
||
try container.encode(number, forKey: .number)
|
||
try container.encodeIfPresent(title, forKey: .title)
|
||
case .definitionList:
|
||
try container.encode("definition_list", forKey: .node_type)
|
||
case .definitionItem(let term, let definition):
|
||
try container.encode("definition_item", forKey: .node_type)
|
||
try container.encode(term, forKey: .term)
|
||
try container.encode(definition, forKey: .definition)
|
||
case .citation(let key, let text):
|
||
try container.encode("citation", forKey: .node_type)
|
||
try container.encode(key, forKey: .key)
|
||
try container.encode(text, forKey: .text)
|
||
case .admonition(let kind, let title):
|
||
try container.encode("admonition", forKey: .node_type)
|
||
try container.encode(kind, forKey: .kind)
|
||
try container.encodeIfPresent(title, forKey: .title)
|
||
case .rawBlock(let format, let content):
|
||
try container.encode("raw_block", forKey: .node_type)
|
||
try container.encode(format, forKey: .format)
|
||
try container.encode(content, forKey: .content)
|
||
case .metadataBlock(let entries):
|
||
try container.encode("metadata_block", forKey: .node_type)
|
||
try container.encode(entries, forKey: .entries)
|
||
}
|
||
}
|
||
}
|
||
extension NodeContent {
|
||
func intoRust() throws -> RustBridge.NodeContent {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.nodeContentFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Types of inline text annotations.
|
||
public enum AnnotationKind: Codable, Sendable, Hashable {
|
||
case bold
|
||
case italic
|
||
case underline
|
||
case strikethrough
|
||
case code
|
||
case `subscript`
|
||
case superscript
|
||
case link(url: String, title: String?)
|
||
/// Highlighted text (PDF highlights, HTML `<mark>`).
|
||
case highlight
|
||
/// Text color (CSS-compatible value, e.g. "#ff0000", "red").
|
||
case color(value: String)
|
||
/// Font size with units (e.g. "12pt", "1.2em", "16px").
|
||
case fontSize(value: String)
|
||
/// Extensible annotation for format-specific styling.
|
||
case custom(name: String, value: String?)
|
||
|
||
private enum CodingKeys: String, CodingKey {
|
||
case annotation_type
|
||
case name
|
||
case title
|
||
case url
|
||
case value
|
||
}
|
||
|
||
public init(from decoder: Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
let type = try container.decode(String.self, forKey: .annotation_type)
|
||
switch type {
|
||
case "bold":
|
||
self = .bold
|
||
case "italic":
|
||
self = .italic
|
||
case "underline":
|
||
self = .underline
|
||
case "strikethrough":
|
||
self = .strikethrough
|
||
case "code":
|
||
self = .code
|
||
case "subscript":
|
||
self = .`subscript`
|
||
case "superscript":
|
||
self = .superscript
|
||
case "link":
|
||
self = .link(url: try container.decode(String.self, forKey: .url), title: try container.decodeIfPresent(String.self, forKey: .title))
|
||
case "highlight":
|
||
self = .highlight
|
||
case "color":
|
||
self = .color(value: try container.decode(String.self, forKey: .value))
|
||
case "font_size":
|
||
self = .fontSize(value: try container.decode(String.self, forKey: .value))
|
||
case "custom":
|
||
self = .custom(name: try container.decode(String.self, forKey: .name), value: try container.decodeIfPresent(String.self, forKey: .value))
|
||
default:
|
||
throw DecodingError.dataCorruptedError(
|
||
forKey: .annotation_type,
|
||
in: container,
|
||
debugDescription: "Unknown AnnotationKind type: \(type)"
|
||
)
|
||
}
|
||
}
|
||
|
||
public func encode(to encoder: Encoder) throws {
|
||
var container = encoder.container(keyedBy: CodingKeys.self)
|
||
switch self {
|
||
case .bold:
|
||
try container.encode("bold", forKey: .annotation_type)
|
||
case .italic:
|
||
try container.encode("italic", forKey: .annotation_type)
|
||
case .underline:
|
||
try container.encode("underline", forKey: .annotation_type)
|
||
case .strikethrough:
|
||
try container.encode("strikethrough", forKey: .annotation_type)
|
||
case .code:
|
||
try container.encode("code", forKey: .annotation_type)
|
||
case .`subscript`:
|
||
try container.encode("subscript", forKey: .annotation_type)
|
||
case .superscript:
|
||
try container.encode("superscript", forKey: .annotation_type)
|
||
case .link(let url, let title):
|
||
try container.encode("link", forKey: .annotation_type)
|
||
try container.encode(url, forKey: .url)
|
||
try container.encodeIfPresent(title, forKey: .title)
|
||
case .highlight:
|
||
try container.encode("highlight", forKey: .annotation_type)
|
||
case .color(let value):
|
||
try container.encode("color", forKey: .annotation_type)
|
||
try container.encode(value, forKey: .value)
|
||
case .fontSize(let value):
|
||
try container.encode("font_size", forKey: .annotation_type)
|
||
try container.encode(value, forKey: .value)
|
||
case .custom(let name, let value):
|
||
try container.encode("custom", forKey: .annotation_type)
|
||
try container.encode(name, forKey: .name)
|
||
try container.encodeIfPresent(value, forKey: .value)
|
||
}
|
||
}
|
||
}
|
||
extension AnnotationKind {
|
||
func intoRust() throws -> RustBridge.AnnotationKind {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.annotationKindFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// How the extracted text was produced.
|
||
public enum ExtractionMethod: String, Codable, Sendable, Hashable {
|
||
case native
|
||
case ocr
|
||
case mixed
|
||
}
|
||
extension ExtractionMethod {
|
||
func intoRust() throws -> RustBridge.ExtractionMethod {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.extractionMethodFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Semantic structural classification of a text chunk.
|
||
///
|
||
/// Assigned by the heuristic classifier in `chunking::classifier`.
|
||
/// Defaults to `Unknown` when no rule matches.
|
||
/// Designed to be extended in future versions without breaking changes.
|
||
public enum ChunkType: String, Codable, Sendable, Hashable {
|
||
/// Section heading or document title.
|
||
case heading
|
||
/// Party list: names, addresses, and signatories.
|
||
case partyList = "party_list"
|
||
/// Definition clause ("X means…", "X shall mean…").
|
||
case definitions
|
||
/// Operative clause containing legal/contractual action verbs.
|
||
case operativeClause = "operative_clause"
|
||
/// Signature block with signatures, names, and dates.
|
||
case signatureBlock = "signature_block"
|
||
/// Schedule, annex, appendix, or exhibit section.
|
||
case schedule
|
||
/// Table-like content with aligned columns or repeated patterns.
|
||
case tableLike = "table_like"
|
||
/// Mathematical formula or equation.
|
||
case formula
|
||
/// Code block or preformatted content.
|
||
case codeBlock = "code_block"
|
||
/// Embedded or referenced image content.
|
||
case image
|
||
/// Organizational chart or hierarchy diagram.
|
||
case orgChart = "org_chart"
|
||
/// Diagram, figure, or visual illustration.
|
||
case diagram
|
||
/// Unclassified or mixed content.
|
||
case unknown
|
||
}
|
||
extension ChunkType {
|
||
func intoRust() throws -> RustBridge.ChunkType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.chunkTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Heuristic classification of what an image likely depicts.
|
||
public enum ImageKind: String, Codable, Sendable, Hashable {
|
||
/// Photographic image (natural scene, photograph)
|
||
case photograph
|
||
/// Technical or schematic diagram
|
||
case diagram
|
||
/// Chart, graph, or plot
|
||
case chart
|
||
/// Freehand or technical drawing
|
||
case drawing
|
||
/// Text-heavy image (scanned text, document)
|
||
case textBlock = "text_block"
|
||
/// Decorative element or border
|
||
case decoration
|
||
/// Logo or brand mark
|
||
case logo
|
||
/// Small icon
|
||
case icon
|
||
/// Fragment of a larger tiled image (tile of a technical drawing)
|
||
case tileFragment = "tile_fragment"
|
||
/// Mask or transparency map
|
||
case mask
|
||
/// Full-page render produced during OCR preprocessing; used as a citation thumbnail.
|
||
case pageRaster = "page_raster"
|
||
/// Could not classify with reasonable confidence
|
||
case unknown
|
||
}
|
||
extension ImageKind {
|
||
func intoRust() throws -> RustBridge.ImageKind {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.imageKindFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Result-shape selection for extraction results.
|
||
///
|
||
/// Distinct from `OutputFormat` (which controls rendering — Plain, Markdown,
|
||
/// HTML, etc.). `ResultFormat` controls the *shape* of the result: a unified content
|
||
/// blob vs. an element-based decomposition.
|
||
public enum ResultFormat: String, Codable, Sendable, Hashable {
|
||
/// Unified format with all content in `content` field
|
||
case unified
|
||
/// Element-based format with semantic element extraction
|
||
case elementBased = "element_based"
|
||
}
|
||
extension ResultFormat {
|
||
func intoRust() throws -> RustBridge.ResultFormat {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.resultFormatFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Semantic element type classification.
|
||
///
|
||
/// Categorizes text content into semantic units for downstream processing.
|
||
/// Supports the element types commonly found in Unstructured documents.
|
||
public enum ElementType: String, Codable, Sendable, Hashable {
|
||
/// Document title
|
||
case title
|
||
/// Main narrative text body
|
||
case narrativeText = "narrative_text"
|
||
/// Section heading
|
||
case heading
|
||
/// List item (bullet, numbered, etc.)
|
||
case listItem = "list_item"
|
||
/// Table element
|
||
case table
|
||
/// Image element
|
||
case image
|
||
/// Page break marker
|
||
case pageBreak = "page_break"
|
||
/// Code block
|
||
case codeBlock = "code_block"
|
||
/// Block quote
|
||
case blockQuote = "block_quote"
|
||
/// Footer text
|
||
case footer
|
||
/// Header text
|
||
case header
|
||
}
|
||
extension ElementType {
|
||
func intoRust() throws -> RustBridge.ElementType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.elementTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Format-specific metadata (discriminated union).
|
||
///
|
||
/// Only one format type can exist per extraction result. This provides
|
||
/// type-safe, clean metadata without nested optionals.
|
||
public typealias FormatMetadata = RustBridge.FormatMetadata
|
||
|
||
/// Text direction enumeration for HTML documents.
|
||
public enum TextDirection: String, Codable, Sendable, Hashable {
|
||
/// Left-to-right text direction
|
||
case leftToRight = "ltr"
|
||
/// Right-to-left text direction
|
||
case rightToLeft = "rtl"
|
||
/// Automatic text direction detection
|
||
case auto
|
||
}
|
||
extension TextDirection {
|
||
func intoRust() throws -> RustBridge.TextDirection {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.textDirectionFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Link type classification.
|
||
public enum LinkType: String, Codable, Sendable, Hashable {
|
||
/// Anchor link (#section)
|
||
case anchor
|
||
/// Internal link (same domain)
|
||
case `internal`
|
||
/// External link (different domain)
|
||
case external
|
||
/// Email link (mailto:)
|
||
case email
|
||
/// Phone link (tel:)
|
||
case phone
|
||
/// Other link type
|
||
case other
|
||
}
|
||
extension LinkType {
|
||
func intoRust() throws -> RustBridge.LinkType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.linkTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Image type classification.
|
||
public enum ImageType: String, Codable, Sendable, Hashable {
|
||
/// Data URI image
|
||
case dataUri = "data-uri"
|
||
/// Inline SVG
|
||
case inlineSvg = "inline-svg"
|
||
/// External image URL
|
||
case external
|
||
/// Relative path image
|
||
case relative
|
||
}
|
||
extension ImageType {
|
||
func intoRust() throws -> RustBridge.ImageType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.imageTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Structured data type classification.
|
||
public enum StructuredDataType: String, Codable, Sendable, Hashable {
|
||
/// JSON-LD structured data
|
||
case jsonLd = "json-ld"
|
||
/// Microdata
|
||
case microdata
|
||
/// RDFa
|
||
case rdFa = "rdfa"
|
||
}
|
||
extension StructuredDataType {
|
||
func intoRust() throws -> RustBridge.StructuredDataType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.structuredDataTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Bounding geometry for an OCR element.
|
||
///
|
||
/// Supports both axis-aligned rectangles (from Tesseract) and 4-point quadrilaterals
|
||
/// (from PaddleOCR and rotated text detection).
|
||
public enum OcrBoundingGeometry: Codable, Sendable, Hashable {
|
||
/// Axis-aligned bounding box (typical for Tesseract output).
|
||
case rectangle(left: UInt32, top: UInt32, width: UInt32, height: UInt32)
|
||
/// 4-point quadrilateral for rotated/skewed text (PaddleOCR).
|
||
///
|
||
/// Points are in clockwise order starting from top-left:
|
||
/// `[top_left, top_right, bottom_right, bottom_left]`
|
||
case quadrilateral(points: String)
|
||
|
||
private enum CodingKeys: String, CodingKey {
|
||
case type
|
||
case height
|
||
case left
|
||
case points
|
||
case top
|
||
case width
|
||
}
|
||
|
||
public init(from decoder: Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
let type = try container.decode(String.self, forKey: .type)
|
||
switch type {
|
||
case "rectangle":
|
||
self = .rectangle(left: try container.decode(UInt32.self, forKey: .left), top: try container.decode(UInt32.self, forKey: .top), width: try container.decode(UInt32.self, forKey: .width), height: try container.decode(UInt32.self, forKey: .height))
|
||
case "quadrilateral":
|
||
self = .quadrilateral(points: try container.decode(String.self, forKey: .points))
|
||
default:
|
||
throw DecodingError.dataCorruptedError(
|
||
forKey: .type,
|
||
in: container,
|
||
debugDescription: "Unknown OcrBoundingGeometry type: \(type)"
|
||
)
|
||
}
|
||
}
|
||
|
||
public func encode(to encoder: Encoder) throws {
|
||
var container = encoder.container(keyedBy: CodingKeys.self)
|
||
switch self {
|
||
case .rectangle(let left, let top, let width, let height):
|
||
try container.encode("rectangle", forKey: .type)
|
||
try container.encode(left, forKey: .left)
|
||
try container.encode(top, forKey: .top)
|
||
try container.encode(width, forKey: .width)
|
||
try container.encode(height, forKey: .height)
|
||
case .quadrilateral(let points):
|
||
try container.encode("quadrilateral", forKey: .type)
|
||
try container.encode(points, forKey: .points)
|
||
}
|
||
}
|
||
}
|
||
extension OcrBoundingGeometry {
|
||
func intoRust() throws -> RustBridge.OcrBoundingGeometry {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.ocrBoundingGeometryFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Hierarchical level of an OCR element.
|
||
///
|
||
/// Maps to Tesseract's page segmentation hierarchy and provides
|
||
/// equivalent semantics for PaddleOCR.
|
||
public enum OcrElementLevel: String, Codable, Sendable, Hashable {
|
||
/// Individual word
|
||
case word
|
||
/// Line of text (default for PaddleOCR)
|
||
case line
|
||
/// Paragraph or text block
|
||
case block
|
||
/// Page-level element
|
||
case page
|
||
}
|
||
extension OcrElementLevel {
|
||
func intoRust() throws -> RustBridge.OcrElementLevel {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.ocrElementLevelFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Type of paginated unit in a document.
|
||
///
|
||
/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
|
||
public enum PageUnitType: String, Codable, Sendable, Hashable {
|
||
/// Standard document pages (PDF, DOCX, images)
|
||
case page
|
||
/// Presentation slides (PPTX, ODP)
|
||
case slide
|
||
/// Spreadsheet sheets (XLSX, ODS)
|
||
case sheet
|
||
}
|
||
extension PageUnitType {
|
||
func intoRust() throws -> RustBridge.PageUnitType {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.pageUnitTypeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// A single line in a unified-diff hunk.
|
||
///
|
||
/// Defined here (rather than only in `crate::diff`) so `RevisionDelta` can
|
||
/// reference it unconditionally, without requiring the `diff` Cargo feature.
|
||
/// `crate::diff` re-exports this type verbatim.
|
||
public enum DiffLine: Codable, Sendable, Hashable {
|
||
/// Unchanged context line.
|
||
case context(field0: String)
|
||
/// Line added in the "after" version.
|
||
case added(field0: String)
|
||
/// Line removed from the "before" version.
|
||
case removed(field0: String)
|
||
|
||
private enum CodingKeys: String, CodingKey {
|
||
case kind
|
||
case field0 = "_0"
|
||
}
|
||
|
||
public init(from decoder: Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
let type = try container.decode(String.self, forKey: .kind)
|
||
switch type {
|
||
case "context":
|
||
self = .context(field0: try container.decode(String.self, forKey: .field0))
|
||
case "added":
|
||
self = .added(field0: try container.decode(String.self, forKey: .field0))
|
||
case "removed":
|
||
self = .removed(field0: try container.decode(String.self, forKey: .field0))
|
||
default:
|
||
throw DecodingError.dataCorruptedError(
|
||
forKey: .kind,
|
||
in: container,
|
||
debugDescription: "Unknown DiffLine type: \(type)"
|
||
)
|
||
}
|
||
}
|
||
|
||
public func encode(to encoder: Encoder) throws {
|
||
var container = encoder.container(keyedBy: CodingKeys.self)
|
||
switch self {
|
||
case .context(let field0):
|
||
try container.encode("context", forKey: .kind)
|
||
try container.encode(field0, forKey: .field0)
|
||
case .added(let field0):
|
||
try container.encode("added", forKey: .kind)
|
||
try container.encode(field0, forKey: .field0)
|
||
case .removed(let field0):
|
||
try container.encode("removed", forKey: .kind)
|
||
try container.encode(field0, forKey: .field0)
|
||
}
|
||
}
|
||
}
|
||
extension DiffLine {
|
||
func intoRust() throws -> RustBridge.DiffLine {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.diffLineFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Semantic classification of a tracked change.
|
||
public enum RevisionKind: String, Codable, Sendable, Hashable {
|
||
/// Text or content was inserted.
|
||
case insertion
|
||
/// Text or content was deleted.
|
||
case deletion
|
||
/// Run-level formatting (font, size, colour, …) was changed.
|
||
case formatChange = "format_change"
|
||
/// A reviewer comment or annotation.
|
||
case comment
|
||
}
|
||
extension RevisionKind {
|
||
func intoRust() throws -> RustBridge.RevisionKind {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.revisionKindFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Best-effort document location for a revision.
|
||
public enum RevisionAnchor: Codable, Sendable, Hashable {
|
||
/// Body paragraph, identified by its zero-based index in the document flow.
|
||
case paragraph(index: UInt)
|
||
/// Cell inside a table.
|
||
case tableCell(row: UInt, col: UInt, tableIndex: UInt)
|
||
/// Page, identified by its zero-based index.
|
||
case page(index: UInt)
|
||
/// Presentation slide, identified by its zero-based index.
|
||
case slide(index: UInt)
|
||
/// Spreadsheet cell or range, identified by sheet index and optional name.
|
||
case sheet(index: UInt, name: String?)
|
||
|
||
private enum CodingKeys: String, CodingKey {
|
||
case type
|
||
case col
|
||
case index
|
||
case name
|
||
case row
|
||
case tableIndex = "table_index"
|
||
}
|
||
|
||
public init(from decoder: Decoder) throws {
|
||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
||
let type = try container.decode(String.self, forKey: .type)
|
||
switch type {
|
||
case "paragraph":
|
||
self = .paragraph(index: try container.decode(UInt.self, forKey: .index))
|
||
case "table_cell":
|
||
self = .tableCell(row: try container.decode(UInt.self, forKey: .row), col: try container.decode(UInt.self, forKey: .col), tableIndex: try container.decode(UInt.self, forKey: .tableIndex))
|
||
case "page":
|
||
self = .page(index: try container.decode(UInt.self, forKey: .index))
|
||
case "slide":
|
||
self = .slide(index: try container.decode(UInt.self, forKey: .index))
|
||
case "sheet":
|
||
self = .sheet(index: try container.decode(UInt.self, forKey: .index), name: try container.decodeIfPresent(String.self, forKey: .name))
|
||
default:
|
||
throw DecodingError.dataCorruptedError(
|
||
forKey: .type,
|
||
in: container,
|
||
debugDescription: "Unknown RevisionAnchor type: \(type)"
|
||
)
|
||
}
|
||
}
|
||
|
||
public func encode(to encoder: Encoder) throws {
|
||
var container = encoder.container(keyedBy: CodingKeys.self)
|
||
switch self {
|
||
case .paragraph(let index):
|
||
try container.encode("paragraph", forKey: .type)
|
||
try container.encode(index, forKey: .index)
|
||
case .tableCell(let row, let col, let tableIndex):
|
||
try container.encode("table_cell", forKey: .type)
|
||
try container.encode(row, forKey: .row)
|
||
try container.encode(col, forKey: .col)
|
||
try container.encode(tableIndex, forKey: .tableIndex)
|
||
case .page(let index):
|
||
try container.encode("page", forKey: .type)
|
||
try container.encode(index, forKey: .index)
|
||
case .slide(let index):
|
||
try container.encode("slide", forKey: .type)
|
||
try container.encode(index, forKey: .index)
|
||
case .sheet(let index, let name):
|
||
try container.encode("sheet", forKey: .type)
|
||
try container.encode(index, forKey: .index)
|
||
try container.encodeIfPresent(name, forKey: .name)
|
||
}
|
||
}
|
||
}
|
||
extension RevisionAnchor {
|
||
func intoRust() throws -> RustBridge.RevisionAnchor {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.revisionAnchorFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Semantic classification of an extracted URI.
|
||
public enum UriKind: String, Codable, Sendable, Hashable {
|
||
/// A clickable hyperlink (web URL, file link).
|
||
case hyperlink
|
||
/// An image or media resource reference.
|
||
case image
|
||
/// An internal anchor or cross-reference target.
|
||
case anchor
|
||
/// A citation or bibliographic reference (DOI, academic ref).
|
||
case citation
|
||
/// A general reference (e.g. `\ref{}` in LaTeX, `:ref:` in RST).
|
||
case reference
|
||
/// An email address (`mailto:` link or bare email).
|
||
case email
|
||
}
|
||
extension UriKind {
|
||
func intoRust() throws -> RustBridge.UriKind {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.uriKindFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Keyword algorithm selection.
|
||
public enum KeywordAlgorithm: String, Codable, Sendable, Hashable {
|
||
/// YAKE (Yet Another Keyword Extractor) - statistical approach
|
||
case yake
|
||
/// RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based
|
||
case rake
|
||
}
|
||
extension KeywordAlgorithm {
|
||
func intoRust() throws -> RustBridge.KeywordAlgorithm {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.keywordAlgorithmFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Page Segmentation Mode for Tesseract OCR
|
||
public enum PSMMode: String, Codable, Sendable, Hashable {
|
||
case osdOnly = "OsdOnly"
|
||
case autoOsd = "AutoOsd"
|
||
case autoOnly = "AutoOnly"
|
||
case auto = "Auto"
|
||
case singleColumn = "SingleColumn"
|
||
case singleBlockVertical = "SingleBlockVertical"
|
||
case singleBlock = "SingleBlock"
|
||
case singleLine = "SingleLine"
|
||
case singleWord = "SingleWord"
|
||
case circleWord = "CircleWord"
|
||
case singleChar = "SingleChar"
|
||
}
|
||
extension PSMMode {
|
||
func intoRust() throws -> RustBridge.PSMMode {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.psmModeFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Supported languages in PaddleOCR.
|
||
///
|
||
/// Maps user-friendly language codes to paddle-ocr-rs language identifiers.
|
||
public enum PaddleLanguage: String, Codable, Sendable, Hashable {
|
||
/// English
|
||
case english = "English"
|
||
/// Simplified Chinese
|
||
case chinese = "Chinese"
|
||
/// Japanese
|
||
case japanese = "Japanese"
|
||
/// Korean
|
||
case korean = "Korean"
|
||
/// German
|
||
case german = "German"
|
||
/// French
|
||
case french = "French"
|
||
/// Latin script (covers most European languages)
|
||
case latin = "Latin"
|
||
/// Cyrillic (Russian and related)
|
||
case cyrillic = "Cyrillic"
|
||
/// Traditional Chinese
|
||
case traditionalChinese = "TraditionalChinese"
|
||
/// Thai
|
||
case thai = "Thai"
|
||
/// Greek
|
||
case greek = "Greek"
|
||
/// East Slavic (Russian, Ukrainian, Belarusian)
|
||
case eastSlavic = "EastSlavic"
|
||
/// Arabic (Arabic, Persian, Urdu)
|
||
case arabic = "Arabic"
|
||
/// Devanagari (Hindi, Marathi, Sanskrit, Nepali)
|
||
case devanagari = "Devanagari"
|
||
/// Tamil
|
||
case tamil = "Tamil"
|
||
/// Telugu
|
||
case telugu = "Telugu"
|
||
}
|
||
extension PaddleLanguage {
|
||
func intoRust() throws -> RustBridge.PaddleLanguage {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.paddleLanguageFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// The 17 canonical document layout classes.
|
||
///
|
||
/// All model backends (RT-DETR, YOLO, etc.) map their native class IDs
|
||
/// to this shared set. Models with fewer classes (DocLayNet: 11, PubLayNet: 5)
|
||
/// map to the closest equivalent.
|
||
///
|
||
/// Wire format is snake_case in all serializers (JSON, TOML, YAML).
|
||
public enum LayoutClass: String, Codable, Sendable, Hashable {
|
||
case caption
|
||
case footnote
|
||
case formula
|
||
case listItem = "list_item"
|
||
case pageFooter = "page_footer"
|
||
case pageHeader = "page_header"
|
||
case picture
|
||
case sectionHeader = "section_header"
|
||
case table
|
||
case text
|
||
case title
|
||
case documentIndex = "document_index"
|
||
case code
|
||
case checkboxSelected = "checkbox_selected"
|
||
case checkboxUnselected = "checkbox_unselected"
|
||
case form
|
||
case keyValueRegion = "key_value_region"
|
||
}
|
||
extension LayoutClass {
|
||
func intoRust() throws -> RustBridge.LayoutClass {
|
||
let data = try JSONEncoder().encode(self)
|
||
let json = String(data: data, encoding: .utf8) ?? "null"
|
||
return try RustBridge.layoutClassFromJson(json)
|
||
}
|
||
}
|
||
|
||
/// Main error type for all Kreuzberg operations.
|
||
///
|
||
/// All errors in Kreuzberg use this enum, which preserves error chains
|
||
/// and provides context for debugging.
|
||
///
|
||
/// # Variants
|
||
///
|
||
/// - `Io` - File system and I/O errors (always bubble up)
|
||
/// - `Parsing` - Document parsing errors (corrupt files, unsupported features)
|
||
/// - `Ocr` - OCR processing errors
|
||
/// - `Validation` - Input validation errors (invalid paths, config, parameters)
|
||
/// - `Cache` - Cache operation errors (non-fatal, can be ignored)
|
||
/// - `ImageProcessing` - Image manipulation errors
|
||
/// - `Serialization` - JSON/MessagePack serialization errors
|
||
/// - `MissingDependency` - Missing optional dependencies (tesseract, etc.)
|
||
/// - `Plugin` - Plugin-specific errors
|
||
/// - `LockPoisoned` - Mutex/RwLock poisoning (should not happen in normal operation)
|
||
/// - `UnsupportedFormat` - Unsupported MIME type or file format
|
||
/// - `Other` - Catch-all for uncommon errors
|
||
public enum KreuzbergError: Swift.Error {
|
||
case io(message: String, field0: String)
|
||
case parsing(message: String)
|
||
case ocr(message: String)
|
||
case validation(message: String)
|
||
case cache(message: String)
|
||
case imageProcessing(message: String)
|
||
case serialization(message: String)
|
||
case missingDependency(message: String, field0: String)
|
||
case plugin(message: String, pluginName: String)
|
||
case lockPoisoned(message: String, field0: String)
|
||
case unsupportedFormat(message: String, field0: String)
|
||
case embedding(message: String)
|
||
case timeout(message: String, elapsedMs: UInt64, limitMs: UInt64)
|
||
case cancelled
|
||
case security(message: String)
|
||
case other(message: String, field0: String)
|
||
}
|
||
|
||
// MARK: - Convenience Wrapper Functions
|
||
// These wrappers bridge String / [UInt8] inputs to RustBridge's
|
||
// RustVec<UInt8> requirement. The config parameter must be a fully
|
||
// constructed opaque type (built via the generated initializer);
|
||
// JSON-config decoding is not available because swift-bridge opaque
|
||
// proxy classes are not Codable Swift structs.
|
||
|
||
/// Converts a Swift `[UInt8]` array to a `RustVec<UInt8>` by pushing each byte.
|
||
/// swift-bridge's `RustVec<T>` runtime only exposes `init()` and `push(value:)`;
|
||
/// no array-initializer shorthand exists.
|
||
private func makeByteVec(_ bytes: [UInt8]) -> RustVec<UInt8> {
|
||
let vec = RustVec<UInt8>()
|
||
for b in bytes { vec.push(value: b) }
|
||
return vec
|
||
}
|
||
|
||
/// Convenience overload: accepts a UTF-8 `String` and converts it to bytes.
|
||
public func extractBytes(
|
||
content: String,
|
||
mimeType: String
|
||
,
|
||
config: ExtractionConfig
|
||
|
||
) throws -> ExtractionResult {
|
||
return try extractBytesSync(makeByteVec(Array(content.utf8)), mimeType
|
||
, config
|
||
)
|
||
}
|
||
/// Convenience overload: accepts a `[UInt8]` byte array.
|
||
public func extractBytes(
|
||
content: [UInt8],
|
||
mimeType: String
|
||
,
|
||
config: ExtractionConfig
|
||
|
||
) throws -> ExtractionResult {
|
||
return try extractBytesSync(makeByteVec(content), mimeType
|
||
, config
|
||
)
|
||
}
|
||
/// Convenience overload: accepts a file path as a `String`.
|
||
public func extractFile(
|
||
path: String,
|
||
mimeType: String? = nil
|
||
,
|
||
config: ExtractionConfig
|
||
|
||
) throws -> ExtractionResult {
|
||
return try extractFileSync(path, mimeType
|
||
, config
|
||
)
|
||
}
|
||
// MARK: - JSON-String Convenience Overloads
|
||
// These overloads accept JSON-encoded config parameters and decode them automatically.
|
||
// Enables e2e tests to pass JSON strings directly without typed config construction.
|
||
|
||
/// Resolves a string argument as either a file path or literal UTF-8 content.
|
||
/// Searches: current working directory, ALEF_TEST_DOCUMENTS_DIR env var,
|
||
/// and ancestor `test_documents/` or `fixtures/` directories (up to 16 levels).
|
||
/// If no file is found, treats the string as UTF-8 content and returns its bytes.
|
||
private func _loadBytesFromPathOrUtf8(_ pathOrContent: String) throws -> [UInt8] {
|
||
let fm = FileManager.default
|
||
var roots: [String] = [fm.currentDirectoryPath]
|
||
if let envRoot = ProcessInfo.processInfo.environment["ALEF_TEST_DOCUMENTS_DIR"] {
|
||
roots.append(envRoot)
|
||
}
|
||
var walker = URL(fileURLWithPath: fm.currentDirectoryPath)
|
||
for _ in 0..<16 {
|
||
roots.append(walker.appendingPathComponent("test_documents").path)
|
||
roots.append(walker.appendingPathComponent("fixtures").path)
|
||
let parent = walker.deletingLastPathComponent()
|
||
if parent.path == walker.path { break }
|
||
walker = parent
|
||
}
|
||
let candidates = [pathOrContent] + roots.map { ($0 as NSString).appendingPathComponent(pathOrContent) }
|
||
for path in candidates {
|
||
if fm.fileExists(atPath: path), let data = try? Data(contentsOf: URL(fileURLWithPath: path)) {
|
||
return [UInt8](data)
|
||
}
|
||
}
|
||
return [UInt8](pathOrContent.utf8)
|
||
}
|
||
|
||
public func extractBytes(_ content: [UInt8], _ mimeType: String, _ configJson: String) async throws -> ExtractionResult {
|
||
let config = try extractionConfigFromJson(configJson)
|
||
return try await extractBytes(content: content, mimeType: mimeType, config: config)
|
||
}
|
||
|
||
public func extractFile(_ path: String, _ mimeType: String?, _ configJson: String) async throws -> ExtractionResult {
|
||
let config = try extractionConfigFromJson(configJson)
|
||
return try await extractFile(path: path, mimeType: mimeType, config: config)
|
||
}
|
||
|
||
public func extractFileSync(_ path: String, _ mimeType: String?, _ configJson: String) throws -> ExtractionResult {
|
||
let config = try extractionConfigFromJson(configJson)
|
||
return try extractFileSync(path: path, mimeType: mimeType, config: config)
|
||
}
|
||
|
||
public func extractBytesSync(_ content: [UInt8], _ mimeType: String, _ configJson: String) throws -> ExtractionResult {
|
||
let config = try extractionConfigFromJson(configJson)
|
||
return try extractBytesSync(content: content, mimeType: mimeType, config: config)
|
||
}
|
||
|
||
public func batchExtractFilesSync(_ items: [BatchFileItem], _ configJson: String) throws -> [ExtractionResult] {
|
||
let config = try extractionConfigFromJson(configJson)
|
||
return try batchExtractFilesSync(items: items, config: config)
|
||
}
|
||
|
||
public func batchExtractBytesSync(_ items: [BatchBytesItem], _ configJson: String) throws -> [ExtractionResult] {
|
||
let config = try extractionConfigFromJson(configJson)
|
||
return try batchExtractBytesSync(items: items, config: config)
|
||
}
|
||
|
||
public func batchExtractFiles(_ items: [BatchFileItem], _ configJson: String) async throws -> [ExtractionResult] {
|
||
let config = try extractionConfigFromJson(configJson)
|
||
return try await batchExtractFiles(items: items, config: config)
|
||
}
|
||
|
||
public func batchExtractBytes(_ items: [BatchBytesItem], _ configJson: String) async throws -> [ExtractionResult] {
|
||
let config = try extractionConfigFromJson(configJson)
|
||
return try await batchExtractBytes(items: items, config: config)
|
||
}
|
||
|
||
public func compare(_ configJson: String, _ b: ExtractionResult, _ opts: DiffOptions) throws -> ExtractionDiff {
|
||
let config = try extractionResultFromJson(configJson)
|
||
return try compare(a: config, b: b, opts: opts)
|
||
}
|
||
|
||
public func compare(_ a: ExtractionResult, _ configJson: String, _ opts: DiffOptions) throws -> ExtractionDiff {
|
||
let config = try extractionResultFromJson(configJson)
|
||
return try compare(a: a, b: config, opts: opts)
|
||
}
|
||
|
||
public func compare(_ a: ExtractionResult, _ b: ExtractionResult, _ configJson: String) throws -> ExtractionDiff {
|
||
let config = try diffOptionsFromJson(configJson)
|
||
return try compare(a: a, b: b, opts: config)
|
||
}
|
||
|
||
public func embedTextsAsync(_ texts: [String], _ configJson: String) async throws -> [[Float]] {
|
||
let config = try embeddingConfigFromJson(configJson)
|
||
return try await embedTextsAsync(texts: texts, config: config)
|
||
}
|
||
|
||
public func embedTexts(_ texts: [String], _ configJson: String) throws -> [[Float]] {
|
||
let config = try embeddingConfigFromJson(configJson)
|
||
return try embedTexts(texts: texts, config: config)
|
||
}
|
||
|
||
// MARK: - From-JSON Helpers
|
||
// Public helpers that decode JSON into first-class Swift types.
|
||
// First-class struct types (Codable) use JSONDecoder directly.
|
||
// Opaque RustBridge types forward to RustBridge.
|
||
|
||
public func cacheStatsFromJson(_ json: String) throws -> CacheStats {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(CacheStats.self, from: data)
|
||
}
|
||
|
||
public func accelerationConfigFromJson(_ json: String) throws -> AccelerationConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(AccelerationConfig.self, from: data)
|
||
}
|
||
|
||
public func contentFilterConfigFromJson(_ json: String) throws -> ContentFilterConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ContentFilterConfig.self, from: data)
|
||
}
|
||
|
||
public func emailConfigFromJson(_ json: String) throws -> EmailConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(EmailConfig.self, from: data)
|
||
}
|
||
|
||
public func extractionConfigFromJson(_ json: String) throws -> ExtractionConfig {
|
||
return try RustBridge.extractionConfigFromJson(json)
|
||
}
|
||
|
||
public func fileExtractionConfigFromJson(_ json: String) throws -> FileExtractionConfig {
|
||
return try RustBridge.fileExtractionConfigFromJson(json)
|
||
}
|
||
|
||
public func batchBytesItemFromJson(_ json: String) throws -> BatchBytesItem {
|
||
return try RustBridge.batchBytesItemFromJson(json)
|
||
}
|
||
|
||
public func batchFileItemFromJson(_ json: String) throws -> BatchFileItem {
|
||
return try RustBridge.batchFileItemFromJson(json)
|
||
}
|
||
|
||
public func imageExtractionConfigFromJson(_ json: String) throws -> ImageExtractionConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ImageExtractionConfig.self, from: data)
|
||
}
|
||
|
||
public func tokenReductionOptionsFromJson(_ json: String) throws -> TokenReductionOptions {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TokenReductionOptions.self, from: data)
|
||
}
|
||
|
||
public func languageDetectionConfigFromJson(_ json: String) throws -> LanguageDetectionConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(LanguageDetectionConfig.self, from: data)
|
||
}
|
||
|
||
public func htmlOutputConfigFromJson(_ json: String) throws -> HtmlOutputConfig {
|
||
return try RustBridge.htmlOutputConfigFromJson(json)
|
||
}
|
||
|
||
public func layoutDetectionConfigFromJson(_ json: String) throws -> LayoutDetectionConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(LayoutDetectionConfig.self, from: data)
|
||
}
|
||
|
||
public func llmConfigFromJson(_ json: String) throws -> LlmConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(LlmConfig.self, from: data)
|
||
}
|
||
|
||
public func structuredExtractionConfigFromJson(_ json: String) throws -> StructuredExtractionConfig {
|
||
return try RustBridge.structuredExtractionConfigFromJson(json)
|
||
}
|
||
|
||
public func ocrQualityThresholdsFromJson(_ json: String) throws -> OcrQualityThresholds {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OcrQualityThresholds.self, from: data)
|
||
}
|
||
|
||
public func ocrPipelineStageFromJson(_ json: String) throws -> OcrPipelineStage {
|
||
return try RustBridge.ocrPipelineStageFromJson(json)
|
||
}
|
||
|
||
public func ocrPipelineConfigFromJson(_ json: String) throws -> OcrPipelineConfig {
|
||
return try RustBridge.ocrPipelineConfigFromJson(json)
|
||
}
|
||
|
||
public func ocrConfigFromJson(_ json: String) throws -> OcrConfig {
|
||
return try RustBridge.ocrConfigFromJson(json)
|
||
}
|
||
|
||
public func pageConfigFromJson(_ json: String) throws -> PageConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PageConfig.self, from: data)
|
||
}
|
||
|
||
public func pdfConfigFromJson(_ json: String) throws -> PdfConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PdfConfig.self, from: data)
|
||
}
|
||
|
||
public func hierarchyConfigFromJson(_ json: String) throws -> HierarchyConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(HierarchyConfig.self, from: data)
|
||
}
|
||
|
||
public func postProcessorConfigFromJson(_ json: String) throws -> PostProcessorConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PostProcessorConfig.self, from: data)
|
||
}
|
||
|
||
public func chunkingConfigFromJson(_ json: String) throws -> ChunkingConfig {
|
||
return try RustBridge.chunkingConfigFromJson(json)
|
||
}
|
||
|
||
public func embeddingConfigFromJson(_ json: String) throws -> EmbeddingConfig {
|
||
return try RustBridge.embeddingConfigFromJson(json)
|
||
}
|
||
|
||
public func treeSitterConfigFromJson(_ json: String) throws -> TreeSitterConfig {
|
||
return try RustBridge.treeSitterConfigFromJson(json)
|
||
}
|
||
|
||
public func treeSitterProcessConfigFromJson(_ json: String) throws -> TreeSitterProcessConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TreeSitterProcessConfig.self, from: data)
|
||
}
|
||
|
||
public func supportedFormatFromJson(_ json: String) throws -> SupportedFormat {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(SupportedFormat.self, from: data)
|
||
}
|
||
|
||
public func serverConfigFromJson(_ json: String) throws -> ServerConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ServerConfig.self, from: data)
|
||
}
|
||
|
||
public func structuredDataResultFromJson(_ json: String) throws -> StructuredDataResult {
|
||
return try RustBridge.structuredDataResultFromJson(json)
|
||
}
|
||
|
||
public func docxAppPropertiesFromJson(_ json: String) throws -> DocxAppProperties {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DocxAppProperties.self, from: data)
|
||
}
|
||
|
||
public func xlsxAppPropertiesFromJson(_ json: String) throws -> XlsxAppProperties {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(XlsxAppProperties.self, from: data)
|
||
}
|
||
|
||
public func pptxAppPropertiesFromJson(_ json: String) throws -> PptxAppProperties {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PptxAppProperties.self, from: data)
|
||
}
|
||
|
||
public func corePropertiesFromJson(_ json: String) throws -> CoreProperties {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(CoreProperties.self, from: data)
|
||
}
|
||
|
||
public func securityLimitsFromJson(_ json: String) throws -> SecurityLimits {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(SecurityLimits.self, from: data)
|
||
}
|
||
|
||
public func tokenReductionConfigFromJson(_ json: String) throws -> TokenReductionConfig {
|
||
return try RustBridge.tokenReductionConfigFromJson(json)
|
||
}
|
||
|
||
public func pdfAnnotationFromJson(_ json: String) throws -> PdfAnnotation {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PdfAnnotation.self, from: data)
|
||
}
|
||
|
||
public func djotContentFromJson(_ json: String) throws -> DjotContent {
|
||
return try RustBridge.djotContentFromJson(json)
|
||
}
|
||
|
||
public func formattedBlockFromJson(_ json: String) throws -> FormattedBlock {
|
||
return try RustBridge.formattedBlockFromJson(json)
|
||
}
|
||
|
||
public func inlineElementFromJson(_ json: String) throws -> InlineElement {
|
||
return try RustBridge.inlineElementFromJson(json)
|
||
}
|
||
|
||
public func djotImageFromJson(_ json: String) throws -> DjotImage {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DjotImage.self, from: data)
|
||
}
|
||
|
||
public func djotLinkFromJson(_ json: String) throws -> DjotLink {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DjotLink.self, from: data)
|
||
}
|
||
|
||
public func footnoteFromJson(_ json: String) throws -> Footnote {
|
||
return try RustBridge.footnoteFromJson(json)
|
||
}
|
||
|
||
public func documentStructureFromJson(_ json: String) throws -> DocumentStructure {
|
||
return try RustBridge.documentStructureFromJson(json)
|
||
}
|
||
|
||
public func documentRelationshipFromJson(_ json: String) throws -> DocumentRelationship {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DocumentRelationship.self, from: data)
|
||
}
|
||
|
||
public func documentNodeFromJson(_ json: String) throws -> DocumentNode {
|
||
return try RustBridge.documentNodeFromJson(json)
|
||
}
|
||
|
||
public func tableGridFromJson(_ json: String) throws -> TableGrid {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TableGrid.self, from: data)
|
||
}
|
||
|
||
public func gridCellFromJson(_ json: String) throws -> GridCell {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(GridCell.self, from: data)
|
||
}
|
||
|
||
public func textAnnotationFromJson(_ json: String) throws -> TextAnnotation {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TextAnnotation.self, from: data)
|
||
}
|
||
|
||
public func extractionResultFromJson(_ json: String) throws -> ExtractionResult {
|
||
return try RustBridge.extractionResultFromJson(json)
|
||
}
|
||
|
||
public func archiveEntryFromJson(_ json: String) throws -> ArchiveEntry {
|
||
return try RustBridge.archiveEntryFromJson(json)
|
||
}
|
||
|
||
public func processingWarningFromJson(_ json: String) throws -> ProcessingWarning {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ProcessingWarning.self, from: data)
|
||
}
|
||
|
||
public func llmUsageFromJson(_ json: String) throws -> LlmUsage {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(LlmUsage.self, from: data)
|
||
}
|
||
|
||
public func chunkFromJson(_ json: String) throws -> Chunk {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(Chunk.self, from: data)
|
||
}
|
||
|
||
public func headingContextFromJson(_ json: String) throws -> HeadingContext {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(HeadingContext.self, from: data)
|
||
}
|
||
|
||
public func headingLevelFromJson(_ json: String) throws -> HeadingLevel {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(HeadingLevel.self, from: data)
|
||
}
|
||
|
||
public func chunkMetadataFromJson(_ json: String) throws -> ChunkMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ChunkMetadata.self, from: data)
|
||
}
|
||
|
||
public func extractedImageFromJson(_ json: String) throws -> ExtractedImage {
|
||
return try RustBridge.extractedImageFromJson(json)
|
||
}
|
||
|
||
public func boundingBoxFromJson(_ json: String) throws -> BoundingBox {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(BoundingBox.self, from: data)
|
||
}
|
||
|
||
public func elementMetadataFromJson(_ json: String) throws -> ElementMetadata {
|
||
return try RustBridge.elementMetadataFromJson(json)
|
||
}
|
||
|
||
public func elementFromJson(_ json: String) throws -> Element {
|
||
return try RustBridge.elementFromJson(json)
|
||
}
|
||
|
||
public func excelWorkbookFromJson(_ json: String) throws -> ExcelWorkbook {
|
||
return try RustBridge.excelWorkbookFromJson(json)
|
||
}
|
||
|
||
public func excelSheetFromJson(_ json: String) throws -> ExcelSheet {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ExcelSheet.self, from: data)
|
||
}
|
||
|
||
public func xmlExtractionResultFromJson(_ json: String) throws -> XmlExtractionResult {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(XmlExtractionResult.self, from: data)
|
||
}
|
||
|
||
public func textExtractionResultFromJson(_ json: String) throws -> TextExtractionResult {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TextExtractionResult.self, from: data)
|
||
}
|
||
|
||
public func pptxExtractionResultFromJson(_ json: String) throws -> PptxExtractionResult {
|
||
return try RustBridge.pptxExtractionResultFromJson(json)
|
||
}
|
||
|
||
public func emailExtractionResultFromJson(_ json: String) throws -> EmailExtractionResult {
|
||
return try RustBridge.emailExtractionResultFromJson(json)
|
||
}
|
||
|
||
public func emailAttachmentFromJson(_ json: String) throws -> EmailAttachment {
|
||
return try RustBridge.emailAttachmentFromJson(json)
|
||
}
|
||
|
||
public func ocrExtractionResultFromJson(_ json: String) throws -> OcrExtractionResult {
|
||
return try RustBridge.ocrExtractionResultFromJson(json)
|
||
}
|
||
|
||
public func ocrTableFromJson(_ json: String) throws -> OcrTable {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OcrTable.self, from: data)
|
||
}
|
||
|
||
public func ocrTableBoundingBoxFromJson(_ json: String) throws -> OcrTableBoundingBox {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OcrTableBoundingBox.self, from: data)
|
||
}
|
||
|
||
public func imagePreprocessingConfigFromJson(_ json: String) throws -> ImagePreprocessingConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ImagePreprocessingConfig.self, from: data)
|
||
}
|
||
|
||
public func tesseractConfigFromJson(_ json: String) throws -> TesseractConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TesseractConfig.self, from: data)
|
||
}
|
||
|
||
public func imagePreprocessingMetadataFromJson(_ json: String) throws -> ImagePreprocessingMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ImagePreprocessingMetadata.self, from: data)
|
||
}
|
||
|
||
public func metadataFromJson(_ json: String) throws -> Metadata {
|
||
return try RustBridge.metadataFromJson(json)
|
||
}
|
||
|
||
public func excelMetadataFromJson(_ json: String) throws -> ExcelMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ExcelMetadata.self, from: data)
|
||
}
|
||
|
||
public func emailMetadataFromJson(_ json: String) throws -> EmailMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(EmailMetadata.self, from: data)
|
||
}
|
||
|
||
public func archiveMetadataFromJson(_ json: String) throws -> ArchiveMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ArchiveMetadata.self, from: data)
|
||
}
|
||
|
||
public func imageMetadataFromJson(_ json: String) throws -> ImageMetadata {
|
||
return try RustBridge.imageMetadataFromJson(json)
|
||
}
|
||
|
||
public func xmlMetadataFromJson(_ json: String) throws -> XmlMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(XmlMetadata.self, from: data)
|
||
}
|
||
|
||
public func textMetadataFromJson(_ json: String) throws -> TextMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TextMetadata.self, from: data)
|
||
}
|
||
|
||
public func headerMetadataFromJson(_ json: String) throws -> HeaderMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(HeaderMetadata.self, from: data)
|
||
}
|
||
|
||
public func linkMetadataFromJson(_ json: String) throws -> LinkMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(LinkMetadata.self, from: data)
|
||
}
|
||
|
||
public func imageMetadataTypeFromJson(_ json: String) throws -> ImageMetadataType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ImageMetadataType.self, from: data)
|
||
}
|
||
|
||
public func structuredDataFromJson(_ json: String) throws -> StructuredData {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(StructuredData.self, from: data)
|
||
}
|
||
|
||
public func htmlMetadataFromJson(_ json: String) throws -> HtmlMetadata {
|
||
return try RustBridge.htmlMetadataFromJson(json)
|
||
}
|
||
|
||
public func ocrMetadataFromJson(_ json: String) throws -> OcrMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OcrMetadata.self, from: data)
|
||
}
|
||
|
||
public func errorMetadataFromJson(_ json: String) throws -> ErrorMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ErrorMetadata.self, from: data)
|
||
}
|
||
|
||
public func pptxMetadataFromJson(_ json: String) throws -> PptxMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PptxMetadata.self, from: data)
|
||
}
|
||
|
||
public func docxMetadataFromJson(_ json: String) throws -> DocxMetadata {
|
||
return try RustBridge.docxMetadataFromJson(json)
|
||
}
|
||
|
||
public func csvMetadataFromJson(_ json: String) throws -> CsvMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(CsvMetadata.self, from: data)
|
||
}
|
||
|
||
public func bibtexMetadataFromJson(_ json: String) throws -> BibtexMetadata {
|
||
return try RustBridge.bibtexMetadataFromJson(json)
|
||
}
|
||
|
||
public func citationMetadataFromJson(_ json: String) throws -> CitationMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(CitationMetadata.self, from: data)
|
||
}
|
||
|
||
public func yearRangeFromJson(_ json: String) throws -> YearRange {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(YearRange.self, from: data)
|
||
}
|
||
|
||
public func fictionBookMetadataFromJson(_ json: String) throws -> FictionBookMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(FictionBookMetadata.self, from: data)
|
||
}
|
||
|
||
public func dbfMetadataFromJson(_ json: String) throws -> DbfMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DbfMetadata.self, from: data)
|
||
}
|
||
|
||
public func dbfFieldInfoFromJson(_ json: String) throws -> DbfFieldInfo {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DbfFieldInfo.self, from: data)
|
||
}
|
||
|
||
public func jatsMetadataFromJson(_ json: String) throws -> JatsMetadata {
|
||
return try RustBridge.jatsMetadataFromJson(json)
|
||
}
|
||
|
||
public func contributorRoleFromJson(_ json: String) throws -> ContributorRole {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ContributorRole.self, from: data)
|
||
}
|
||
|
||
public func epubMetadataFromJson(_ json: String) throws -> EpubMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(EpubMetadata.self, from: data)
|
||
}
|
||
|
||
public func pstMetadataFromJson(_ json: String) throws -> PstMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PstMetadata.self, from: data)
|
||
}
|
||
|
||
public func ocrConfidenceFromJson(_ json: String) throws -> OcrConfidence {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OcrConfidence.self, from: data)
|
||
}
|
||
|
||
public func ocrRotationFromJson(_ json: String) throws -> OcrRotation {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OcrRotation.self, from: data)
|
||
}
|
||
|
||
public func ocrElementFromJson(_ json: String) throws -> OcrElement {
|
||
return try RustBridge.ocrElementFromJson(json)
|
||
}
|
||
|
||
public func ocrElementConfigFromJson(_ json: String) throws -> OcrElementConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OcrElementConfig.self, from: data)
|
||
}
|
||
|
||
public func pageStructureFromJson(_ json: String) throws -> PageStructure {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PageStructure.self, from: data)
|
||
}
|
||
|
||
public func pageBoundaryFromJson(_ json: String) throws -> PageBoundary {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PageBoundary.self, from: data)
|
||
}
|
||
|
||
public func pageInfoFromJson(_ json: String) throws -> PageInfo {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PageInfo.self, from: data)
|
||
}
|
||
|
||
public func pageContentFromJson(_ json: String) throws -> PageContent {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PageContent.self, from: data)
|
||
}
|
||
|
||
public func layoutRegionFromJson(_ json: String) throws -> LayoutRegion {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(LayoutRegion.self, from: data)
|
||
}
|
||
|
||
public func pageHierarchyFromJson(_ json: String) throws -> PageHierarchy {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PageHierarchy.self, from: data)
|
||
}
|
||
|
||
public func hierarchicalBlockFromJson(_ json: String) throws -> HierarchicalBlock {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(HierarchicalBlock.self, from: data)
|
||
}
|
||
|
||
public func cellChangeFromJson(_ json: String) throws -> CellChange {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(CellChange.self, from: data)
|
||
}
|
||
|
||
public func documentRevisionFromJson(_ json: String) throws -> DocumentRevision {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DocumentRevision.self, from: data)
|
||
}
|
||
|
||
public func revisionDeltaFromJson(_ json: String) throws -> RevisionDelta {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(RevisionDelta.self, from: data)
|
||
}
|
||
|
||
public func tableFromJson(_ json: String) throws -> Table {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(Table.self, from: data)
|
||
}
|
||
|
||
public func tableCellFromJson(_ json: String) throws -> TableCell {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TableCell.self, from: data)
|
||
}
|
||
|
||
public func extractedUriFromJson(_ json: String) throws -> ExtractedUri {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ExtractedUri.self, from: data)
|
||
}
|
||
|
||
public func detectResponseFromJson(_ json: String) throws -> DetectResponse {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DetectResponse.self, from: data)
|
||
}
|
||
|
||
public func diffOptionsFromJson(_ json: String) throws -> DiffOptions {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DiffOptions.self, from: data)
|
||
}
|
||
|
||
public func extractionDiffFromJson(_ json: String) throws -> ExtractionDiff {
|
||
return try RustBridge.extractionDiffFromJson(json)
|
||
}
|
||
|
||
public func diffHunkFromJson(_ json: String) throws -> DiffHunk {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DiffHunk.self, from: data)
|
||
}
|
||
|
||
public func tableDiffFromJson(_ json: String) throws -> TableDiff {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TableDiff.self, from: data)
|
||
}
|
||
|
||
public func embeddedChangesFromJson(_ json: String) throws -> EmbeddedChanges {
|
||
return try RustBridge.embeddedChangesFromJson(json)
|
||
}
|
||
|
||
public func embeddedDiffFromJson(_ json: String) throws -> EmbeddedDiff {
|
||
return try RustBridge.embeddedDiffFromJson(json)
|
||
}
|
||
|
||
public func embeddingPresetFromJson(_ json: String) throws -> EmbeddingPreset {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(EmbeddingPreset.self, from: data)
|
||
}
|
||
|
||
public func yakeParamsFromJson(_ json: String) throws -> YakeParams {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(YakeParams.self, from: data)
|
||
}
|
||
|
||
public func rakeParamsFromJson(_ json: String) throws -> RakeParams {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(RakeParams.self, from: data)
|
||
}
|
||
|
||
public func keywordConfigFromJson(_ json: String) throws -> KeywordConfig {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(KeywordConfig.self, from: data)
|
||
}
|
||
|
||
public func keywordFromJson(_ json: String) throws -> Keyword {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(Keyword.self, from: data)
|
||
}
|
||
|
||
public func paddleOcrConfigFromJson(_ json: String) throws -> PaddleOcrConfig {
|
||
return try RustBridge.paddleOcrConfigFromJson(json)
|
||
}
|
||
|
||
public func modelPathsFromJson(_ json: String) throws -> ModelPaths {
|
||
return try RustBridge.modelPathsFromJson(json)
|
||
}
|
||
|
||
public func orientationResultFromJson(_ json: String) throws -> OrientationResult {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OrientationResult.self, from: data)
|
||
}
|
||
|
||
public func bBoxFromJson(_ json: String) throws -> BBox {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(BBox.self, from: data)
|
||
}
|
||
|
||
public func layoutDetectionFromJson(_ json: String) throws -> LayoutDetection {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(LayoutDetection.self, from: data)
|
||
}
|
||
|
||
public func recognizedTableFromJson(_ json: String) throws -> RecognizedTable {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(RecognizedTable.self, from: data)
|
||
}
|
||
|
||
public func detectionResultFromJson(_ json: String) throws -> DetectionResult {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DetectionResult.self, from: data)
|
||
}
|
||
|
||
public func embeddedFileFromJson(_ json: String) throws -> EmbeddedFile {
|
||
return try RustBridge.embeddedFileFromJson(json)
|
||
}
|
||
|
||
public func pdfMetadataFromJson(_ json: String) throws -> PdfMetadata {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PdfMetadata.self, from: data)
|
||
}
|
||
|
||
public func executionProviderTypeFromJson(_ json: String) throws -> ExecutionProviderType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ExecutionProviderType.self, from: data)
|
||
}
|
||
|
||
public func outputFormatFromJson(_ json: String) throws -> OutputFormat {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OutputFormat.self, from: data)
|
||
}
|
||
|
||
public func htmlThemeFromJson(_ json: String) throws -> HtmlTheme {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(HtmlTheme.self, from: data)
|
||
}
|
||
|
||
public func tableModelFromJson(_ json: String) throws -> TableModel {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TableModel.self, from: data)
|
||
}
|
||
|
||
public func chunkerTypeFromJson(_ json: String) throws -> ChunkerType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ChunkerType.self, from: data)
|
||
}
|
||
|
||
public func chunkSizingFromJson(_ json: String) throws -> ChunkSizing {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ChunkSizing.self, from: data)
|
||
}
|
||
|
||
public func embeddingModelTypeFromJson(_ json: String) throws -> EmbeddingModelType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(EmbeddingModelType.self, from: data)
|
||
}
|
||
|
||
public func codeContentModeFromJson(_ json: String) throws -> CodeContentMode {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(CodeContentMode.self, from: data)
|
||
}
|
||
|
||
public func ocrBackendTypeFromJson(_ json: String) throws -> OcrBackendType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OcrBackendType.self, from: data)
|
||
}
|
||
|
||
public func processingStageFromJson(_ json: String) throws -> ProcessingStage {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ProcessingStage.self, from: data)
|
||
}
|
||
|
||
public func reductionLevelFromJson(_ json: String) throws -> ReductionLevel {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ReductionLevel.self, from: data)
|
||
}
|
||
|
||
public func pdfAnnotationTypeFromJson(_ json: String) throws -> PdfAnnotationType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PdfAnnotationType.self, from: data)
|
||
}
|
||
|
||
public func blockTypeFromJson(_ json: String) throws -> BlockType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(BlockType.self, from: data)
|
||
}
|
||
|
||
public func inlineTypeFromJson(_ json: String) throws -> InlineType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(InlineType.self, from: data)
|
||
}
|
||
|
||
public func relationshipKindFromJson(_ json: String) throws -> RelationshipKind {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(RelationshipKind.self, from: data)
|
||
}
|
||
|
||
public func contentLayerFromJson(_ json: String) throws -> ContentLayer {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ContentLayer.self, from: data)
|
||
}
|
||
|
||
public func nodeContentFromJson(_ json: String) throws -> NodeContent {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(NodeContent.self, from: data)
|
||
}
|
||
|
||
public func annotationKindFromJson(_ json: String) throws -> AnnotationKind {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(AnnotationKind.self, from: data)
|
||
}
|
||
|
||
public func extractionMethodFromJson(_ json: String) throws -> ExtractionMethod {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ExtractionMethod.self, from: data)
|
||
}
|
||
|
||
public func chunkTypeFromJson(_ json: String) throws -> ChunkType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ChunkType.self, from: data)
|
||
}
|
||
|
||
public func imageKindFromJson(_ json: String) throws -> ImageKind {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ImageKind.self, from: data)
|
||
}
|
||
|
||
public func resultFormatFromJson(_ json: String) throws -> ResultFormat {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ResultFormat.self, from: data)
|
||
}
|
||
|
||
public func elementTypeFromJson(_ json: String) throws -> ElementType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ElementType.self, from: data)
|
||
}
|
||
|
||
public func formatMetadataFromJson(_ json: String) throws -> FormatMetadata {
|
||
return try RustBridge.formatMetadataFromJson(json)
|
||
}
|
||
|
||
public func textDirectionFromJson(_ json: String) throws -> TextDirection {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(TextDirection.self, from: data)
|
||
}
|
||
|
||
public func linkTypeFromJson(_ json: String) throws -> LinkType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(LinkType.self, from: data)
|
||
}
|
||
|
||
public func imageTypeFromJson(_ json: String) throws -> ImageType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(ImageType.self, from: data)
|
||
}
|
||
|
||
public func structuredDataTypeFromJson(_ json: String) throws -> StructuredDataType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(StructuredDataType.self, from: data)
|
||
}
|
||
|
||
public func ocrBoundingGeometryFromJson(_ json: String) throws -> OcrBoundingGeometry {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OcrBoundingGeometry.self, from: data)
|
||
}
|
||
|
||
public func ocrElementLevelFromJson(_ json: String) throws -> OcrElementLevel {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(OcrElementLevel.self, from: data)
|
||
}
|
||
|
||
public func pageUnitTypeFromJson(_ json: String) throws -> PageUnitType {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PageUnitType.self, from: data)
|
||
}
|
||
|
||
public func diffLineFromJson(_ json: String) throws -> DiffLine {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(DiffLine.self, from: data)
|
||
}
|
||
|
||
public func revisionKindFromJson(_ json: String) throws -> RevisionKind {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(RevisionKind.self, from: data)
|
||
}
|
||
|
||
public func revisionAnchorFromJson(_ json: String) throws -> RevisionAnchor {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(RevisionAnchor.self, from: data)
|
||
}
|
||
|
||
public func uriKindFromJson(_ json: String) throws -> UriKind {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(UriKind.self, from: data)
|
||
}
|
||
|
||
public func keywordAlgorithmFromJson(_ json: String) throws -> KeywordAlgorithm {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(KeywordAlgorithm.self, from: data)
|
||
}
|
||
|
||
public func psmModeFromJson(_ json: String) throws -> PSMMode {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PSMMode.self, from: data)
|
||
}
|
||
|
||
public func paddleLanguageFromJson(_ json: String) throws -> PaddleLanguage {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(PaddleLanguage.self, from: data)
|
||
}
|
||
|
||
public func layoutClassFromJson(_ json: String) throws -> LayoutClass {
|
||
let data = json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode(LayoutClass.self, from: data)
|
||
}
|
||
|
||
// MARK: - Free-function Forwarders
|
||
// Re-export every public free function on the source Rust crate as a
|
||
// top-level `public func` on the host module so consumers do not need to
|
||
// `import RustBridge` directly. Forwarders take Swift-native parameter
|
||
// types and convert to the swift-bridge runtime types internally.
|
||
|
||
/// Synchronous wrapper for `extract_file`.
|
||
///
|
||
/// This is a convenience function that blocks the current thread until extraction completes.
|
||
/// For async code, use `extract_file` directly.
|
||
///
|
||
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
||
/// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
|
||
///
|
||
/// This function is only available with the `tokio-runtime` feature. For WASM targets,
|
||
/// use a truly synchronous extraction approach instead.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust,no_run
|
||
/// use kreuzberg::core::extractor::extract_file_sync;
|
||
/// use kreuzberg::core::config::ExtractionConfig;
|
||
///
|
||
/// let config = ExtractionConfig::default();
|
||
/// let result = extract_file_sync("document.pdf", None, &config)?;
|
||
/// println!("Content: {}", result.content);
|
||
/// ```
|
||
public func extractFileSync(path: String, mimeType: String?, config: ExtractionConfig) throws -> ExtractionResult {
|
||
return try RustBridge.extractFileSync(path, mimeType, config)
|
||
}
|
||
|
||
/// Synchronous wrapper for `extract_bytes`.
|
||
///
|
||
/// Uses the global Tokio runtime for 100x+ performance improvement over creating
|
||
/// a new runtime per call.
|
||
///
|
||
/// With the `tokio-runtime` feature, this blocks the current thread using the global
|
||
/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust,no_run
|
||
/// use kreuzberg::core::extractor::extract_bytes_sync;
|
||
/// use kreuzberg::core::config::ExtractionConfig;
|
||
///
|
||
/// let config = ExtractionConfig::default();
|
||
/// let bytes = b"Hello, world!";
|
||
/// let result = extract_bytes_sync(bytes, "text/plain", &config)?;
|
||
/// println!("Content: {}", result.content);
|
||
/// ```
|
||
public func extractBytesSync(content: [UInt8], mimeType: String, config: ExtractionConfig) throws -> ExtractionResult {
|
||
let _rb_content: RustVec<UInt8> = { let v = RustVec<UInt8>(); for b in content { v.push(value: b) }; return v }()
|
||
return try RustBridge.extractBytesSync(_rb_content, mimeType, config)
|
||
}
|
||
|
||
/// Synchronous wrapper for `batch_extract_files`.
|
||
///
|
||
/// Uses the global Tokio runtime for optimal performance.
|
||
/// Only available with `tokio-runtime` (WASM has no filesystem).
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust,no_run
|
||
/// use kreuzberg::core::extractor::batch_extract_files_sync;
|
||
/// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem, FileExtractionConfig};
|
||
///
|
||
/// let config = ExtractionConfig::default();
|
||
/// let items = vec![
|
||
/// BatchFileItem {
|
||
/// path: "doc1.pdf".into(),
|
||
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
||
/// },
|
||
/// BatchFileItem { path: "doc2.pdf".into(), config: None },
|
||
/// ];
|
||
/// let results = batch_extract_files_sync(items, &config)?;
|
||
/// ```
|
||
public func batchExtractFilesSync(items: [BatchFileItem], config: ExtractionConfig) throws -> [ExtractionResult] {
|
||
let _rb_items: RustVec<BatchFileItem> = { let v = RustVec<BatchFileItem>(); for x in items { v.push(value: x) }; return v }()
|
||
return try RustBridge.batchExtractFilesSync(_rb_items, config).map { ref in var item = try RustBridge.ExtractionResult(ptr: ref.ptr); item.isOwned = false; return item }
|
||
}
|
||
|
||
/// Synchronous wrapper for `batch_extract_bytes`.
|
||
///
|
||
/// Uses the global Tokio runtime for optimal performance.
|
||
/// With the `tokio-runtime` feature, this blocks the current thread using the global
|
||
/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
|
||
/// that iterates through items and calls `extract_bytes_sync()`.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust,no_run
|
||
/// use kreuzberg::core::extractor::batch_extract_bytes_sync;
|
||
/// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem, FileExtractionConfig};
|
||
///
|
||
/// let config = ExtractionConfig::default();
|
||
/// let items = vec![
|
||
/// BatchBytesItem { content: b"content".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
||
/// BatchBytesItem {
|
||
/// content: b"other".to_vec(),
|
||
/// mime_type: "text/plain".to_string(),
|
||
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
||
/// },
|
||
/// ];
|
||
/// let results = batch_extract_bytes_sync(items, &config)?;
|
||
/// ```
|
||
public func batchExtractBytesSync(items: [BatchBytesItem], config: ExtractionConfig) throws -> [ExtractionResult] {
|
||
let _rb_items: RustVec<BatchBytesItem> = { let v = RustVec<BatchBytesItem>(); for x in items { v.push(value: x) }; return v }()
|
||
return try RustBridge.batchExtractBytesSync(_rb_items, config).map { ref in var item = try RustBridge.ExtractionResult(ptr: ref.ptr); item.isOwned = false; return item }
|
||
}
|
||
|
||
/// Extract content from multiple files concurrently.
|
||
///
|
||
/// This function processes multiple files in parallel, automatically managing
|
||
/// concurrency to prevent resource exhaustion. The concurrency limit can be
|
||
/// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
|
||
/// to `(num_cpus * 1.5).ceil()`.
|
||
///
|
||
/// Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
|
||
/// fields from the batch-level `config`. Pass `None` for a file to use the batch defaults.
|
||
/// Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
|
||
/// taken from the batch-level `config`.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `items` - Vector of `BatchFileItem` structs, each containing a path and optional
|
||
/// per-file configuration overrides.
|
||
/// * `config` - Batch-level extraction configuration (provides defaults and batch settings)
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A vector of `ExtractionResult` in the same order as the input items.
|
||
///
|
||
/// # Errors
|
||
///
|
||
/// Individual file errors are captured in the result metadata. System errors
|
||
/// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// Simple usage with no per-file overrides:
|
||
///
|
||
/// ```rust,no_run
|
||
/// use kreuzberg::core::extractor::batch_extract_files;
|
||
/// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem};
|
||
/// use std::path::PathBuf;
|
||
///
|
||
/// let config = ExtractionConfig::default();
|
||
/// let items = vec![
|
||
/// BatchFileItem { path: "doc1.pdf".into(), config: None },
|
||
/// BatchFileItem { path: "doc2.pdf".into(), config: None },
|
||
/// ];
|
||
/// let results = batch_extract_files(items, &config).await?;
|
||
/// println!("Processed {} files", results.len());
|
||
/// ```
|
||
///
|
||
/// Per-file configuration overrides:
|
||
///
|
||
/// ```rust,no_run
|
||
/// use kreuzberg::core::extractor::batch_extract_files;
|
||
/// use kreuzberg::core::config::{ExtractionConfig, BatchFileItem, FileExtractionConfig};
|
||
/// use std::path::PathBuf;
|
||
///
|
||
/// let config = ExtractionConfig::default();
|
||
/// let items = vec![
|
||
/// BatchFileItem {
|
||
/// path: "scan.pdf".into(),
|
||
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
||
/// },
|
||
/// BatchFileItem { path: "notes.txt".into(), config: None },
|
||
/// ];
|
||
/// let results = batch_extract_files(items, &config).await?;
|
||
/// ```
|
||
public func batchExtractFiles(items: [BatchFileItem], config: ExtractionConfig) async throws -> [ExtractionResult] {
|
||
let _rb_items: RustVec<BatchFileItem> = { let v = RustVec<BatchFileItem>(); for x in items { v.push(value: x) }; return v }()
|
||
return try await Task.detached(priority: .userInitiated) {
|
||
let result = try RustBridge.batchExtractFiles(_rb_items, config)
|
||
var items: [[ExtractionResult]] = []
|
||
for ref in result {
|
||
var item = try RustBridge.ExtractionResult(ptr: ref.ptr)
|
||
item.isOwned = false
|
||
items.append(item)
|
||
}
|
||
return items
|
||
}.value
|
||
}
|
||
|
||
/// Extract content from multiple byte arrays concurrently.
|
||
///
|
||
/// This function processes multiple byte arrays in parallel, automatically managing
|
||
/// concurrency to prevent resource exhaustion. The concurrency limit can be
|
||
/// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
|
||
/// to `(num_cpus * 1.5).ceil()`.
|
||
///
|
||
/// Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
|
||
/// fields from the batch-level `config`. Pass `None` as the config to use
|
||
/// the batch-level defaults for that item.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `items` - Vector of `BatchBytesItem` structs, each containing content bytes,
|
||
/// MIME type, and optional per-item configuration overrides.
|
||
/// * `config` - Batch-level extraction configuration
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A vector of `ExtractionResult` in the same order as the input items.
|
||
///
|
||
/// # Examples
|
||
///
|
||
/// Simple usage with no per-item overrides:
|
||
///
|
||
/// ```rust,no_run
|
||
/// use kreuzberg::core::extractor::batch_extract_bytes;
|
||
/// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem};
|
||
///
|
||
/// let config = ExtractionConfig::default();
|
||
/// let items = vec![
|
||
/// BatchBytesItem { content: b"content 1".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
||
/// BatchBytesItem { content: b"content 2".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
||
/// ];
|
||
/// let results = batch_extract_bytes(items, &config).await?;
|
||
/// println!("Processed {} items", results.len());
|
||
/// ```
|
||
///
|
||
/// Per-item configuration overrides:
|
||
///
|
||
/// ```rust,no_run
|
||
/// use kreuzberg::core::extractor::batch_extract_bytes;
|
||
/// use kreuzberg::core::config::{ExtractionConfig, BatchBytesItem, FileExtractionConfig};
|
||
///
|
||
/// let config = ExtractionConfig::default();
|
||
/// let items = vec![
|
||
/// BatchBytesItem { content: b"content".to_vec(), mime_type: "text/plain".to_string(), config: None },
|
||
/// BatchBytesItem {
|
||
/// content: b"<html>test</html>".to_vec(),
|
||
/// mime_type: "text/html".to_string(),
|
||
/// config: Some(FileExtractionConfig { force_ocr: Some(true), ..Default::default() }),
|
||
/// },
|
||
/// ];
|
||
/// let results = batch_extract_bytes(items, &config).await?;
|
||
/// ```
|
||
public func batchExtractBytes(items: [BatchBytesItem], config: ExtractionConfig) async throws -> [ExtractionResult] {
|
||
let _rb_items: RustVec<BatchBytesItem> = { let v = RustVec<BatchBytesItem>(); for x in items { v.push(value: x) }; return v }()
|
||
return try await Task.detached(priority: .userInitiated) {
|
||
let result = try RustBridge.batchExtractBytes(_rb_items, config)
|
||
var items: [[ExtractionResult]] = []
|
||
for ref in result {
|
||
var item = try RustBridge.ExtractionResult(ptr: ref.ptr)
|
||
item.isOwned = false
|
||
items.append(item)
|
||
}
|
||
return items
|
||
}.value
|
||
}
|
||
|
||
/// Detect MIME type from raw file bytes.
|
||
///
|
||
/// Uses magic byte signatures to detect file type from content.
|
||
/// Falls back to `infer` crate for comprehensive detection.
|
||
///
|
||
/// For ZIP-based files, inspects contents to distinguish Office Open XML
|
||
/// formats (DOCX, XLSX, PPTX) from plain ZIP archives.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `content` - Raw file bytes
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// The detected MIME type string.
|
||
///
|
||
/// # Errors
|
||
///
|
||
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type cannot be determined.
|
||
public func detectMimeTypeFromBytes(content: [UInt8]) throws -> String {
|
||
let _rb_content: RustVec<UInt8> = { let v = RustVec<UInt8>(); for b in content { v.push(value: b) }; return v }()
|
||
return try RustBridge.detectMimeTypeFromBytes(_rb_content).toString()
|
||
}
|
||
|
||
/// Get file extensions for a given MIME type.
|
||
///
|
||
/// Returns all known file extensions that map to the specified MIME type.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `mime_type` - The MIME type to look up
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A vector of file extensions (without leading dot) for the MIME type.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```
|
||
/// use kreuzberg::core::mime::get_extensions_for_mime;
|
||
///
|
||
/// let extensions = get_extensions_for_mime("application/pdf").unwrap();
|
||
/// assert_eq!(extensions, vec!["pdf"]);
|
||
///
|
||
/// let doc_extensions = get_extensions_for_mime("application/vnd.openxmlformats-officedocument.wordprocessingml.document").unwrap();
|
||
/// assert!(doc_extensions.contains(&"docx".to_string()));
|
||
/// ```
|
||
public func getExtensionsForMime(mimeType: String) throws -> [String] {
|
||
return try RustBridge.getExtensionsForMime(mimeType).map { $0.as_str().toString() }
|
||
}
|
||
|
||
/// List the names of all registered embedding backends.
|
||
///
|
||
/// Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
|
||
/// bindings.
|
||
public func listEmbeddingBackends() throws -> [String] {
|
||
return try RustBridge.listEmbeddingBackends().map { $0.as_str().toString() }
|
||
}
|
||
|
||
/// List names of all registered document extractors.
|
||
public func listDocumentExtractors() throws -> [String] {
|
||
return try RustBridge.listDocumentExtractors().map { $0.as_str().toString() }
|
||
}
|
||
|
||
/// List all registered OCR backends.
|
||
///
|
||
/// Returns the names of all OCR backends currently registered in the global registry.
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// A vector of OCR backend names.
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust
|
||
/// use kreuzberg::plugins::list_ocr_backends;
|
||
///
|
||
/// let backends = list_ocr_backends()?;
|
||
/// for name in backends {
|
||
/// println!("Registered OCR backend: {}", name);
|
||
/// }
|
||
/// ```
|
||
public func listOcrBackends() throws -> [String] {
|
||
return try RustBridge.listOcrBackends().map { $0.as_str().toString() }
|
||
}
|
||
|
||
/// List all registered post-processor names.
|
||
///
|
||
/// Returns a vector of all post-processor names currently registered in the
|
||
/// global registry.
|
||
///
|
||
/// # Returns
|
||
///
|
||
/// - `Ok(Vec<String>)` - Vector of post-processor names
|
||
/// - `Err(...)` if the registry lock is poisoned
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust
|
||
/// use kreuzberg::plugins::list_post_processors;
|
||
///
|
||
/// let processors = list_post_processors()?;
|
||
/// for name in processors {
|
||
/// println!("Registered post-processor: {}", name);
|
||
/// }
|
||
/// ```
|
||
public func listPostProcessors() throws -> [String] {
|
||
return try RustBridge.listPostProcessors().map { $0.as_str().toString() }
|
||
}
|
||
|
||
/// List names of all registered renderers.
|
||
///
|
||
/// # Errors
|
||
///
|
||
/// Returns an error if the registry lock is poisoned.
|
||
public func listRenderers() throws -> [String] {
|
||
return try RustBridge.listRenderers().map { $0.as_str().toString() }
|
||
}
|
||
|
||
/// List names of all registered validators.
|
||
public func listValidators() throws -> [String] {
|
||
return try RustBridge.listValidators().map { $0.as_str().toString() }
|
||
}
|
||
|
||
/// Compare two extraction results and return a structured diff.
|
||
///
|
||
/// The comparison is purely structural — no I/O, no side effects. All fields
|
||
/// of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `a` — the "before" extraction result
|
||
/// * `b` — the "after" extraction result
|
||
/// * `opts` — controls which sections are compared and optional truncation
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust,no_run
|
||
/// use kreuzberg::{ExtractionResult, diff::{compare, DiffOptions}};
|
||
///
|
||
/// let mut a = ExtractionResult::default();
|
||
/// let mut b = ExtractionResult::default();
|
||
/// a.content = "Hello world".to_string();
|
||
/// b.content = "Hello Rust".to_string();
|
||
///
|
||
/// let diff = compare(&a, &b, &DiffOptions::default());
|
||
/// assert_eq!(diff.content_diff.len(), 1);
|
||
/// ```
|
||
public func compare(a: ExtractionResult, b: ExtractionResult, opts: DiffOptions) throws -> ExtractionDiff {
|
||
let _rb_opts = try opts.intoRust()
|
||
return RustBridge.compare(a, b, _rb_opts)
|
||
}
|
||
|
||
/// Generate embeddings asynchronously for a list of text strings.
|
||
///
|
||
/// This is the async counterpart to [`embed_texts`]. It offloads the blocking
|
||
/// ONNX inference work to a dedicated blocking thread pool via Tokio's
|
||
/// `spawn_blocking`, keeping the async executor free.
|
||
///
|
||
/// Returns one embedding vector per input text in the same order.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `texts` - Vec of strings to embed (owned, sent to blocking thread)
|
||
/// * `config` - Embedding configuration specifying model, batch size, and normalization
|
||
///
|
||
/// # Errors
|
||
///
|
||
/// - `KreuzbergError::MissingDependency` if ONNX Runtime is not installed
|
||
/// - `KreuzbergError::Embedding` if the preset name is unknown, model download fails,
|
||
/// or the blocking inference task panics
|
||
///
|
||
/// # Example
|
||
///
|
||
/// ```rust,ignore
|
||
/// use kreuzberg::{embed_texts_async, EmbeddingConfig};
|
||
///
|
||
/// let embeddings = embed_texts_async(
|
||
/// vec!["Hello!".to_string()],
|
||
/// &EmbeddingConfig::default(),
|
||
/// ).await?;
|
||
/// ```
|
||
public func embedTextsAsync(texts: [String], config: EmbeddingConfig) async throws -> [[Float]] {
|
||
let _rb_texts: RustVec<RustString> = { let v = RustVec<RustString>(); for s in texts { v.push(value: RustString(s)) }; return v }()
|
||
return try await Task.detached(priority: .userInitiated) {
|
||
let _rb_result = try RustBridge.embedTextsAsync(_rb_texts, config).toString()
|
||
let _rb_data = _rb_result.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode([[Float]].self, from: _rb_data)
|
||
}.value
|
||
}
|
||
|
||
/// Render a single PDF page to PNG bytes.
|
||
///
|
||
/// Returns raw PNG-encoded bytes for the specified page at the given DPI.
|
||
/// Uses pdf_oxide with tiny-skia for pure-Rust rendering.
|
||
///
|
||
/// # Arguments
|
||
///
|
||
/// * `pdf_bytes` - Raw PDF file bytes
|
||
/// * `page_index` - Zero-based page index
|
||
/// * `dpi` - Resolution in dots per inch (default: 150)
|
||
/// * `password` - Optional password for encrypted PDFs
|
||
///
|
||
/// # Errors
|
||
///
|
||
/// Returns `KreuzbergError::Parsing` if the PDF cannot be opened, authenticated,
|
||
/// or rendered, or if `page_index` is out of range.
|
||
public func renderPdfPageToPng(pdfBytes: [UInt8], pageIndex: UInt, dpi: Int32?, password: String?) throws -> [UInt8] {
|
||
let _rb_pdfBytes: RustVec<UInt8> = { let v = RustVec<UInt8>(); for b in pdfBytes { v.push(value: b) }; return v }()
|
||
return try RustBridge.renderPdfPageToPng(_rb_pdfBytes, pageIndex, dpi, password).map { $0 }
|
||
}
|
||
|
||
/// Detect the MIME type of a file at the given path.
|
||
///
|
||
/// Uses the file extension and optionally the file content to determine the MIME type.
|
||
/// Set `check_exists` to `true` to verify the file exists before detection.
|
||
public func detectMimeType(path: String, checkExists: Bool) throws -> String {
|
||
return try RustBridge.detectMimeType(path, checkExists).toString()
|
||
}
|
||
|
||
/// Embed a list of texts using the configured embedding model.
|
||
///
|
||
/// Returns a 2D vector where each inner vector is the embedding for the corresponding text.
|
||
public func embedTexts(texts: [String], config: EmbeddingConfig) throws -> [[Float]] {
|
||
let _rb_texts: RustVec<RustString> = { let v = RustVec<RustString>(); for s in texts { v.push(value: RustString(s)) }; return v }()
|
||
let _rb_json = try RustBridge.embedTexts(_rb_texts, config).toString()
|
||
let _rb_data = _rb_json.data(using: .utf8) ?? Data()
|
||
return try JSONDecoder().decode([[Float]].self, from: _rb_data)
|
||
}
|
||
|
||
/// Get an embedding preset by name.
|
||
///
|
||
/// Returns `None` if no preset with the given name exists. Returns an owned
|
||
/// clone so the value is safe to pass across FFI boundaries.
|
||
public func getEmbeddingPreset(name: String) throws -> EmbeddingPreset? {
|
||
return try RustBridge.getEmbeddingPreset(name).map { try EmbeddingPreset($0) }
|
||
}
|
||
|
||
/// List the names of all available embedding presets.
|
||
///
|
||
/// Returns owned `String`s so the values are safe to pass across FFI boundaries.
|
||
public func listEmbeddingPresets() -> [String] {
|
||
return RustBridge.listEmbeddingPresets().map { $0.as_str().toString() }
|
||
}
|
||
|
||
// MARK: - Trait Bridge Registration Forwarders
|
||
// Top-level `public func` re-exports of the swift-bridge–generated
|
||
// `register_*` / `unregister_*` / `clear_*` plugin registration entry
|
||
// points so consumers do not need to `import RustBridge` for plugin work.
|
||
|
||
/// Register an inbound `OcrBackend` plugin implementation. The Swift
|
||
/// host wraps a `OcrBackend` conformer in a `SwiftOcrBackendBox` adapter
|
||
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
|
||
/// register the plugin in the global registry.
|
||
public func registerOcrBackend(_ swiftBox: SwiftOcrBackendBox) throws {
|
||
try RustBridge.registerOcrBackend(swiftBox)
|
||
}
|
||
|
||
/// Unregister a previously-registered `OcrBackend` plugin by name.
|
||
public func unregisterOcrBackend(_ name: String) throws {
|
||
try RustBridge.unregisterOcrBackend(name)
|
||
}
|
||
|
||
/// Remove every registered `OcrBackend` plugin. Typically used in test teardown.
|
||
public func clearOcrBackends() throws {
|
||
try RustBridge.clearOcrBackends()
|
||
}
|
||
|
||
/// Register an inbound `PostProcessor` plugin implementation. The Swift
|
||
/// host wraps a `PostProcessor` conformer in a `SwiftPostProcessorBox` adapter
|
||
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
|
||
/// register the plugin in the global registry.
|
||
public func registerPostProcessor(_ swiftBox: SwiftPostProcessorBox) throws {
|
||
try RustBridge.registerPostProcessor(swiftBox)
|
||
}
|
||
|
||
/// Unregister a previously-registered `PostProcessor` plugin by name.
|
||
public func unregisterPostProcessor(_ name: String) throws {
|
||
try RustBridge.unregisterPostProcessor(name)
|
||
}
|
||
|
||
/// Remove every registered `PostProcessor` plugin. Typically used in test teardown.
|
||
public func clearPostProcessors() throws {
|
||
try RustBridge.clearPostProcessors()
|
||
}
|
||
|
||
/// Register an inbound `Validator` plugin implementation. The Swift
|
||
/// host wraps a `Validator` conformer in a `SwiftValidatorBox` adapter
|
||
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
|
||
/// register the plugin in the global registry.
|
||
public func registerValidator(_ swiftBox: SwiftValidatorBox) throws {
|
||
try RustBridge.registerValidator(swiftBox)
|
||
}
|
||
|
||
/// Unregister a previously-registered `Validator` plugin by name.
|
||
public func unregisterValidator(_ name: String) throws {
|
||
try RustBridge.unregisterValidator(name)
|
||
}
|
||
|
||
/// Remove every registered `Validator` plugin. Typically used in test teardown.
|
||
public func clearValidators() throws {
|
||
try RustBridge.clearValidators()
|
||
}
|
||
|
||
/// Register an inbound `EmbeddingBackend` plugin implementation. The Swift
|
||
/// host wraps a `EmbeddingBackend` conformer in a `SwiftEmbeddingBackendBox` adapter
|
||
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
|
||
/// register the plugin in the global registry.
|
||
public func registerEmbeddingBackend(_ swiftBox: SwiftEmbeddingBackendBox) throws {
|
||
try RustBridge.registerEmbeddingBackend(swiftBox)
|
||
}
|
||
|
||
/// Unregister a previously-registered `EmbeddingBackend` plugin by name.
|
||
public func unregisterEmbeddingBackend(_ name: String) throws {
|
||
try RustBridge.unregisterEmbeddingBackend(name)
|
||
}
|
||
|
||
/// Remove every registered `EmbeddingBackend` plugin. Typically used in test teardown.
|
||
public func clearEmbeddingBackends() throws {
|
||
try RustBridge.clearEmbeddingBackends()
|
||
}
|
||
|
||
/// Register an inbound `DocumentExtractor` plugin implementation. The Swift
|
||
/// host wraps a `DocumentExtractor` conformer in a `SwiftDocumentExtractorBox` adapter
|
||
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
|
||
/// register the plugin in the global registry.
|
||
public func registerDocumentExtractor(_ swiftBox: SwiftDocumentExtractorBox) throws {
|
||
try RustBridge.registerDocumentExtractor(swiftBox)
|
||
}
|
||
|
||
/// Unregister a previously-registered `DocumentExtractor` plugin by name.
|
||
public func unregisterDocumentExtractor(_ name: String) throws {
|
||
try RustBridge.unregisterDocumentExtractor(name)
|
||
}
|
||
|
||
/// Remove every registered `DocumentExtractor` plugin. Typically used in test teardown.
|
||
public func clearDocumentExtractors() throws {
|
||
try RustBridge.clearDocumentExtractors()
|
||
}
|
||
|
||
/// Register an inbound `Renderer` plugin implementation. The Swift
|
||
/// host wraps a `Renderer` conformer in a `SwiftRendererBox` adapter
|
||
/// (see `Sources/RustBridge/Plugins.swift`); pass the wrapped instance to
|
||
/// register the plugin in the global registry.
|
||
public func registerRenderer(_ swiftBox: SwiftRendererBox) throws {
|
||
try RustBridge.registerRenderer(swiftBox)
|
||
}
|
||
|
||
/// Unregister a previously-registered `Renderer` plugin by name.
|
||
public func unregisterRenderer(_ name: String) throws {
|
||
try RustBridge.unregisterRenderer(name)
|
||
}
|
||
|
||
/// Remove every registered `Renderer` plugin. Typically used in test teardown.
|
||
public func clearRenderers() throws {
|
||
try RustBridge.clearRenderers()
|
||
}
|
||
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.ContentFilterConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.ExtractionConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.ImageExtractionConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.TokenReductionOptions: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.LanguageDetectionConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.HtmlOutputConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.LayoutDetectionConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.OcrQualityThresholds: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.OcrConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.PageConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.PdfConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.HierarchyConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.PostProcessorConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.ChunkingConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.EmbeddingConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.TreeSitterConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.TreeSitterProcessConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.ServerConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.SecurityLimits: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.TokenReductionConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.DocumentStructure: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.ExtractionResult: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.ImagePreprocessingConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.TesseractConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.DiffOptions: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.ExtractionDiff: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.EmbeddingPreset: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.YakeParams: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.RakeParams: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.KeywordConfig: @unchecked Sendable {}
|
||
// swift-bridge opaque type used across Task.detached boundaries — Rust type is Send + Sync.
|
||
extension RustBridge.PaddleOcrConfig: @unchecked Sendable {}
|