// Generated by alef. Do not edit by hand.
@file:Suppress(
    "ktlint:standard:trailing-comma-on-call-site",
    "ktlint:standard:trailing-comma-on-declaration-site",
    "ktlint:standard:spacing-between-declarations-with-comments",
    "ktlint:standard:spacing-between-declarations-with-annotations",
    "ktlint:standard:when-entry-bracing",
    "ktlint:standard:blank-line-between-when-conditions",
    "ktlint:standard:blank-line-before-declaration",
    "ktlint:standard:chain-method-continuation",
    "ktlint:standard:annotation",
    "ktlint:standard:max-line-length",
    "ktlint:standard:no-semi",
    "ktlint:standard:statement-wrapping",
    "MaxLineLength",
    "TooManyFunctions",
    "FunctionParameterNaming",
    "LongParameterList",
    "CyclomaticComplexMethod",
    "LongMethod",
)

package dev.kreuzberg

import java.nio.file.Path

/**
 * Contract for document extractor plugins.
 *
 * Implement this interface to add support for new document formats or to override
 * built-in extraction behavior with custom logic.
 *
 * # Return Type
 *
 * Both extraction methods return an [ExtractionResult].
 *
 * # Priority System
 *
 * When multiple extractors support the same MIME type, the registry selects
 * the extractor with the highest priority value. Use this to:
 *
 * - Override built-in extractors (priority > 50)
 * - Provide fallback extractors (priority < 50)
 * - Implement specialized extractors for specific use cases
 *
 * Default priority is 50.
 *
 * # Thread Safety
 *
 * Implementations must be thread-safe to support concurrent extraction.
 */
interface IDocumentExtractor {
    /** Name identifying this extractor plugin. */
    fun name(): String

    /** Version string of this extractor plugin. */
    fun version(): String

    /** Lifecycle hook for set-up work; does nothing unless overridden. */
    fun initialize() {}

    /** Lifecycle hook for tear-down work; does nothing unless overridden. */
    fun shutdown() {}

    /**
     * Extract content from a byte array.
     *
     * This is the core extraction method that processes in-memory document data.
     *
     * **Errors:**
     *
     * - `KreuzbergError.Parsing` - Document parsing failed
     * - `KreuzbergError.Validation` - Invalid document structure
     * - `KreuzbergError.Io` - I/O errors (these always bubble up)
     * - `KreuzbergError.MissingDependency` - Required dependency not available
     *
     * @param content raw bytes of the document
     * @param mimeType MIME type of [content]
     * @param config extraction settings to apply
     * @return the [ExtractionResult] produced for the document
     */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Extract content from a file.
     *
     * This interface does not supply a default body. A straightforward
     * implementation reads the file and delegates to [extractBytes]; custom
     * file handling, streaming, or memory optimizations also belong here.
     *
     * **Errors:**
     *
     * Same as [extractBytes], plus file I/O errors.
     *
     * @param path location of the document on disk
     * @param mimeType MIME type of the file at [path]
     * @param config extraction settings to apply
     * @return the [ExtractionResult] produced for the document
     */
    suspend fun extractFile(
        path: Path,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Get the list of MIME types supported by this extractor.
     *
     * Can include exact MIME types and prefix patterns:
     *
     * - Exact: `"application/pdf"`, `"text/plain"`
     * - Prefix: `"image/"` followed by a `*` wildcard (matches any image type)
     *
     * The wildcard pattern is deliberately not written out as one literal:
     * Kotlin block comments nest, so a slash immediately followed by a star
     * inside this KDoc would open a nested comment and leave it unterminated.
     *
     * @return the supported MIME type strings
     */
    fun supportedMimeTypes(): List<String>

    /**
     * Get the priority of this extractor.
     *
     * Higher priority extractors are preferred when multiple extractors
     * support the same MIME type.
     *
     * # Priority Guidelines
     *
     * - **0-25**: Fallback/low-quality extractors
     * - **26-49**: Alternative extractors
     * - **50**: Default priority (built-in extractors)
     * - **51-75**: Premium/enhanced extractors
     * - **76-100**: Specialized/high-priority extractors
     *
     * @return priority value; `50` unless overridden
     */
    fun priority(): Int = 50

    /**
     * Optional: check if this extractor can handle a specific file.
     *
     * Allows for more sophisticated detection beyond MIME types.
     * Returns `true` unless overridden (rely on MIME type matching).
     *
     * @param path location of the candidate file
     * @param mimeType MIME type reported for the file
     * @return `true` if the extractor can handle this file, `false` otherwise
     */
    fun canHandle(path: Path, mimeType: String): Boolean = true
}