Files
fil/packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IDocumentExtractor.kt
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

144 lines
4.4 KiB
Kotlin
Generated

// Generated by alef. Do not edit by hand.
@file:Suppress(
"ktlint:standard:trailing-comma-on-call-site",
"ktlint:standard:trailing-comma-on-declaration-site",
"ktlint:standard:spacing-between-declarations-with-comments",
"ktlint:standard:spacing-between-declarations-with-annotations",
"ktlint:standard:when-entry-bracing",
"ktlint:standard:blank-line-between-when-conditions",
"ktlint:standard:blank-line-before-declaration",
"ktlint:standard:chain-method-continuation",
"ktlint:standard:annotation",
"ktlint:standard:max-line-length",
"ktlint:standard:no-semi",
"ktlint:standard:statement-wrapping",
"MaxLineLength",
"TooManyFunctions",
"FunctionParameterNaming",
"LongParameterList",
"CyclomaticComplexMethod",
"LongMethod",
)
package dev.kreuzberg
import java.nio.file.Path
/**
 * Contract for document extractor plugins.
 *
 * Implement this interface to add support for new document formats or to override
 * built-in extraction behavior with custom logic.
 *
 * # Return Type
 *
 * The extraction methods declared here ([extractBytes], [extractFile]) return
 * [ExtractionResult].
 *
 * NOTE(review): the upstream documentation this binding was generated from describes a
 * flat intermediate `InternalDocument` that the pipeline converts into the public
 * `ExtractionResult`. That type does not appear in these signatures — confirm against
 * the generator which of the two an implementer is really expected to produce.
 *
 * # Priority System
 *
 * When multiple extractors support the same MIME type, the registry selects
 * the extractor with the highest priority value. Use this to:
 *
 * - Override built-in extractors (priority > 50)
 * - Provide fallback extractors (priority < 50)
 * - Implement specialized extractors for specific use cases
 *
 * The default priority is 50 (see [priority]).
 *
 * # Thread Safety
 *
 * Implementations must be thread-safe to support concurrent extraction.
 */
interface IDocumentExtractor {
    /**
     * Name of this extractor.
     */
    fun name(): String

    /**
     * Version of this extractor.
     */
    fun version(): String

    /**
     * Lifecycle hook for setting up the extractor.
     *
     * The default implementation does nothing; override it when the extractor
     * needs to acquire resources before use.
     */
    fun initialize() {}

    /**
     * Lifecycle hook for tearing down the extractor.
     *
     * The default implementation does nothing; override it when the extractor
     * holds resources that must be released.
     */
    fun shutdown() {}

    /**
     * Extract content from a byte array.
     *
     * This is the core extraction method that processes in-memory document data.
     *
     * **Errors:**
     *
     * - `KreuzbergError.Parsing` - Document parsing failed
     * - `KreuzbergError.Validation` - Invalid document structure
     * - `KreuzbergError.Io` - I/O errors (these always bubble up)
     * - `KreuzbergError.MissingDependency` - Required dependency not available
     *
     * @param content raw bytes of the document
     * @param mimeType MIME type of the document
     * @param config extraction configuration to apply
     * @return the [ExtractionResult] produced for the document
     */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Extract content from a file.
     *
     * This interface does not supply a default implementation, so every extractor
     * must provide one. An extractor without special file handling can read the
     * file's bytes and delegate to [extractBytes]; implement it differently for
     * custom file handling, streaming, or memory optimizations.
     *
     * **Errors:**
     *
     * Same as [extractBytes], plus file I/O errors.
     *
     * @param path location of the file to extract
     * @param mimeType MIME type of the file
     * @param config extraction configuration to apply
     * @return the [ExtractionResult] produced for the file
     */
    suspend fun extractFile(
        path: java.nio.file.Path,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Get the list of MIME types supported by this extractor.
     *
     * Can include exact MIME types and prefix patterns:
     *
     * - Exact: `"application/pdf"`, `"text/plain"`
     * - Prefix: `"image/*"` (matches any image type)
     *
     * @return a list of MIME type strings
     */
    fun supportedMimeTypes(): List<String>

    /**
     * Get the priority of this extractor.
     *
     * Higher priority extractors are preferred when multiple extractors
     * support the same MIME type.
     *
     * # Priority Guidelines
     *
     * - **0-25**: Fallback/low-quality extractors
     * - **26-49**: Alternative extractors
     * - **50**: Default priority (built-in extractors)
     * - **51-75**: Premium/enhanced extractors
     * - **76-100**: Specialized/high-priority extractors
     *
     * @return the priority value; the default implementation returns 50
     */
    fun priority(): Int = 50

    /**
     * Optional: check if this extractor can handle a specific file.
     *
     * Allows for more sophisticated detection beyond MIME types.
     * The default implementation returns `true`, i.e. it relies on MIME type
     * matching alone.
     *
     * @param path location of the candidate file
     * @param mimeType MIME type of the candidate file
     * @return `true` if the extractor can handle this file, `false` otherwise
     */
    fun canHandle(path: java.nio.file.Path, mimeType: String): Boolean = true
}