144 lines
4.4 KiB
Kotlin
144 lines
4.4 KiB
Kotlin
|
|
// Generated by alef. Do not edit by hand.
|
||
|
|
@file:Suppress(
|
||
|
|
"ktlint:standard:trailing-comma-on-call-site",
|
||
|
|
"ktlint:standard:trailing-comma-on-declaration-site",
|
||
|
|
"ktlint:standard:spacing-between-declarations-with-comments",
|
||
|
|
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||
|
|
"ktlint:standard:when-entry-bracing",
|
||
|
|
"ktlint:standard:blank-line-between-when-conditions",
|
||
|
|
"ktlint:standard:blank-line-before-declaration",
|
||
|
|
"ktlint:standard:chain-method-continuation",
|
||
|
|
"ktlint:standard:annotation",
|
||
|
|
"ktlint:standard:max-line-length",
|
||
|
|
"ktlint:standard:no-semi",
|
||
|
|
"ktlint:standard:statement-wrapping",
|
||
|
|
"MaxLineLength",
|
||
|
|
"TooManyFunctions",
|
||
|
|
"FunctionParameterNaming",
|
||
|
|
"LongParameterList",
|
||
|
|
"CyclomaticComplexMethod",
|
||
|
|
"LongMethod",
|
||
|
|
)
|
||
|
|
|
||
|
|
package dev.kreuzberg
|
||
|
|
|
||
|
|
import java.nio.file.Path
|
||
|
|
|
||
|
|
/**
 * Contract for document extractor plugins.
 *
 * Implement this interface to add support for new document formats or to
 * override built-in extraction behavior with custom logic.
 *
 * ## Return type
 *
 * Both extraction methods return an [ExtractionResult].
 *
 * NOTE(review): the upstream documentation this binding was generated from
 * describes extractors returning a flat intermediate `InternalDocument` that
 * the pipeline later converts into `ExtractionResult`. The signatures below
 * return [ExtractionResult] directly — confirm which description is current.
 *
 * ## Priority system
 *
 * When multiple extractors support the same MIME type, the registry selects
 * the extractor with the highest [priority] value. Use this to:
 *
 * - Override built-in extractors (priority > 50)
 * - Provide fallback extractors (priority < 50)
 * - Implement specialized extractors for specific use cases
 *
 * The default priority is 50.
 *
 * ## Thread safety
 *
 * Implementations must be thread-safe so that extraction can run concurrently.
 */
interface IDocumentExtractor {
    /** Name of this extractor plugin. */
    fun name(): String

    /** Version string of this extractor plugin. */
    fun version(): String

    /**
     * Lifecycle hook; does nothing by default.
     *
     * NOTE(review): presumably invoked once before the extractor is first
     * used — confirm against the plugin registry.
     */
    fun initialize() {}

    /**
     * Lifecycle hook; does nothing by default.
     *
     * NOTE(review): presumably invoked when the extractor is unregistered or
     * the host shuts down — confirm against the plugin registry.
     */
    fun shutdown() {}

    /**
     * Extract content from a byte array.
     *
     * This is the core extraction method that processes in-memory document data.
     *
     * **Errors:**
     *
     * - `KreuzbergError.Parsing` - Document parsing failed
     * - `KreuzbergError.Validation` - Invalid document structure
     * - `KreuzbergError.Io` - I/O errors (these always bubble up)
     * - `KreuzbergError.MissingDependency` - Required dependency not available
     *
     * @param content the in-memory document data to process.
     * @param mimeType MIME type of [content].
     * @param config extraction configuration to apply.
     * @return the [ExtractionResult] produced for the document.
     */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Extract content from a file.
     *
     * The default implementation reads the whole file into memory and delegates
     * to [extractBytes]. Override it for custom file handling, streaming, or
     * memory optimizations.
     *
     * The default read is a plain blocking `Files.readAllBytes` call executed in
     * the caller's coroutine context; override this method if that is not
     * acceptable for your dispatcher or for large files.
     *
     * **Errors:**
     *
     * Same as [extractBytes], plus file I/O errors (the default implementation
     * surfaces them as `java.io.IOException`).
     *
     * @param path location of the file to extract.
     * @param mimeType MIME type of the file.
     * @param config extraction configuration to apply.
     * @return the [ExtractionResult] produced for the document.
     */
    suspend fun extractFile(
        path: java.nio.file.Path,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult = extractBytes(java.nio.file.Files.readAllBytes(path), mimeType, config)

    /**
     * Get the list of MIME types supported by this extractor.
     *
     * Can include exact MIME types and prefix patterns:
     *
     * - Exact: `"application/pdf"`, `"text/plain"`
     * - Prefix: `"image/*"` (matches any image type)
     *
     * @return the supported MIME type strings.
     */
    fun supportedMimeTypes(): List<String>

    /**
     * Get the priority of this extractor.
     *
     * Higher priority extractors are preferred when multiple extractors
     * support the same MIME type.
     *
     * ## Priority guidelines
     *
     * - **0-25**: Fallback/low-quality extractors
     * - **26-49**: Alternative extractors
     * - **50**: Default priority (built-in extractors)
     * - **51-75**: Premium/enhanced extractors
     * - **76-100**: Specialized/high-priority extractors
     *
     * @return the priority value; 50 unless overridden.
     */
    fun priority(): Int = 50

    /**
     * Optional: check whether this extractor can handle a specific file.
     *
     * Allows for more sophisticated detection beyond MIME types. The default
     * implementation returns `true`, i.e. it relies on MIME type matching alone.
     *
     * @param path location of the candidate file.
     * @param mimeType MIME type detected for the file.
     * @return `true` if the extractor can handle this file, `false` otherwise.
     */
    fun canHandle(path: java.nio.file.Path, mimeType: String): Boolean = true
}
|