This commit is contained in:
143
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IDocumentExtractor.kt
generated
Normal file
143
packages/kotlin-android/src/main/kotlin/dev/kreuzberg/IDocumentExtractor.kt
generated
Normal file
@@ -0,0 +1,143 @@
|
||||
// Generated by alef. Do not edit by hand.
|
||||
@file:Suppress(
|
||||
"ktlint:standard:trailing-comma-on-call-site",
|
||||
"ktlint:standard:trailing-comma-on-declaration-site",
|
||||
"ktlint:standard:spacing-between-declarations-with-comments",
|
||||
"ktlint:standard:spacing-between-declarations-with-annotations",
|
||||
"ktlint:standard:when-entry-bracing",
|
||||
"ktlint:standard:blank-line-between-when-conditions",
|
||||
"ktlint:standard:blank-line-before-declaration",
|
||||
"ktlint:standard:chain-method-continuation",
|
||||
"ktlint:standard:annotation",
|
||||
"ktlint:standard:max-line-length",
|
||||
"ktlint:standard:no-semi",
|
||||
"ktlint:standard:statement-wrapping",
|
||||
"MaxLineLength",
|
||||
"TooManyFunctions",
|
||||
"FunctionParameterNaming",
|
||||
"LongParameterList",
|
||||
"CyclomaticComplexMethod",
|
||||
"LongMethod",
|
||||
)
|
||||
|
||||
package dev.kreuzberg
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
 * Contract for document extractor plugins.
 *
 * Implement this interface to add support for new document formats or to override
 * built-in extraction behavior with custom logic.
 *
 * # Return Type
 *
 * In this Kotlin binding, [extractBytes] and [extractFile] return [ExtractionResult].
 * The upstream documentation this generated file is derived from describes a flat
 * intermediate representation named `InternalDocument` that the pipeline converts into
 * the public result; that type does not appear anywhere in this interface.
 *
 * # Priority System
 *
 * When multiple extractors support the same MIME type, the registry selects
 * the extractor with the highest priority value. Use this to:
 *
 * - Override built-in extractors (priority > 50)
 * - Provide fallback extractors (priority < 50)
 * - Implement specialized extractors for specific use cases
 *
 * The default priority is 50 (see [priority]).
 *
 * # Thread Safety
 *
 * Extractors must be thread-safe to support concurrent extraction
 * (the upstream contract is expressed as `Send + Sync`).
 */
interface IDocumentExtractor {
    /**
     * Identifier of this extractor.
     *
     * NOTE(review): presumably used as the registry key — confirm against the registry code.
     */
    fun name(): String

    /** Version string reported by this extractor. */
    fun version(): String

    /**
     * Lifecycle hook for acquiring resources.
     *
     * Does nothing unless overridden.
     */
    fun initialize() {}

    /**
     * Lifecycle hook for releasing resources.
     *
     * Does nothing unless overridden.
     */
    fun shutdown() {}

    /**
     * Extract content from a byte array.
     *
     * This is the core extraction method that processes in-memory document data.
     *
     * **Errors:**
     *
     * - `KreuzbergError.Parsing` - Document parsing failed
     * - `KreuzbergError.Validation` - Invalid document structure
     * - `KreuzbergError.Io` - I/O errors (these always bubble up)
     * - `KreuzbergError.MissingDependency` - Required dependency not available
     *
     * @param content raw bytes of the document.
     * @param mimeType MIME type of [content].
     * @param config extraction settings to apply.
     * @return the extraction outcome for [content].
     */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Extract content from a file.
     *
     * This interface supplies no default for this method, so every implementation must
     * provide one. The simplest implementation reads the file and delegates to
     * [extractBytes]; implement it differently for custom file handling, streaming, or
     * memory optimizations.
     *
     * **Errors:**
     *
     * Same as [extractBytes], plus file I/O errors.
     *
     * @param path location of the document on disk.
     * @param mimeType MIME type of the file at [path].
     * @param config extraction settings to apply.
     * @return the extraction outcome for the file.
     */
    suspend fun extractFile(
        path: Path,
        mimeType: String,
        config: ExtractionConfig,
    ): ExtractionResult

    /**
     * Get the list of MIME types supported by this extractor.
     *
     * Can include exact MIME types and prefix patterns:
     *
     * - Exact: `"application/pdf"`, `"text/plain"`
     * - Prefix: `"image/*"` (matches any image type)
     *
     * @return a list of MIME type strings.
     */
    fun supportedMimeTypes(): List<String>

    /**
     * Get the priority of this extractor.
     *
     * Higher priority extractors are preferred when multiple extractors
     * support the same MIME type.
     *
     * # Priority Guidelines
     *
     * - **0-25**: Fallback/low-quality extractors
     * - **26-49**: Alternative extractors
     * - **50**: Default priority (built-in extractors)
     * - **51-75**: Premium/enhanced extractors
     * - **76-100**: Specialized/high-priority extractors
     *
     * @return the priority value; 50 unless overridden.
     */
    fun priority(): Int = 50

    /**
     * Optional: Check if this extractor can handle a specific file.
     *
     * Allows for more sophisticated detection beyond MIME types.
     * The default accepts every file, i.e. it relies on MIME type matching alone.
     *
     * @param path location of the candidate file.
     * @param mimeType MIME type detected for the file.
     * @return `true` if the extractor can handle this file, `false` otherwise.
     */
    fun canHandle(path: Path, mimeType: String): Boolean = true
}
|
||||
Reference in New Issue
Block a user