This commit is contained in:
14
docs/snippets/kotlin/plugins/clear_plugins.md
Normal file
14
docs/snippets/kotlin/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
|
||||
/**
 * Calls the post-processor, OCR-backend, and validator clear helpers on
 * [Kreuzberg], then prints a confirmation line.
 *
 * Document extractors are not touched: there is no
 * Kreuzberg.clearDocumentExtractors(), because extractor registration is not
 * exposed through the Kotlin/Java plugin bridge.
 */
fun clearAllPlugins() {
    Kreuzberg.clearPostProcessors()
    Kreuzberg.clearOcrBackends()
    Kreuzberg.clearValidators()

    val confirmation = "All post-processors, OCR backends, and validators cleared"
    println(confirmation)
}
|
||||
```
|
||||
22
docs/snippets/kotlin/plugins/embedding_backend.md
Normal file
22
docs/snippets/kotlin/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
// Wrap a host-language embedding model so kreuzberg can call back into it
|
||||
// during chunking and standalone embed requests.
|
||||
/**
 * Placeholder [IEmbeddingBackend] that answers every request with all-zero
 * vectors of a fixed size.
 *
 * @param dim number of elements in each returned vector; must fit in an [Int]
 * @throws IllegalArgumentException if [dim] is negative or larger than [Int.MAX_VALUE]
 */
class MyEmbedder(private val dim: Long = 768L) : IEmbeddingBackend {
    init {
        // Long.toInt() keeps only the low 32 bits, so an out-of-range value
        // would silently produce vectors whose size disagrees with dimensions().
        require(dim in 0L..Int.MAX_VALUE.toLong()) {
            "dim must be in 0..${Int.MAX_VALUE}, got $dim"
        }
    }

    // Safe narrowing: the range was validated in the init block above.
    private val vectorSize: Int = dim.toInt()

    /** Identifier of this backend: `"my-embedder"`. */
    override fun name(): String = "my-embedder"

    /** Version string of this backend: `"1.0.0"`. */
    override fun version(): String = "1.0.0"

    /** Number of elements in every vector produced by [embed]. */
    override fun dimensions(): Long = dim

    /**
     * Produces one vector per input text.
     *
     * @param texts inputs to embed; an empty list yields an empty result
     * @return a list parallel to [texts], each entry holding `dimensions()` zeros
     */
    override fun embed(texts: List<String>): List<List<Float>> {
        // Replace this with a real model invocation. Each inner list must
        // have exactly `dimensions()` elements — the bridge validates shape.
        return texts.map { List(vectorSize) { 0.0f } }
    }
}
|
||||
|
||||
/** Builds a [MyEmbedder] with its default dimension and hands it to [EmbeddingBackendBridge]. */
fun registerMyEmbedder() {
    val backend = MyEmbedder()
    EmbeddingBackendBridge.registerEmbeddingBackend(backend)
}
|
||||
```
|
||||
21
docs/snippets/kotlin/plugins/extractor_registration.md
Normal file
21
docs/snippets/kotlin/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
|
||||
// The Kotlin/Java plugin bridge does not expose an IDocumentExtractor interface
|
||||
// — extractor registration lives in the Rust core. From Kotlin you can list
|
||||
// the extractors that are already registered and route extraction through the
|
||||
// existing facade.
|
||||
/**
 * Prints the extractor names reported by Kreuzberg.listDocumentExtractors(),
 * then runs a synchronous extraction of `document.pdf` with a default
 * [ExtractionConfig] and prints the content length and MIME type of the result.
 */
fun useRegisteredExtractors() {
    val extractorNames: List<String> = Kreuzberg.listDocumentExtractors()
    println("Available extractors: $extractorNames")

    val defaultConfig = ExtractionConfig.builder().build()
    val pdfPath = java.nio.file.Path.of("document.pdf")
    // No explicit MIME type is supplied (null) — presumably the core detects it; confirm.
    val extraction: ExtractionResult = Kreuzberg.extractFileSync(pdfPath, null, defaultConfig)

    val charCount = extraction.content().length
    val mime = extraction.mimeType()
    println("Extracted $charCount characters via $mime")
}
|
||||
```
|
||||
18
docs/snippets/kotlin/plugins/list_plugins.md
Normal file
18
docs/snippets/kotlin/plugins/list_plugins.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
|
||||
/**
 * Prints, one line per plugin kind, the names returned by the four
 * Kreuzberg.list*() helpers: document extractors, post-processors,
 * OCR backends, and validators.
 */
fun listAllPlugins() {
    println("Registered extractors: ${Kreuzberg.listDocumentExtractors()}")
    println("Registered post-processors: ${Kreuzberg.listPostProcessors()}")
    println("Registered OCR backends: ${Kreuzberg.listOcrBackends()}")
    println("Registered validators: ${Kreuzberg.listValidators()}")
}
|
||||
```
|
||||
28
docs/snippets/kotlin/plugins/min_length_validator.md
Normal file
28
docs/snippets/kotlin/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * [IValidator] that rejects extraction results whose content is shorter than
 * a configured number of characters.
 *
 * @param minLength smallest content length (in characters) that passes validation
 */
class MinLengthValidator(private val minLength: Int) : IValidator {
    override fun name(): String {
        return "min-length-validator"
    }

    override fun version(): String {
        return "1.0.0"
    }

    /**
     * Fails when the content length is below [minLength].
     *
     * @throws IllegalStateException if the content is too short
     */
    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        val length = result.content().length
        // check() raises IllegalStateException carrying the lazily built message.
        check(length >= minLength) {
            "Content too short: $length < $minLength characters"
        }
    }

    /** Always `true`: this validator applies to every result. */
    override fun should_validate(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return true
    }

    override fun priority(): Int {
        return 100
    }
}
|
||||
|
||||
/** Registers a [MinLengthValidator] requiring at least 100 characters via [ValidatorBridge]. */
fun registerMinLengthValidator() {
    val validator = MinLengthValidator(minLength = 100)
    ValidatorBridge.registerValidator(validator)
}
|
||||
```
|
||||
38
docs/snippets/kotlin/plugins/pdf_metadata_extractor.md
Normal file
38
docs/snippets/kotlin/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.util.concurrent.atomic.AtomicInteger
|
||||
|
||||
/**
 * [IPostProcessor] that prints the title and authors of PDF results and keeps
 * a running count of how many PDFs it has handled.
 */
class PdfMetadataExtractor : IPostProcessor {
    // Atomic so the counter stays correct if process() is ever called concurrently.
    private val processed = AtomicInteger(0)

    override fun name(): String {
        return "pdf-metadata-extractor"
    }

    override fun version(): String {
        return "1.0.0"
    }

    /** Prints one `[pdf-metadata]` line for PDF results; does nothing for any other MIME type. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        if (result.mimeType() == PDF_MIME_TYPE) {
            val count = processed.incrementAndGet()
            // Metadata is an immutable record, so its fields are read and
            // reported rather than modified.
            val metadata: Metadata = result.metadata()
            println("[pdf-metadata] #$count title=${metadata.title()} authors=${metadata.authors()}")
        }
    }

    override fun processing_stage(): ProcessingStage {
        return ProcessingStage.Late
    }

    /** `true` only when the result's MIME type is `application/pdf`. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return _result.mimeType() == PDF_MIME_TYPE
    }

    override fun estimated_duration_ms(_result: ExtractionResult): Long {
        return 2L
    }

    override fun priority(): Int {
        return 25
    }

    private companion object {
        const val PDF_MIME_TYPE = "application/pdf"
    }
}
|
||||
|
||||
/** Creates a [PdfMetadataExtractor] and registers it through [PostProcessorBridge]. */
fun registerPdfMetadataExtractor() {
    val processor = PdfMetadataExtractor()
    PostProcessorBridge.registerPostProcessor(processor)
}
|
||||
```
|
||||
30
docs/snippets/kotlin/plugins/pdf_only_processor.md
Normal file
30
docs/snippets/kotlin/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * [IPostProcessor] that handles only `application/pdf` results and prints the
 * length of their content.
 */
class PdfOnlyProcessor : IPostProcessor {
    override fun name(): String {
        return "pdf-only-processor"
    }

    override fun version(): String {
        return "1.0.0"
    }

    /** Prints one `[pdf-only]` line for PDF results; returns silently otherwise. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        // Re-checked here in addition to should_process(); the should_process()
        // gate is what saves the JSON roundtrip when it returns false.
        val isPdf = result.mimeType() == "application/pdf"
        if (isPdf) {
            println("[pdf-only] processing PDF (${result.content().length} chars)")
        }
    }

    override fun processing_stage(): ProcessingStage {
        return ProcessingStage.Middle
    }

    /** `true` only when the result's MIME type is `application/pdf`. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return _result.mimeType() == "application/pdf"
    }

    override fun estimated_duration_ms(_result: ExtractionResult): Long {
        return 5L
    }

    override fun priority(): Int {
        return 50
    }
}
|
||||
|
||||
/** Creates a [PdfOnlyProcessor] and registers it through [PostProcessorBridge]. */
fun registerPdfOnlyProcessor() {
    val processor = PdfOnlyProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
|
||||
```
|
||||
33
docs/snippets/kotlin/plugins/plugin_extractor.md
Normal file
33
docs/snippets/kotlin/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Path
|
||||
|
||||
// The Kotlin/Java bindings expose plugin bridges for IPostProcessor,
|
||||
// IValidator, IOcrBackend, and IEmbeddingBackend. There is no
|
||||
// IDocumentExtractor bridge — extractor selection happens entirely in the
|
||||
// Rust core based on MIME type. From Kotlin, the "extractor plugin" pattern
|
||||
// is to wrap Kreuzberg.extractBytes / extractFile and dispatch to the right
|
||||
// extractor by MIME.
|
||||
/**
 * Thin wrapper that forwards byte and file extraction requests to the
 * [Kreuzberg] facade, supplying a default [ExtractionConfig] when the caller
 * does not pass one.
 */
class GenericExtractorClient {
    /**
     * Extracts from an in-memory payload.
     *
     * @param content raw bytes to extract from
     * @param mimeType MIME type passed through to Kreuzberg.extractBytes
     * @param config extraction settings; defaults to a freshly built config
     * @return the result returned by Kreuzberg.extractBytes
     */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig = ExtractionConfig.builder().build(),
    ): ExtractionResult {
        return Kreuzberg.extractBytes(content, mimeType, config)
    }

    /**
     * Extracts from a file on disk.
     *
     * @param path file to extract from
     * @param mimeType optional MIME type passed through to Kreuzberg.extractFile; may be null
     * @param config extraction settings; defaults to a freshly built config
     * @return the result returned by Kreuzberg.extractFile
     */
    suspend fun extractFile(
        path: Path,
        mimeType: String? = null,
        config: ExtractionConfig = ExtractionConfig.builder().build(),
    ): ExtractionResult {
        return Kreuzberg.extractFile(path, mimeType, config)
    }
}
|
||||
|
||||
/**
 * Reads `payload.json` from the working directory, extracts it as
 * `application/json` through a [GenericExtractorClient], and prints the
 * length of the extracted content.
 */
suspend fun extractCustomPayload() {
    val payload = Files.readAllBytes(Path.of("payload.json"))
    val extraction = GenericExtractorClient().extractBytes(payload, mimeType = "application/json")
    val charCount = extraction.content().length
    println("Extracted $charCount chars")
}
|
||||
```
|
||||
41
docs/snippets/kotlin/plugins/plugin_logging.md
Normal file
41
docs/snippets/kotlin/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.util.logging.Logger
|
||||
|
||||
/**
 * [IPostProcessor] that logs its lifecycle events and a one-line summary of
 * every result it receives, using `java.util.logging`.
 */
class LoggingPostProcessor : IPostProcessor {
    private val log: Logger = Logger.getLogger(LoggingPostProcessor::class.java.name)

    override fun name(): String {
        return "logging-post-processor"
    }

    override fun version(): String {
        return "1.0.0"
    }

    /** Logs that the plugin is being initialized. */
    override fun initialize() = log.info("Initializing plugin: ${name()}")

    /** Logs that the plugin is shutting down. */
    override fun shutdown() = log.info("Shutting down plugin: ${name()}")

    /** Logs the MIME type and content length, plus a warning when the content is empty. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        log.info("Processing ${result.mimeType()} (${result.content().length} chars)")
        val nothingExtracted = result.content().isEmpty()
        if (nothingExtracted) {
            log.warning("Extraction resulted in empty content")
        }
    }

    override fun processing_stage(): ProcessingStage {
        return ProcessingStage.Late
    }

    /** Always `true`: every result is logged. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return true
    }

    override fun estimated_duration_ms(_result: ExtractionResult): Long {
        return 1L
    }

    override fun priority(): Int {
        return 10
    }
}
|
||||
|
||||
/** Creates a [LoggingPostProcessor] and registers it through [PostProcessorBridge]. */
fun registerLoggingPostProcessor() {
    val processor = LoggingPostProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
|
||||
```
|
||||
55
docs/snippets/kotlin/plugins/plugin_testing.md
Normal file
55
docs/snippets/kotlin/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,55 @@
|
||||
<!-- snippet:skip reason="kotlin.test is not on the snippet-runner classpath; the plugin-testing pattern documented here cannot compile under the runner's lightweight Kotlin profile. Run these tests from a real Gradle build." -->
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
import kotlin.test.Test
|
||||
import kotlin.test.assertEquals
|
||||
import kotlin.test.assertFailsWith
|
||||
import kotlin.test.assertTrue
|
||||
|
||||
/**
 * Unit tests for MinLengthValidator: direct calls to its methods plus a
 * register/list/unregister round-trip through the bridge.
 */
class MinLengthValidatorTest {

    /** A freshly built default config for each access, as each test needs its own. */
    private val defaultConfig: ExtractionConfig
        get() = ExtractionConfig.builder().build()

    /** Builds a minimal `text/plain` result carrying [content] and otherwise empty fields. */
    private fun resultWith(content: String): ExtractionResult {
        val builder = ExtractionResult.builder()
            .content(content)
            .mimeType("text/plain")
            .metadata(Metadata.builder().build())
            .tables(emptyList())
            .processingWarnings(emptyList())
        return builder.build()
    }

    @Test
    fun `validate accepts content above minimum length`() {
        val longEnough = resultWith("hello world")
        // Passing means validate() returns without throwing.
        MinLengthValidator(minLength = 5).validate(longEnough, defaultConfig)
    }

    @Test
    fun `validate rejects content below minimum length`() {
        val tooShort = resultWith("too short")
        val strict = MinLengthValidator(minLength = 100)
        assertFailsWith<IllegalStateException> {
            strict.validate(tooShort, defaultConfig)
        }
    }

    @Test
    fun `priority and name are stable`() {
        val subject = MinLengthValidator(minLength = 1)
        assertEquals("min-length-validator", subject.name())
        assertEquals(100, subject.priority())
        assertTrue(subject.should_validate(resultWith(""), defaultConfig))
    }

    @Test
    fun `registration round-trip exposes the plugin in the listing`() {
        ValidatorBridge.registerValidator(MinLengthValidator(minLength = 1))
        try {
            val registered = Kreuzberg.listValidators()
            assertTrue("min-length-validator" in registered)
        } finally {
            // Always unregister so the validator does not leak into other tests.
            ValidatorBridge.unregisterValidator("min-length-validator")
        }
    }
}
|
||||
```
|
||||
44
docs/snippets/kotlin/plugins/plugin_validator.md
Normal file
44
docs/snippets/kotlin/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
// Generic validator pattern: every IValidator has the same shape.
|
||||
// `name()` keys the registry, `priority()` orders execution (higher = earlier),
|
||||
// `should_validate()` is a fast skip-check, and `validate()` throws on failure.
|
||||
/**
 * Reusable [IValidator] whose name, priority, and validation logic are all
 * supplied by the caller.
 *
 * @param pluginName value returned by [name]
 * @param pluginPriority value returned by [priority]
 * @param check callback run by [validate]; it signals failure by throwing
 */
class GenericValidator(
    private val pluginName: String,
    private val pluginPriority: Int,
    private val check: (ExtractionResult, ExtractionConfig) -> Unit,
) : IValidator {
    override fun name(): String {
        return pluginName
    }

    override fun version(): String {
        return "1.0.0"
    }

    override fun initialize() {
        // Optional hook: open resources, load config files, etc.
    }

    override fun shutdown() {
        // Optional hook: release whatever initialize() acquired.
    }

    /** Delegates to the callback given at construction; any exception it throws propagates. */
    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        // Explicit invoke() makes clear this is the constructor callback,
        // not the standard-library check() precondition helper.
        check.invoke(result, config)
    }

    /** Always `true`: the callback is run for every result. */
    override fun should_validate(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return true
    }

    override fun priority(): Int {
        return pluginPriority
    }
}
|
||||
|
||||
/**
 * Registers a [GenericValidator] named `non-empty-content` with priority 200
 * whose check throws [IllegalArgumentException] when the extracted content is
 * blank.
 */
fun registerGenericValidator() {
    val rejectBlankContent: (ExtractionResult, ExtractionConfig) -> Unit = { result, _ ->
        require(result.content().isNotBlank()) { "Extracted content is blank" }
    }
    ValidatorBridge.registerValidator(
        GenericValidator(
            pluginName = "non-empty-content",
            pluginPriority = 200,
            check = rejectBlankContent,
        ),
    )
}
|
||||
```
|
||||
28
docs/snippets/kotlin/plugins/quality_score_validator.md
Normal file
28
docs/snippets/kotlin/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * [IValidator] that rejects extraction results whose quality score falls
 * below a threshold.
 *
 * @param threshold minimum acceptable quality score; defaults to 0.5
 */
class QualityScoreValidator(private val threshold: Double = 0.5) : IValidator {
    override fun name(): String = "quality-score-validator"

    override fun version(): String = "1.0.0"

    /**
     * Fails when the result's quality score is below [threshold]. A missing
     * score is treated as 0.0.
     *
     * @throws IllegalStateException if the score is too low
     */
    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        val score = result.qualityScore() ?: 0.0
        if (score < threshold) {
            // Locale.ROOT keeps the decimal separator a '.' regardless of the
            // JVM default locale, so the message is stable across machines.
            throw IllegalStateException(
                "Quality score too low: %.2f < %.2f".format(java.util.Locale.ROOT, score, threshold),
            )
        }
    }

    /** `true` only when the result actually carries a quality score. */
    override fun should_validate(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = _result.qualityScore() != null

    override fun priority(): Int = 50
}
|
||||
|
||||
/** Registers a [QualityScoreValidator] with a 0.5 threshold through [ValidatorBridge]. */
fun registerQualityScoreValidator() {
    val validator = QualityScoreValidator(threshold = 0.5)
    ValidatorBridge.registerValidator(validator)
}
|
||||
```
|
||||
47
docs/snippets/kotlin/plugins/stateful_plugin.md
Normal file
47
docs/snippets/kotlin/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.util.concurrent.ConcurrentHashMap
|
||||
import java.util.concurrent.atomic.AtomicLong
|
||||
|
||||
/**
 * [IPostProcessor] that keeps state between calls: an invocation counter and
 * a small cache holding the MIME type and ordinal of the most recent call.
 */
class StatefulPlugin : IPostProcessor {
    private val invocations = AtomicLong(0)
    private val cache: ConcurrentHashMap<String, String> = ConcurrentHashMap()

    override fun name(): String {
        return "stateful-plugin"
    }

    override fun version(): String {
        return "1.0.0"
    }

    /** Resets the counter to zero and empties the cache. */
    override fun initialize() {
        invocations.set(0)
        cache.clear()
    }

    /** Prints the total number of calls seen, then empties the cache. */
    override fun shutdown() {
        println("Plugin called ${invocations.get()} times")
        cache.clear()
    }

    /** Increments the counter and records the result's MIME type and the new count. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        val ordinal = invocations.incrementAndGet()
        cache[KEY_LAST_MIME] = result.mimeType()
        cache[KEY_LAST_CALL] = ordinal.toString()
    }

    override fun processing_stage(): ProcessingStage {
        return ProcessingStage.Middle
    }

    /** Always `true`: every result is counted. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return true
    }

    override fun estimated_duration_ms(_result: ExtractionResult): Long {
        return 1L
    }

    override fun priority(): Int {
        return 50
    }

    /** Number of process() calls since the last initialize(). */
    fun callCount(): Long {
        return invocations.get()
    }

    /** MIME type recorded by the most recent process() call, or null if none is cached. */
    fun lastMime(): String? {
        return cache[KEY_LAST_MIME]
    }

    private companion object {
        const val KEY_LAST_MIME = "last_mime"
        const val KEY_LAST_CALL = "last_call"
    }
}
|
||||
|
||||
/** Creates a [StatefulPlugin] and registers it through [PostProcessorBridge]. */
fun registerStatefulPlugin() {
    val plugin = StatefulPlugin()
    PostProcessorBridge.registerPostProcessor(plugin)
}
|
||||
```
|
||||
12
docs/snippets/kotlin/plugins/unregister_plugins.md
Normal file
12
docs/snippets/kotlin/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * Unregisters one plugin of each kind by name.
 *
 * Every bridge class offers a static unregister helper; the string passed in
 * is whatever the plugin's name() method returns.
 */
fun unregisterPlugins() {
    val postProcessorName = "word-count"
    val validatorName = "min-length-validator"
    val ocrBackendName = "my-ocr-backend"
    val embeddingBackendName = "my-embedder"

    PostProcessorBridge.unregisterPostProcessor(postProcessorName)
    ValidatorBridge.unregisterValidator(validatorName)
    OcrBackendBridge.unregisterOcrBackend(ocrBackendName)
    EmbeddingBackendBridge.unregisterEmbeddingBackend(embeddingBackendName)
}
|
||||
```
|
||||
30
docs/snippets/kotlin/plugins/word_count_processor.md
Normal file
30
docs/snippets/kotlin/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * [IPostProcessor] that counts the whitespace-separated words in a result's
 * content and prints the count alongside the MIME type.
 */
class WordCountProcessor : IPostProcessor {
    // Compiled once per instance instead of on every process() call.
    private val whitespaceRun = Regex("\\s+")

    override fun name(): String = "word-count"

    override fun version(): String = "1.0.0"

    /** Prints one `[word-count]` line for the given result. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        // Leading/trailing whitespace yields empty fragments from split();
        // those are filtered out so they are not counted as words.
        val wordCount = result.content().split(whitespaceRun).count { it.isNotEmpty() }
        // ExtractionResult is an immutable record on the Java side; observe
        // and report rather than mutate.
        println("[word-count] ${result.mimeType()} -> $wordCount words")
    }

    override fun processing_stage(): ProcessingStage = ProcessingStage.Early

    /** `true` only when the result has non-empty content. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = _result.content().isNotEmpty()

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L

    override fun priority(): Int = 50
}
|
||||
|
||||
/** Creates a [WordCountProcessor] and registers it through [PostProcessorBridge]. */
fun registerWordCountProcessor() {
    val processor = WordCountProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
|
||||
```
|
||||
Reference in New Issue
Block a user