Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
/**
 * Empties the post-processor, OCR-backend, and validator registries, then
 * prints a confirmation line.
 *
 * Document extractors are not touched: there is no
 * Kreuzberg.clearDocumentExtractors(), because extractor registration is not
 * exposed through the Kotlin/Java plugin bridge.
 */
fun clearAllPlugins() {
    Kreuzberg.clearPostProcessors()
    Kreuzberg.clearOcrBackends()
    Kreuzberg.clearValidators()

    println("All post-processors, OCR backends, and validators cleared")
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Wraps a host-language embedding model so kreuzberg can call back into it
 * during chunking and standalone embed requests.
 *
 * This placeholder returns all-zero vectors; replace the body of [embed] with
 * a real model invocation.
 *
 * @param dim length of every embedding vector; must be in `1..Int.MAX_VALUE`.
 * @throws IllegalArgumentException if [dim] is outside that range.
 */
class MyEmbedder(private val dim: Long = 768L) : IEmbeddingBackend {
    init {
        // Lists are Int-sized: a larger value would be silently truncated by
        // toInt() in embed() and no longer match what dimensions() reports.
        require(dim in 1L..Int.MAX_VALUE.toLong()) {
            "dim must be in 1..${Int.MAX_VALUE}, got $dim"
        }
    }

    override fun name(): String = "my-embedder"

    override fun version(): String = "1.0.0"

    override fun dimensions(): Long = dim

    /**
     * Produces one vector per input text, in input order.
     *
     * @param texts the strings to embed.
     * @return a list with `texts.size` entries, each holding `dimensions()` floats.
     */
    override fun embed(texts: List<String>): List<List<Float>> {
        // Replace this with a real model invocation. Each inner list must
        // have exactly `dimensions()` elements — the bridge validates shape.
        val width = dim.toInt() // safe: range checked in init
        return texts.map { List(width) { 0.0f } }
    }
}
/** Registers a [MyEmbedder] with its default dimension through the embedding-backend bridge. */
fun registerMyEmbedder() {
    val backend = MyEmbedder()
    EmbeddingBackendBridge.registerEmbeddingBackend(backend)
}
```

View File

@@ -0,0 +1,21 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
/**
 * Prints the names of the registered document extractors, then extracts
 * `document.pdf` through the facade and prints a one-line summary.
 *
 * The Kotlin/Java plugin bridge does not expose an IDocumentExtractor
 * interface — extractor registration lives in the Rust core. From Kotlin you
 * can list the extractors that are already registered and route extraction
 * through the existing facade.
 */
fun useRegisteredExtractors() {
    val extractorNames: List<String> = Kreuzberg.listDocumentExtractors()
    println("Available extractors: $extractorNames")

    val defaults = ExtractionConfig.builder().build()
    val source = java.nio.file.Path.of("document.pdf")
    // NOTE(review): the null second argument is presumably an unset MIME-type
    // hint — confirm against the extractFileSync signature.
    val extraction: ExtractionResult = Kreuzberg.extractFileSync(source, null, defaults)

    println("Extracted ${extraction.content().length} characters via ${extraction.mimeType()}")
}
```

View File

@@ -0,0 +1,18 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
/**
 * Prints the registered plugin names for each registry the facade can list:
 * document extractors, post-processors, OCR backends, and validators.
 */
fun listAllPlugins() {
    val extractorNames: List<String> = Kreuzberg.listDocumentExtractors()
    println("Registered extractors: $extractorNames")

    val postProcessorNames: List<String> = Kreuzberg.listPostProcessors()
    println("Registered post-processors: $postProcessorNames")

    val ocrBackendNames: List<String> = Kreuzberg.listOcrBackends()
    println("Registered OCR backends: $ocrBackendNames")

    val validatorNames: List<String> = Kreuzberg.listValidators()
    println("Registered validators: $validatorNames")
}
```

View File

@@ -0,0 +1,28 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Validator that rejects extraction results whose content is shorter than
 * [minLength] characters.
 *
 * @param minLength minimum accepted `content().length`.
 */
class MinLengthValidator(private val minLength: Int) : IValidator {
    override fun name(): String = "min-length-validator"

    override fun version(): String = "1.0.0"

    /**
     * Checks the content length of [result].
     *
     * @throws IllegalStateException if the content has fewer than `minLength` characters.
     */
    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        val length = result.content().length
        // check() raises IllegalStateException with the lazily built message.
        check(length >= minLength) { "Content too short: $length < $minLength characters" }
    }

    /** Always returns true: this validator applies to every result. */
    override fun should_validate(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return true
    }

    override fun priority(): Int = 100
}
/** Registers a [MinLengthValidator] that requires at least 100 characters of content. */
fun registerMinLengthValidator() {
    val validator = MinLengthValidator(minLength = 100)
    ValidatorBridge.registerValidator(validator)
}
```

View File

@@ -0,0 +1,38 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.util.concurrent.atomic.AtomicInteger
/**
 * Post-processor that reports the title and authors of every PDF result it
 * sees, together with a running count of PDFs processed by this instance.
 */
class PdfMetadataExtractor : IPostProcessor {
    // Number of PDF results handled so far; atomic so increments are not lost.
    private val processed = AtomicInteger(0)

    override fun name(): String = "pdf-metadata-extractor"

    override fun version(): String = "1.0.0"

    /** Prints a `[pdf-metadata]` line for PDF results; returns immediately for anything else. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        if (!isPdf(result)) {
            return
        }
        val sequence = processed.incrementAndGet()
        val pdfMetadata: Metadata = result.metadata()
        // Metadata is an immutable record — read PDF metadata fields rather
        // than mutate. Reporting via stdout/log keeps the snippet honest.
        println("[pdf-metadata] #$sequence title=${pdfMetadata.title()} authors=${pdfMetadata.authors()}")
    }

    override fun processing_stage(): ProcessingStage = ProcessingStage.Late

    /** Returns true only when the result's MIME type is `application/pdf`. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = isPdf(_result)

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 2L

    override fun priority(): Int = 25

    // Single definition of the MIME test used by both process() and should_process().
    private fun isPdf(candidate: ExtractionResult): Boolean = candidate.mimeType() == PDF_MIME_TYPE

    private companion object {
        const val PDF_MIME_TYPE = "application/pdf"
    }
}
/** Registers a new [PdfMetadataExtractor] through the post-processor bridge. */
fun registerPdfMetadataExtractor() {
    val processor = PdfMetadataExtractor()
    PostProcessorBridge.registerPostProcessor(processor)
}
```

View File

@@ -0,0 +1,30 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Post-processor that only acts on PDF results and prints their content length.
 */
class PdfOnlyProcessor : IPostProcessor {
    override fun name(): String = "pdf-only-processor"

    override fun version(): String = "1.0.0"

    /** Prints a `[pdf-only]` line for PDF results; does nothing for other MIME types. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        // should_process() is the gate — when it returns false the JSON
        // roundtrip is saved. The same check is repeated here as a guard.
        val isPdf = result.mimeType() == "application/pdf"
        if (!isPdf) {
            return
        }
        println("[pdf-only] processing PDF (${result.content().length} chars)")
    }

    override fun processing_stage(): ProcessingStage = ProcessingStage.Middle

    /** Returns true only when the result's MIME type is `application/pdf`. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return _result.mimeType() == "application/pdf"
    }

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 5L

    override fun priority(): Int = 50
}
/** Registers a new [PdfOnlyProcessor] through the post-processor bridge. */
fun registerPdfOnlyProcessor() {
    val processor = PdfOnlyProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
```

View File

@@ -0,0 +1,33 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
import java.nio.file.Files
import java.nio.file.Path
/**
 * Thin client over the Kreuzberg extraction facade.
 *
 * The Kotlin/Java bindings expose plugin bridges for IPostProcessor,
 * IValidator, IOcrBackend, and IEmbeddingBackend. There is no
 * IDocumentExtractor bridge — extractor selection happens entirely in the
 * Rust core based on MIME type. From Kotlin, the "extractor plugin" pattern
 * is to wrap Kreuzberg.extractBytes / extractFile and dispatch to the right
 * extractor by MIME.
 */
class GenericExtractorClient {
    /**
     * Extracts from an in-memory payload.
     *
     * @param content raw bytes of the document.
     * @param mimeType MIME type of [content].
     * @param config extraction settings; a default-built config when omitted.
     * @return the result produced by `Kreuzberg.extractBytes`.
     */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig = ExtractionConfig.builder().build(),
    ): ExtractionResult {
        return Kreuzberg.extractBytes(content, mimeType, config)
    }

    /**
     * Extracts from a file on disk.
     *
     * @param path location of the document.
     * @param mimeType optional MIME type, passed through as-is (null by default).
     * @param config extraction settings; a default-built config when omitted.
     * @return the result produced by `Kreuzberg.extractFile`.
     */
    suspend fun extractFile(
        path: Path,
        mimeType: String? = null,
        config: ExtractionConfig = ExtractionConfig.builder().build(),
    ): ExtractionResult {
        return Kreuzberg.extractFile(path, mimeType, config)
    }
}
/**
 * Reads `payload.json` from the working directory, extracts it as
 * `application/json` through [GenericExtractorClient], and prints the
 * length of the extracted content.
 */
suspend fun extractCustomPayload() {
    val client = GenericExtractorClient()
    // NOTE: Files.readAllBytes blocks the calling thread while the file is read.
    val payloadPath = Path.of("payload.json")
    val payload = Files.readAllBytes(payloadPath)
    val extraction = client.extractBytes(payload, mimeType = "application/json")
    println("Extracted ${extraction.content().length} chars")
}
```

View File

@@ -0,0 +1,41 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.util.logging.Logger
/**
 * Post-processor that logs its lifecycle events and a one-line summary of
 * every result through `java.util.logging`.
 */
class LoggingPostProcessor : IPostProcessor {
    private val logger: Logger = Logger.getLogger(LoggingPostProcessor::class.java.name)

    override fun name(): String = "logging-post-processor"

    override fun version(): String = "1.0.0"

    /** Logs that the plugin is being initialized. */
    override fun initialize() {
        logger.info("Initializing plugin: ${name()}")
    }

    /** Logs that the plugin is being shut down. */
    override fun shutdown() {
        logger.info("Shutting down plugin: ${name()}")
    }

    /** Logs the MIME type and content length, plus a warning when the content is empty. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        logger.info("Processing ${result.mimeType()} (${result.content().length} chars)")
        val hasNoContent = result.content().isEmpty()
        if (hasNoContent) {
            logger.warning("Extraction resulted in empty content")
        }
    }

    override fun processing_stage(): ProcessingStage = ProcessingStage.Late

    /** Always returns true: every result is logged. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return true
    }

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L

    override fun priority(): Int = 10
}
/** Registers a new [LoggingPostProcessor] through the post-processor bridge. */
fun registerLoggingPostProcessor() {
    val processor = LoggingPostProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
```

View File

@@ -0,0 +1,55 @@
<!-- snippet:skip reason="kotlin.test is not on the snippet-runner classpath; the plugin-testing pattern documented here cannot compile under the runner's lightweight Kotlin profile. Run these tests from a real Gradle build." -->
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertFailsWith
import kotlin.test.assertTrue
/**
 * Unit tests for MinLengthValidator: acceptance, rejection, the stable
 * name/priority values, and a register/list/unregister round-trip.
 */
class MinLengthValidatorTest {
    /** Builds a fresh default extraction config for each call. */
    private fun defaultConfig(): ExtractionConfig = ExtractionConfig.builder().build()

    /** Builds a minimal `text/plain` result carrying [content] and otherwise empty fields. */
    private fun makeResult(content: String): ExtractionResult {
        val builder = ExtractionResult.builder()
            .content(content)
            .mimeType("text/plain")
            .metadata(Metadata.builder().build())
            .tables(emptyList())
            .processingWarnings(emptyList())
        return builder.build()
    }

    @Test
    fun `validate accepts content above minimum length`() {
        val validator = MinLengthValidator(minLength = 5)
        val longEnough = makeResult("hello world")

        // Passing means validate() returns without throwing.
        validator.validate(longEnough, defaultConfig())
    }

    @Test
    fun `validate rejects content below minimum length`() {
        val validator = MinLengthValidator(minLength = 100)
        val tooShort = makeResult("too short")

        assertFailsWith<IllegalStateException> {
            validator.validate(tooShort, defaultConfig())
        }
    }

    @Test
    fun `priority and name are stable`() {
        val validator = MinLengthValidator(minLength = 1)

        assertEquals("min-length-validator", validator.name())
        assertEquals(100, validator.priority())
        assertTrue(validator.should_validate(makeResult(""), defaultConfig()))
    }

    @Test
    fun `registration round-trip exposes the plugin in the listing`() {
        ValidatorBridge.registerValidator(MinLengthValidator(minLength = 1))
        try {
            val registered = Kreuzberg.listValidators()
            assertTrue("min-length-validator" in registered)
        } finally {
            // Always unregister so the validator does not stay registered after this test.
            ValidatorBridge.unregisterValidator("min-length-validator")
        }
    }
}
```

View File

@@ -0,0 +1,44 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Generic validator pattern: every IValidator has the same shape.
 * `name()` keys the registry, `priority()` orders execution (higher = earlier),
 * `should_validate()` is a fast skip-check, and `validate()` throws on failure.
 *
 * @param pluginName value returned by [name].
 * @param pluginPriority value returned by [priority].
 * @param check validation callback; it signals failure by throwing.
 */
class GenericValidator(
    private val pluginName: String,
    private val pluginPriority: Int,
    private val check: (ExtractionResult, ExtractionConfig) -> Unit,
) : IValidator {
    override fun name(): String {
        return pluginName
    }

    override fun version(): String = "1.0.0"

    override fun initialize() {
        // Optional: open resources, load config files, etc.
    }

    override fun shutdown() {
        // Optional: release resources held in initialize().
    }

    /** Delegates to the `check` callback; whatever it throws propagates to the caller. */
    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        // Explicit invoke() so this is not mistaken for the stdlib check() function.
        check.invoke(result, config)
    }

    /** Always returns true: the callback runs for every result. */
    override fun should_validate(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return true
    }

    override fun priority(): Int {
        return pluginPriority
    }
}
/**
 * Registers a [GenericValidator] named `non-empty-content` with priority 200
 * that fails when the extracted content is blank.
 */
fun registerGenericValidator() {
    // require() throws IllegalArgumentException with this message on blank content.
    val requireNonBlank: (ExtractionResult, ExtractionConfig) -> Unit = { result, _ ->
        require(result.content().isNotBlank()) { "Extracted content is blank" }
    }
    val validator = GenericValidator(
        pluginName = "non-empty-content",
        pluginPriority = 200,
        check = requireNonBlank,
    )
    ValidatorBridge.registerValidator(validator)
}
```

View File

@@ -0,0 +1,28 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Validator that rejects results whose quality score is below [threshold].
 *
 * @param threshold minimum accepted quality score (0.5 by default).
 */
class QualityScoreValidator(private val threshold: Double = 0.5) : IValidator {
    override fun name(): String = "quality-score-validator"

    override fun version(): String = "1.0.0"

    /**
     * Compares the result's quality score with the threshold; a missing score
     * is treated as 0.0.
     *
     * @throws IllegalStateException if the score is below the threshold.
     */
    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        val score = result.qualityScore() ?: 0.0
        if (score < threshold) {
            // Locale.ROOT keeps the decimal separator a '.' on every machine;
            // the default-locale overload would print e.g. "0,30" under de_DE.
            val message = "Quality score too low: %.2f < %.2f"
                .format(java.util.Locale.ROOT, score, threshold)
            throw IllegalStateException(message)
        }
    }

    /** Returns true only when the result carries a quality score. */
    override fun should_validate(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = _result.qualityScore() != null

    override fun priority(): Int = 50
}
/** Registers a [QualityScoreValidator] with a threshold of 0.5. */
fun registerQualityScoreValidator() {
    val validator = QualityScoreValidator(threshold = 0.5)
    ValidatorBridge.registerValidator(validator)
}
```

View File

@@ -0,0 +1,47 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicLong
/**
 * Post-processor that keeps per-instance state: a call counter and a small
 * cache holding the MIME type and call number of the most recent result.
 */
class StatefulPlugin : IPostProcessor {
    private val invocations = AtomicLong(0)
    private val lastSeen: ConcurrentHashMap<String, String> = ConcurrentHashMap()

    override fun name(): String = "stateful-plugin"

    override fun version(): String = "1.0.0"

    /** Resets the counter to zero and empties the cache. */
    override fun initialize() {
        invocations.set(0)
        lastSeen.clear()
    }

    /** Prints the total call count, then empties the cache. */
    override fun shutdown() {
        println("Plugin called ${invocations.get()} times")
        lastSeen.clear()
    }

    /** Increments the counter and records the MIME type and call number of [result]. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        val callNumber = invocations.incrementAndGet()
        lastSeen["last_mime"] = result.mimeType()
        lastSeen["last_call"] = callNumber.toString()
    }

    override fun processing_stage(): ProcessingStage = ProcessingStage.Middle

    /** Always returns true: every result is counted. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean {
        return true
    }

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L

    override fun priority(): Int = 50

    /** Number of `process` calls since construction or the last `initialize`. */
    fun callCount(): Long {
        return invocations.get()
    }

    /** MIME type of the most recently processed result, or null when the cache is empty. */
    fun lastMime(): String? {
        return lastSeen["last_mime"]
    }
}
/** Registers a new [StatefulPlugin] through the post-processor bridge. */
fun registerStatefulPlugin() {
    val plugin = StatefulPlugin()
    PostProcessorBridge.registerPostProcessor(plugin)
}
```

View File

@@ -0,0 +1,12 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Unregisters one plugin of each bridged type by name.
 *
 * Each plugin type has a static unregister helper on its bridge class; the
 * string argument is the name returned by the plugin's name() method.
 */
fun unregisterPlugins() {
    PostProcessorBridge.unregisterPostProcessor("word-count")
    ValidatorBridge.unregisterValidator("min-length-validator")
    OcrBackendBridge.unregisterOcrBackend("my-ocr-backend")
    EmbeddingBackendBridge.unregisterEmbeddingBackend("my-embedder")
}
```

View File

@@ -0,0 +1,30 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Post-processor that counts the whitespace-separated words in a result's
 * content and prints the total.
 */
class WordCountProcessor : IPostProcessor {
    override fun name(): String = "word-count"

    override fun version(): String = "1.0.0"

    /** Prints a `[word-count]` line with the MIME type and word count of [result]. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        // Leading/trailing whitespace yields empty fragments from split(), so
        // only non-empty pieces are counted.
        val wordCount = result.content().split(WHITESPACE_RUN).count { it.isNotEmpty() }
        // ExtractionResult is an immutable record on the Java side; observe
        // and report rather than mutate.
        println("[word-count] ${result.mimeType()} -> $wordCount words")
    }

    override fun processing_stage(): ProcessingStage = ProcessingStage.Early

    /** Returns true only when the result has non-empty content. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = _result.content().isNotEmpty()

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L

    override fun priority(): Int = 50

    private companion object {
        // Compiled once and shared, instead of recompiling the pattern on every process() call.
        val WHITESPACE_RUN = Regex("\\s+")
    }
}
/** Registers a new [WordCountProcessor] through the post-processor bridge. */
fun registerWordCountProcessor() {
    val processor = WordCountProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
```