Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,32 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with chunking and page extraction configured, then prints each chunk's page range. */
fun main() {
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .build()
    val pageConfig = PageConfig.builder()
        .withExtractPages(true)
        .build()
    val extraction = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .withPages(Optional.of(pageConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    for (chunk in extracted.chunks().orEmpty()) {
        val firstPage = chunk.metadata().firstPage()
        val lastPage = chunk.metadata().lastPage()
        // Chunks missing either page bound produce no output.
        if (firstPage == null || lastPage == null) continue
        val pageRange = when (firstPage) {
            lastPage -> "Page $firstPage"
            else -> "Pages $firstPage-$lastPage"
        }
        println("Chunk: ${chunk.content().take(50)}... ($pageRange)")
    }
}
```

View File

@@ -0,0 +1,60 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with chunking (max characters 1000, overlap 200) and prints the chunk count. */
fun main() {
    val extraction = ExtractionConfig.builder()
        .withChunking(
            Optional.of(
                ChunkingConfig.builder()
                    .withMaxCharacters(1000L)
                    .withOverlap(200L)
                    .build(),
            ),
        )
        .build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    // A null chunk list is reported as zero chunks.
    val chunkCount = extracted.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
```
```kotlin title="Kotlin - Semantic"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with the chunker type set to `Semantic` and prints the chunk count. */
fun main() {
    val semanticChunking = ChunkingConfig.builder()
        .withChunkerType(ChunkerType.Semantic)
        .build()
    val extraction = ExtractionConfig.builder()
        .withChunking(Optional.of(semanticChunking))
        .build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val chunkCount = extracted.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
```
```kotlin title="Kotlin - Prepend Heading Context"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.md` with the `Markdown` chunker and heading-context prepending enabled; prints the chunk count. */
fun main() {
    val markdownChunking = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withChunkerType(ChunkerType.Markdown)
        .withPrependHeadingContext(true)
        .build()
    val extraction = ExtractionConfig.builder()
        .withChunking(Optional.of(markdownChunking))
        .build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("document.md"), null, extraction)
    println("Chunks: ${extracted.chunks().orEmpty().size}")
}
```

View File

@@ -0,0 +1,35 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `research_paper.pdf` with chunking and an embedding config attached, then prints,
 * for every chunk, its position among the chunks, its byte range, a 100-character preview,
 * and the embedding size when an embedding is present.
 */
fun main() {
    val embedding = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .build()
    val chunking = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embedding))
        .build()
    val config = ExtractionConfig.builder()
        .withChunking(Optional.of(chunking))
        .build()

    val result = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, config)
    for (chunk in result.chunks().orEmpty()) {
        val metadata = chunk.metadata()
        // chunkIndex() is shown with +1 so the printed position starts at 1.
        println("Chunk ${metadata.chunkIndex() + 1}/${metadata.totalChunks()}")
        println("Position: ${metadata.byteStart()}-${metadata.byteEnd()}")
        val preview = chunk.content().take(100)
        println("Content: $preview...")
        // The lambda parameter is named `vector` so it does not shadow the `embedding` config above.
        chunk.embedding()?.let { vector ->
            println("Embedding: ${vector.size} dimensions")
        }
    }
}
```

View File

@@ -0,0 +1,27 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with chunking plus an embedding config (batch size 32, no download progress); prints the chunk count. */
fun main() {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .withShowDownloadProgress(false)
        .build()
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1024L)
        .withOverlap(100L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()
    val extraction = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val chunkCount = extracted.chunks()?.size ?: 0
    println("Chunks with embeddings: $chunkCount")
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with a `Yake` keyword config and prints the extracted keywords. */
fun main() {
    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.3f)
        .withNgramRange(listOf(1L, 3L))
        .withLanguage(Optional.of("en"))
        .build()
    val extraction = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    println("Keywords: ${extracted.extractedKeywords()}")
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `research_paper.pdf` with a `Yake` keyword config and prints the keywords only when they are non-null. */
fun main() {
    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.3f)
        .build()
    val extraction = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, extraction)
    val foundKeywords = extracted.extractedKeywords()
    if (foundKeywords != null) {
        println("Keywords: $foundKeywords")
    }
}
```

View File

@@ -0,0 +1,20 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with language detection enabled (min confidence 0.8, single language) and prints the result. */
fun main() {
    val detection = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(false)
        .build()
    val extraction = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(detection))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val languages = extracted.detectedLanguages()
    println("Detected languages: $languages")
}
```

View File

@@ -0,0 +1,20 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `multilingual_document.pdf` with multi-language detection switched on and prints the detected languages. */
fun main() {
    val detection = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(true)
        .build()
    val extraction = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(detection))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("multilingual_document.pdf"), null, extraction)
    val languages = extracted.detectedLanguages()
    println("Detected languages: $languages")
}
```

View File

@@ -0,0 +1,14 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with quality processing enabled and prints the quality score. */
fun main() {
    val extraction = ExtractionConfig.builder().withEnableQualityProcessing(true).build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val qualityScore = extracted.qualityScore()
    println("Quality score: $qualityScore")
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `scanned_document.pdf` with quality processing enabled and prints the score,
 * using a warning message when the score is below 0.5. Prints nothing when no score is returned.
 */
fun main() {
    val extraction = ExtractionConfig.builder()
        .withEnableQualityProcessing(true)
        .build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("scanned_document.pdf"), null, extraction)

    val qualityScore = extracted.qualityScore() ?: return
    val template = if (qualityScore < 0.5) {
        "Warning: Low quality extraction (%.2f)"
    } else {
        "Quality score: %.2f"
    }
    println(template.format(qualityScore))
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with token reduction in `"moderate"` mode and prints the resulting content. */
fun main() {
    val reduction = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()
    val extraction = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(reduction))
        .build()

    val reducedContent = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction).content()
    println("Reduced content: $reducedContent")
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `verbose_document.pdf` with token reduction in `"moderate"` mode and prints the content length. */
fun main() {
    val reduction = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()
    val extraction = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(reduction))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("verbose_document.pdf"), null, extraction)
    val reducedLength = extracted.content().length
    println("Reduced content length: $reducedLength")
}
```

View File

@@ -0,0 +1,52 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * A single row destined for a vector store.
 *
 * @property id identifier of the record
 * @property content text associated with the record
 * @property embedding embedding vector for [content]
 * @property metadata string key/value metadata attached to the record
 */
data class VectorRecord(
    val id: String,
    val content: String,
    val embedding: List<Float>,
    val metadata: Map<String, String>,
)
/**
 * Extracts [documentPath] with chunking and embeddings configured and converts every chunk
 * that carries an embedding into a [VectorRecord].
 *
 * @param documentPath path of the document to extract
 * @param documentId identifier used in each record id and in the `document_id` metadata entry
 * @return one record per chunk whose embedding is non-null, in chunk order
 */
fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .build()
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(512L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()
    val extraction = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get(documentPath), null, extraction)
    // The index is the chunk's position in the full chunk list, so skipped chunks leave gaps in the ids.
    return extracted.chunks().orEmpty().mapIndexedNotNull { position, chunk ->
        chunk.embedding()?.let { vector ->
            VectorRecord(
                id = "${documentId}_chunk_$position",
                content = chunk.content(),
                embedding = vector,
                metadata = mapOf(
                    "document_id" to documentId,
                    "chunk_index" to position.toString(),
                    "content_length" to chunk.content().length.toString(),
                ),
            )
        }
    }
}
```

View File

@@ -0,0 +1,16 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/** Batch-extracts two in-memory byte payloads (plain text and Markdown) and prints each result's content length. */
fun main() {
    val extraction = ExtractionConfig.builder().build()
    val payloads = listOf(
        BatchBytesItem("Hello, world!".toByteArray(), "text/plain", null),
        BatchBytesItem("# Heading\n\nParagraph text.".toByteArray(), "text/markdown", null),
    )

    val outcomes = Kreuzberg.batchExtractBytesSync(payloads, extraction)
    for ((position, outcome) in outcomes.withIndex()) {
        println("Item $position: ${outcome.content().length} chars")
    }
}
```

View File

@@ -0,0 +1,18 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
/** Batch-extracts three files and prints each result's content length. */
fun main() {
    val extraction = ExtractionConfig.builder().build()
    val files = listOf(
        BatchFileItem(Paths.get("doc1.pdf"), null),
        BatchFileItem(Paths.get("doc2.docx"), null),
        BatchFileItem(Paths.get("report.pdf"), null),
    )

    val outcomes = Kreuzberg.batchExtractFilesSync(files, extraction)
    for ((position, outcome) in outcomes.withIndex()) {
        println("File $position: ${outcome.content().length} chars")
    }
}
```

View File

@@ -0,0 +1,31 @@
```kotlin title="Kotlin"
import java.net.URI
import java.net.http.HttpClient
import java.net.http.HttpRequest
import java.net.http.HttpResponse
/** POSTs a JSON chunking request to `http://localhost:8000/chunk` and prints the response status and body. */
fun main() {
    // Every line of the raw string shares one indent, so trimIndent() yields the same unindented JSON text.
    val payload = """
        {
        "text": "Your long text here...",
        "chunker_type": "text",
        "config": {
        "max_characters": 1000,
        "overlap": 50,
        "trim": true
        }
        }
    """.trimIndent()

    val chunkRequest = HttpRequest.newBuilder()
        .uri(URI.create("http://localhost:8000/chunk"))
        .header("Content-Type", "application/json")
        .POST(HttpRequest.BodyPublishers.ofString(payload))
        .build()

    val reply = HttpClient.newHttpClient().send(chunkRequest, HttpResponse.BodyHandlers.ofString())
    println("Status: ${reply.statusCode()}")
    println(reply.body())
}
```

View File

@@ -0,0 +1,38 @@
```kotlin title="Kotlin"
import java.net.URI
import java.net.http.HttpClient
import java.net.http.HttpRequest
import java.net.http.HttpResponse
import java.nio.file.Files
import java.nio.file.Paths
/**
 * Uploads `document.pdf` to `http://localhost:8000/extract` as a single-part
 * `multipart/form-data` request and prints the response body.
 */
fun main() {
    val documentPath = Paths.get("document.pdf")
    val fileBytes = Files.readAllBytes(documentPath)
    val fileName = documentPath.fileName.toString()

    // The boundary embeds the current time in milliseconds.
    val boundary = "----KreuzbergBoundary${System.currentTimeMillis()}"
    val crlf = "\r\n"
    val partHeader = (
        "--$boundary$crlf" +
            "Content-Disposition: form-data; name=\"file\"; filename=\"$fileName\"$crlf" +
            "Content-Type: application/pdf$crlf$crlf"
        ).toByteArray()
    val closingDelimiter = "$crlf--$boundary--$crlf".toByteArray()
    // Body layout: part header, raw file bytes, closing delimiter.
    val multipartBody = partHeader + fileBytes + closingDelimiter

    val uploadRequest = HttpRequest.newBuilder()
        .uri(URI.create("http://localhost:8000/extract"))
        .header("Content-Type", "multipart/form-data; boundary=$boundary")
        .POST(HttpRequest.BodyPublishers.ofByteArray(multipartBody))
        .build()
    val reply = HttpClient.newHttpClient().send(uploadRequest, HttpResponse.BodyHandlers.ofString())
    println(reply.body())
}
```

View File

@@ -0,0 +1,45 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `report.pdf` with OCR, Markdown chunking, image extraction, caching and quality
 * processing configured together, then prints a content preview plus the chunk count, table
 * count, detected languages and extraction method when those values are present.
 */
fun main() {
    val ocr = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()
    val chunking = ChunkingConfig.builder()
        .withMaxCharacters(800L)
        .withOverlap(100L)
        // PascalCase constant, matching the other ChunkerType usages in these snippets.
        .withChunkerType(ChunkerType.Markdown)
        .withPrependHeadingContext(true)
        .build()
    val images = ImageExtractionConfig.builder()
        .withExtractImages(true)
        .build()
    val config = ExtractionConfig.builder()
        .withOcr(Optional.of(ocr))
        .withForceOcr(false)
        .withChunking(Optional.of(chunking))
        // PascalCase constant, matching `OutputFormat.Html` used in the HTML output snippet.
        .withOutputFormat(OutputFormat.Markdown)
        .withIncludeDocumentStructure(true)
        .withImages(Optional.of(images))
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .build()

    val result = Kreuzberg.extractFileSync(Paths.get("report.pdf"), null, config)
    val content = result.content()
    println("Content (${content.length} chars):")
    println(content.take(200))
    // Optional sections are printed only when the result provides them.
    result.chunks()?.let { println("\nChunks: ${it.size}") }
    println("Tables: ${result.tables()?.size ?: 0}")
    result.detectedLanguages()?.let { println("Languages: $it") }
    result.extractionMethod()?.let { println("Extraction method: $it") }
}
```

View File

@@ -0,0 +1,17 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
/**
 * Extracts `document.pdf` with a default config and prints its content. A
 * `KreuzbergRsException` is reported with its message and code; any other exception
 * is reported as unexpected.
 */
fun main() {
    val extraction = ExtractionConfig.builder().build()
    val documentPath = Paths.get("document.pdf")
    try {
        println(Kreuzberg.extractFileSync(documentPath, null, extraction).content())
    } catch (failure: KreuzbergRsException) {
        System.err.println("Extraction failed: ${failure.message}")
        System.err.println("Error code: ${failure.code}")
    } catch (failure: Exception) {
        System.err.println("Unexpected error: ${failure.message}")
    }
}
```

View File

@@ -0,0 +1,28 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Files
import java.nio.file.Paths
/**
 * Extracts the text content of an in-memory document using a default extraction config.
 *
 * @param bytes raw document bytes
 * @param mimeType MIME type passed to the extractor alongside [bytes]
 * @return the extraction result's content
 */
fun extractText(bytes: ByteArray, mimeType: String): String =
    Kreuzberg.extractBytesSync(bytes, mimeType, ExtractionConfig.builder().build()).content()
/**
 * Reads `document.pdf` into memory, extracts its text via [extractText] and prints the
 * number of extracted characters. Read failures and extraction failures are reported on
 * stderr instead of propagating.
 */
fun main() {
    val bytes = try {
        Files.readAllBytes(Paths.get("document.pdf"))
    } catch (e: Exception) {
        // Report the read failure and stop, rather than silently extracting from an empty buffer.
        System.err.println("Failed to read document.pdf: ${e.message}")
        return
    }
    try {
        val text = extractText(bytes, "application/pdf")
        println("Extracted ${text.length} chars")
    } catch (e: KreuzbergRsException) {
        System.err.println("Extraction error (code=${e.code}): ${e.message}")
    } catch (e: Exception) {
        System.err.println("Unexpected error: ${e.message}")
    }
}
```

View File

@@ -0,0 +1,16 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
import kotlinx.coroutines.runBlocking
import java.nio.file.Files
import java.nio.file.Paths
/** Reads `document.pdf` into memory, extracts it inside `runBlocking`, and prints the content and table count. */
fun main() = runBlocking {
    val pdfBytes = Files.readAllBytes(Paths.get("document.pdf"))
    val extraction = ExtractionConfig.builder().build()
    val extracted = Kreuzberg.extractBytes(pdfBytes, "application/pdf", extraction)
    println(extracted.content())
    // A null table list is reported as zero tables.
    val tableCount = extracted.tables()?.size ?: 0
    println("Tables: $tableCount")
}
```

View File

@@ -0,0 +1,14 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Files
import java.nio.file.Paths
/** Reads `document.pdf` into memory, extracts it synchronously, and prints the content and table count. */
fun main() {
    val pdfBytes = Files.readAllBytes(Paths.get("document.pdf"))
    val extraction = ExtractionConfig.builder().build()
    val extracted = Kreuzberg.extractBytesSync(pdfBytes, "application/pdf", extraction)
    println(extracted.content())
    val tableCount = extracted.tables()?.size ?: 0
    println("Tables: $tableCount")
}
```

View File

@@ -0,0 +1,15 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
import kotlinx.coroutines.runBlocking
import java.nio.file.Paths
/** Extracts `document.pdf` inside `runBlocking` and prints its content, MIME type and table count. */
fun main() = runBlocking {
    val extraction = ExtractionConfig.builder().build()
    val extracted = Kreuzberg.extractFile(Paths.get("document.pdf"), null, extraction)
    println(extracted.content())
    println("MIME type: ${extracted.mimeType()}")
    val tableCount = extracted.tables()?.size ?: 0
    println("Tables: $tableCount")
}
```

View File

@@ -0,0 +1,13 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
/** Extracts `document.pdf` synchronously and prints its content, MIME type and table count. */
fun main() {
    val extraction = ExtractionConfig.builder().build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    println(extracted.content())
    println("MIME type: ${extracted.mimeType()}")
    val tableCount = extracted.tables()?.size ?: 0
    println("Tables: $tableCount")
}
```

View File

@@ -0,0 +1,63 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Builds an extraction config combining OCR, chunking with embeddings, language detection,
 * keywords, token reduction and post-processing, extracts `document.pdf`, and prints the
 * content, the detected languages (when present) and the chunk count.
 */
fun main() {
    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()

    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withBatchSize(32L)
        .withNormalize(true)
        .build()
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()

    val detectionConfig = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(false)
        .build()

    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.1f)
        .withNgramRange(listOf(1L, 3L))
        .withLanguage(Optional.of("en"))
        .build()

    val reductionOptions = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()

    val postProcessing = PostProcessorConfig.builder()
        .withEnabled(true)
        .build()

    val extraction = ExtractionConfig.builder()
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .withOcr(Optional.of(ocrConfig))
        .withChunking(Optional.of(chunkingConfig))
        .withLanguageDetection(Optional.of(detectionConfig))
        .withKeywords(Optional.of(keywordConfig))
        .withTokenReduction(Optional.of(reductionOptions))
        .withPostprocessor(Optional.of(postProcessing))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    println("Content: ${extracted.content()}")
    val languages = extracted.detectedLanguages()
    if (languages != null) {
        println("Languages: $languages")
    }
    println("Chunks: ${extracted.chunks()?.size ?: 0}")
}
```

View File

@@ -0,0 +1,81 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with chunking (max characters 1000, overlap 200) and prints the chunk count and each chunk's length. */
fun main() {
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .build()
    val extraction = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val pieces = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction).chunks().orEmpty()
    println("Chunks: ${pieces.size}")
    pieces.forEach { piece -> println("Length: ${piece.content().length}") }
}
```
```kotlin title="Kotlin - Markdown with Heading Context"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `document.md` with the `Markdown` chunker and tokenizer-based sizing, then prints
 * every heading from each chunk's heading context (when present) followed by a content preview.
 */
fun main() {
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withChunkerType(ChunkerType.Markdown)
        .withSizing(ChunkSizing.Tokenizer("Xenova/gpt-4o", Optional.empty()))
        .build()
    val extraction = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.md"), null, extraction)
    for (chunk in extracted.chunks().orEmpty()) {
        // Headings are printed only when both the metadata and its heading context are non-null.
        val headingContext = chunk.metadata()?.headingContext()
        if (headingContext != null) {
            headingContext.headings().forEach { heading ->
                println("Heading L${heading.level()}: ${heading.text()}")
            }
        }
        val preview = chunk.content().take(100)
        println("Content: $preview...")
    }
}
```
```kotlin title="Kotlin - Prepend Heading Context"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.md` with heading-context prepending enabled and prints a 100-character preview of each chunk. */
fun main() {
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withChunkerType(ChunkerType.Markdown)
        .withPrependHeadingContext(true)
        .build()
    val extraction = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.md"), null, extraction)
    extracted.chunks().orEmpty().forEach { chunk ->
        // With prepending enabled, the chunk content begins with its heading breadcrumb.
        val preview = chunk.content().take(100)
        println("Content: $preview...")
    }
}
```

View File

@@ -0,0 +1,15 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with caching and quality processing enabled and prints the content. */
fun main() {
    val extraction = ExtractionConfig.builder()
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    println(extracted.content())
}
```

View File

@@ -0,0 +1,17 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Builds a config explicitly through the builder (cache enabled), extracts `document.pdf`, and prints the content. */
fun main() {
    // The Java/Kotlin bindings build configuration explicitly through the builder.
    // This stands in for ExtractionConfig::discover() in Rust: start from the defaults
    // and set only the fields you want to change.
    val extraction = ExtractionConfig.builder()
        .withUseCache(true)
        .build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    println(extracted.content())
}
```

View File

@@ -0,0 +1,20 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `scanned.pdf` with a Tesseract OCR config and prints the content length and table count. */
fun main() {
    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()
    val extraction = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extraction)
    val contentLength = extracted.content().length
    val tableCount = extracted.tables()?.size ?: 0
    println("Content length: $contentLength")
    println("Tables detected: $tableCount")
}
```

View File

@@ -0,0 +1,32 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with cache, OCR (`eng+deu`, PSM 6), chunking and quality processing; prints the content length. */
fun main() {
    val tesseractConfig = TesseractConfig.builder()
        .withPsm(6)
        .build()
    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng+deu")
        .withTesseractConfig(Optional.of(tesseractConfig))
        .build()
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .build()
    val extraction = ExtractionConfig.builder()
        .withUseCache(true)
        .withOcr(Optional.of(ocrConfig))
        .withChunking(Optional.of(chunkingConfig))
        .withEnableQualityProcessing(true)
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val contentLength = extracted.content().length
    println("Content length: $contentLength")
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with document structure included and prints every node of the document, if one is returned. */
fun main() {
    val extraction = ExtractionConfig.builder()
        .withIncludeDocumentStructure(true)
        .build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    // Nothing is printed when the result carries no document.
    extracted.document()?.let { structure ->
        structure.nodes().forEach { node -> println(node) }
    }
}
```

View File

@@ -0,0 +1,32 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `document.pdf` in element-based result format, prints each element's type,
 * a 100-character text preview and page number (when present), then lists all title elements.
 */
fun main() {
    // The result format selects element-based output instead of the unified one.
    val extraction = ExtractionConfig.builder()
        .withResultFormat(ResultFormat.ElementBased)
        .build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val elements = extracted.elements().orEmpty()

    elements.forEach { element ->
        println("Type: ${element.elementType()}")
        println("Text: ${element.text().take(100)}")
        val pageNumber = element.metadata().pageNumber()
        if (pageNumber != null) {
            println("Page: $pageNumber")
        }
        println("---")
    }

    // Keep only elements whose type is Title.
    elements
        .filter { it.elementType() == ElementType.Title }
        .forEach { titleElement -> println("Title: ${titleElement.text()}") }
}
```

View File

@@ -0,0 +1,27 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with chunking plus an embedding config (batch size 16, download progress shown); prints the chunk count. */
fun main() {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withBatchSize(16L)
        .withNormalize(true)
        .withShowDownloadProgress(true)
        .build()
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()
    val extraction = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val chunkCount = extracted.chunks()?.size ?: 0
    println("Chunks with embeddings: $chunkCount")
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` as HTML using the `GitHub` theme and prints the resulting markup. */
fun main() {
    val htmlConfig = HtmlOutputConfig.builder()
        .withTheme(HtmlTheme.GitHub)
        .build()
    val extraction = ExtractionConfig.builder()
        .withOutputFormat(OutputFormat.Html)
        .withHtmlOutput(Optional.of(htmlConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    // The content is HTML with kb-* classes.
    println(extracted.content())
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with a `Yake` keyword config (min score 0.1) and prints the extracted keywords. */
fun main() {
    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.1f)
        .withNgramRange(listOf(1L, 3L))
        .withLanguage(Optional.of("en"))
        .build()
    val extraction = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val foundKeywords = extracted.extractedKeywords()
    println("Keywords: $foundKeywords")
}
```

View File

@@ -0,0 +1,20 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with multi-language detection enabled (min confidence 0.8) and prints the detected languages. */
fun main() {
    val detection = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(true)
        .build()
    val extraction = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(detection))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val languages = extracted.detectedLanguages()
    println("Detected languages: $languages")
}
```

View File

@@ -0,0 +1,23 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with image extraction and DPI settings configured and prints the number of extracted images. */
fun main() {
    val imageConfig = ImageExtractionConfig.builder()
        .withExtractImages(true)
        .withTargetDpi(300)
        .withMaxImageDimension(4096)
        .withAutoAdjustDpi(true)
        .withMinDpi(150)
        .withMaxDpi(600)
        .build()
    val extraction = ExtractionConfig.builder()
        .withImages(Optional.of(imageConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    // A null image list is reported as zero images.
    val imageCount = extracted.images()?.size ?: 0
    println("Extracted images: $imageCount")
}
```

View File

@@ -0,0 +1,26 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `encrypted.pdf` with PDF options (passwords, metadata, images, hierarchy) and prints the title and authors. */
fun main() {
    val hierarchyConfig = HierarchyConfig.builder()
        .withEnabled(true)
        .build()
    val pdfOptions = PdfConfig.builder()
        .withExtractImages(true)
        .withPasswords(Optional.of(listOf("password123")))
        .withExtractMetadata(true)
        .withHierarchy(Optional.of(hierarchyConfig))
        .build()
    val extraction = ExtractionConfig.builder()
        .withPdfOptions(Optional.of(pdfOptions))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("encrypted.pdf"), null, extraction)
    val title = extracted.metadata().title()
    println("Title: $title")
    val authors = extracted.metadata().authors()
    println("Authors: $authors")
}
```

View File

@@ -0,0 +1,26 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with a PDF hierarchy config (5 clusters, bounding boxes, OCR coverage threshold 0.8) and prints the page count. */
fun main() {
    val hierarchyConfig = HierarchyConfig.builder()
        .withEnabled(true)
        .withKClusters(5L)
        .withIncludeBbox(true)
        .withOcrCoverageThreshold(Optional.of(0.8f))
        .build()
    val pdfOptions = PdfConfig.builder()
        .withHierarchy(Optional.of(hierarchyConfig))
        .build()
    val extraction = ExtractionConfig.builder()
        .withPdfOptions(Optional.of(pdfOptions))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val pageCount = extracted.pages().orEmpty().size
    println("Pages: $pageCount")
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with two named post-processors enabled and prints the processed content. */
fun main() {
    val enabledProcessors = listOf(
        "whitespace_normalizer",
        "unicode_normalizer",
    )
    val postProcessing = PostProcessorConfig.builder()
        .withEnabled(true)
        .withEnabledProcessors(Optional.of(enabledProcessors))
        .build()
    val extraction = ExtractionConfig.builder()
        .withPostprocessor(Optional.of(postProcessing))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    println("Processed content: ${extracted.content()}")
}
```

View File

@@ -0,0 +1,16 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with quality processing and caching enabled; prints the quality score and the warning count. */
fun main() {
    val extraction = ExtractionConfig.builder()
        .withEnableQualityProcessing(true)
        .withUseCache(true)
        .build()
    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val qualityScore = extracted.qualityScore()
    println("Quality score: $qualityScore")
    // A null warning list is reported as zero warnings.
    val warningCount = extracted.processingWarnings()?.size ?: 0
    println("Warnings: $warningCount")
}
```

View File

@@ -0,0 +1,26 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `scanned.pdf` using Tesseract OCR (`eng+deu`, PSM 6, OEM 3) and prints the recognized text. */
fun main() {
    val tesseractConfig = TesseractConfig.builder()
        .withLanguage("eng+deu")
        .withPsm(6)
        .withOem(3)
        .build()
    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng+deu")
        .withTesseractConfig(Optional.of(tesseractConfig))
        .build()
    val extraction = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrConfig))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extraction)
    println("OCR text: ${extracted.content()}")
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with `"moderate"` token reduction that preserves important words; prints the content. */
fun main() {
    val reduction = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()
    val extraction = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(reduction))
        .build()

    val extracted = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)
    val reducedContent = extracted.content()
    println("Reduced content: $reducedContent")
}
```

View File

@@ -0,0 +1,11 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
/** Extracts `document.pdf` with the default config and prints its content and MIME type. */
fun main() {
    val extraction = ExtractionConfig.builder().build()
    val documentPath = Paths.get("document.pdf")
    // The extractor is referenced by its fully qualified name, as in the original snippet.
    val extracted = dev.kreuzberg.Kreuzberg.extractFileSync(documentPath, null, extraction)
    println(extracted.content())
    println("MIME type: ${extracted.mimeType()}")
}
```

View File

@@ -0,0 +1,13 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
/** Extracts `document.pdf` with the default config and prints its content, MIME type and table count. */
fun main() {
    val extraction = ExtractionConfig.builder().build()
    val documentPath = Paths.get("document.pdf")
    // The extractor is referenced by its fully qualified name, as in the original snippet.
    val extracted = dev.kreuzberg.Kreuzberg.extractFileSync(documentPath, null, extraction)
    println(extracted.content())
    println("MIME type: ${extracted.mimeType()}")
    val tableCount = extracted.tables()?.size ?: 0
    println("Tables: $tableCount")
}
```

View File

@@ -0,0 +1,21 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `scanned.pdf` with OCR forced on and prints the content plus any detected languages. */
fun main() {
    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()
    val extraction = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrConfig))
        .withForceOcr(true)
        .build()

    val extracted = dev.kreuzberg.Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extraction)
    println(extracted.content())
    val languages = extracted.detectedLanguages()
    if (languages != null) {
        println("Detected languages: $languages")
    }
}
```

View File

@@ -0,0 +1,11 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
/** Prints a greeting, then extracts `document.pdf` with the default config and prints its content. */
fun main() {
    println("Hello from Kreuzberg!")
    val extraction = ExtractionConfig.builder().build()
    val documentPath = Paths.get("document.pdf")
    val extracted = dev.kreuzberg.Kreuzberg.extractFileSync(documentPath, null, extraction)
    println(extracted.content())
}
```

View File

@@ -0,0 +1,8 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/** Builds a default extraction config and prints whether the built value is non-null. */
fun main() {
    val defaultConfig = ExtractionConfig.builder().build()
    val loaded = defaultConfig != null
    println("Kreuzberg loaded: $loaded")
}
```

View File

@@ -0,0 +1,17 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
/** Extracts `document.pdf` with the default config and prints every table and every chunk with a 1-based number. */
fun main() {
    val extraction = ExtractionConfig.builder().build()
    val extracted = dev.kreuzberg.Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extraction)

    // A null list is treated as empty, so nothing is printed for it.
    for ((position, table) in extracted.tables().orEmpty().withIndex()) {
        println("Table ${position + 1}: ${table}")
    }
    for ((position, chunk) in extracted.chunks().orEmpty().withIndex()) {
        println("Chunk ${position + 1}: ${chunk}")
    }
}
```

View File

@@ -0,0 +1,38 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Builds a JSON-Schema-shaped map (title, authors, date), wires it into a structured-extraction
 * configuration backed by the "openai/gpt-4o-mini" model, extracts `paper.pdf`, and prints the
 * structured output when one is returned.
 */
fun main() {
    val properties = mapOf(
        "title" to mapOf("type" to "string"),
        "authors" to mapOf("type" to "array", "items" to mapOf("type" to "string")),
        "date" to mapOf("type" to "string"),
    )
    val schema = mapOf(
        "type" to "object",
        "properties" to properties,
        "required" to listOf("title", "authors", "date"),
        "additionalProperties" to false,
    )

    val model = LlmConfig.builder()
        .withModel("openai/gpt-4o-mini")
        .build()
    // Positional constructor: argument order must stay exactly as written.
    val structuredConfig = StructuredExtractionConfig(schema, "document", null, true, null, model)
    val extractionConfig = ExtractionConfig.builder()
        .withStructuredExtraction(Optional.of(structuredConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("paper.pdf"), null, extractionConfig)
    val output = extraction.structuredOutput()
    if (output != null) {
        println(output)
    }
}
```

View File

@@ -0,0 +1,32 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.util.Optional
import java.io.BufferedReader
import java.io.BufferedWriter
import java.io.InputStreamReader
import java.io.OutputStreamWriter
/**
 * Spawns `kreuzberg mcp`, writes one `tools/call` request for the `extract_file` tool to the
 * child's stdin, and prints the first line the child writes back.
 *
 * stderr is merged into stdout (`redirectErrorStream(true)`), so the printed line may be a log
 * line rather than the response. Both streams are UTF-8 and are closed, and the child process
 * destroyed, even when an I/O call throws.
 *
 * NOTE(review): the MCP specification describes an `initialize` handshake before tool calls;
 * confirm whether the server accepts a bare `tools/call` as sent here.
 */
fun main() {
    val process = ProcessBuilder("kreuzberg", "mcp")
        .redirectErrorStream(true)
        .start()

    // JSON-RPC 2.0 requires the "jsonrpc" member, and only requests carrying an "id" get a response.
    val request = """
        {"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"extract_file","arguments":{"path":"document.pdf","async":true}}}
    """.trimIndent()

    try {
        process.outputStream.bufferedWriter().use { stdin ->
            process.inputStream.bufferedReader().use { stdout ->
                stdin.write(request)
                stdin.newLine()
                stdin.flush()

                // readLine() returns null when the child closes its output without answering.
                val response = stdout.readLine()
                println(response ?: "No response: kreuzberg mcp closed its output")
            }
        }
    } finally {
        process.destroy()
    }
}
```

View File

@@ -0,0 +1,11 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.util.Optional
/** Launches `kreuzberg mcp` sharing this process's standard streams and blocks until it exits. */
fun main() {
    val server = ProcessBuilder("kreuzberg", "mcp").inheritIO().start()
    // The exit code is intentionally ignored.
    server.waitFor()
}
```

View File

@@ -0,0 +1,20 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Enables language detection (minimum confidence 0.9, multiple detection off), extracts
 * `document.pdf`, and prints the detected languages.
 */
fun main() {
    val detection = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.9)
        .withDetectMultiple(false)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(detection))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val languages = extraction.detectedLanguages()
    println("Detected languages: $languages")
}
```

View File

@@ -0,0 +1,25 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Enables multi-language detection (minimum confidence 0.8), extracts
 * `multilingual_document.pdf`, and prints the detected languages as a list and one per line.
 */
fun main() {
    val detection = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(detection))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("multilingual_document.pdf"), null, extractionConfig)
    // A missing language list is treated as empty.
    val languages = extraction.detectedLanguages() ?: emptyList()
    println("Detected languages: $languages")
    languages.forEach { language -> println(" - $language") }
}
```

View File

@@ -0,0 +1,60 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Prints document-level and PDF-specific metadata for `document.pdf`, then the HTML-specific
 * metadata (scalar fields, keywords, Open Graph / Twitter Card entries, headers, links, images,
 * structured data) for `page.html`.
 */
fun main() {
    val defaults = ExtractionConfig.builder().build()

    val documentMetadata = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, defaults).metadata()
    documentMetadata.title()?.let { println("Title: $it") }
    documentMetadata.authors()?.let { println("Authors: ${it.joinToString(", ")}") }

    // Format-specific metadata via discriminated union
    val pdf = documentMetadata.format()?.pdf()
    if (pdf != null) {
        pdf.pageCount()?.let { println("Pages: $it") }
        pdf.producer()?.let { println("Producer: $it") }
        pdf.pdfVersion()?.let { println("PDF Version: $it") }
    }

    // Access HTML metadata; nothing further is printed when the HTML variant is absent.
    val html = Kreuzberg.extractFileSync(Paths.get("page.html"), null, defaults)
        .metadata()
        .format()
        ?.html()
        ?: return

    html.title()?.let { println("Title: $it") }
    html.description()?.let { println("Description: $it") }
    html.canonicalUrl()?.let { println("Canonical URL: $it") }
    html.language()?.let { println("Language: $it") }

    // Access keywords list
    println("Keywords: ${html.keywords()}")

    // Open Graph fields are exposed as a Map<String, String>
    html.openGraph()["image"]?.let { println("Open Graph Image: $it") }
    html.openGraph()["title"]?.let { println("Open Graph Title: $it") }

    // Twitter Card fields as a Map<String, String>
    html.twitterCard()["card"]?.let { println("Twitter Card Type: $it") }

    // Headers
    html.headers().forEach { header ->
        println("Header (level ${header.level()}): ${header.text()}")
    }

    // Links
    html.links().forEach { link ->
        println("Link: ${link.href()} (${link.text()})")
    }

    // Images
    html.images().forEach { image ->
        println("Image: ${image.src()}")
    }

    // Structured data
    if (html.structuredData().isNotEmpty()) {
        println("Structured data items: ${html.structuredData().size}")
    }
}
```

View File

@@ -0,0 +1,24 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
/**
 * Extracts `document.pdf` and, for the first three page boundaries, prints the page number, its
 * byte range, and a preview of up to 100 characters of that page's text.
 *
 * Returns silently when the result has no page metadata or no boundaries.
 */
fun main() {
    val config = ExtractionConfig.builder().build()
    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
    val pages = result.metadata().pages() ?: return
    val boundaries = pages.boundaries() ?: return

    // byteStart()/byteEnd() are byte offsets, while String.substring() indexes UTF-16 chars.
    // Slice the UTF-8 encoding instead so non-ASCII content maps to the right page text.
    val contentBytes = result.content().toByteArray(Charsets.UTF_8)

    for (boundary in boundaries.take(3)) {
        val start = boundary.byteStart().toInt()
        val end = boundary.byteEnd().toInt()
        val pageText = String(contentBytes, start, end - start, Charsets.UTF_8)
        println("Page ${boundary.pageNumber()}:")
        println(" Byte range: $start-$end")
        println(" Preview: ${pageText.take(100)}...")
    }
}
```

View File

@@ -0,0 +1,25 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Enables per-page extraction, extracts `document.pdf`, and prints each page's number, content
 * length, table count, and image count. Returns silently when the result has no pages.
 */
fun main() {
    val perPage = PageConfig.builder()
        .withExtractPages(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withPages(Optional.of(perPage))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val pages = extraction.pages() ?: return
    pages.forEach { page ->
        println("Page ${page.pageNumber()}:")
        println(" Content: ${page.content().length} chars")
        println(" Tables: ${page.tables().size}")
        println(" Images: ${page.images().size}")
    }
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
/**
 * Extracts `document.pdf` and, for each table, prints its page number and row count, its
 * Markdown rendering, and then every row of cells.
 */
fun main() {
    val defaults = ExtractionConfig.builder().build()
    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, defaults)

    // A missing table list is treated as empty.
    extraction.tables().orEmpty().forEach { table ->
        println("Table on page ${table.pageNumber()} with ${table.cells().size} rows")
        println(table.markdown())
        table.cells().forEach { row -> println(row) }
    }
}
```

View File

@@ -0,0 +1,57 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * One chunk prepared for a vector store.
 *
 * @property id record identifier
 * @property content text of the chunk
 * @property embedding embedding vector for [content]
 * @property metadata string key/value attributes attached to the record
 */
data class VectorRecord(
    val id: String,
    val content: String,
    val embedding: List<Float>,
    val metadata: Map<String, String>,
)
/**
 * Extracts [documentPath] with chunking (512 max characters, 50 overlap) and "balanced" preset
 * embeddings, then converts every chunk that carries an embedding into a [VectorRecord].
 *
 * @param documentPath path of the document to extract
 * @param documentId identifier used as the record-id prefix and stored under `document_id`
 * @return one record per embedded chunk, in chunk order; empty when the result has no chunks
 */
fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .build()
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(512L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get(documentPath), null, extractionConfig)

    // Chunks without an embedding are dropped, but the index still counts them.
    return extraction.chunks().orEmpty().mapIndexedNotNull { index, chunk ->
        chunk.embedding()?.let { vector ->
            val attributes = mapOf(
                "document_id" to documentId,
                "chunk_index" to index.toString(),
                "content_length" to chunk.content().length.toString(),
            )
            VectorRecord(
                id = "${documentId}_chunk_$index",
                content = chunk.content(),
                embedding = vector,
                metadata = attributes,
            )
        }
    }
}
/** Vectorizes `document.pdf` under the id "doc-001" and prints how many records were produced. */
fun main() {
    val recordCount = extractAndVectorize("document.pdf", "doc-001").size
    println("Generated $recordCount vector records")
}
```

View File

@@ -0,0 +1,45 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Path
/**
 * Example custom OCR backend named "cloud-ocr". Image recognition is delegated to
 * [callCloudApi], which is a stub here that returns fixed text.
 *
 * Table detection and whole-document processing are reported as unsupported.
 *
 * @param apiKey credential for the cloud service (not used by the stub)
 * @param supportedLangs language codes this backend reports as supported
 */
class CloudOcrBackend(
    private val apiKey: String,
    private val supportedLangs: List<String>,
) : IOcrBackend {
    // --- identity ---
    override fun name(): String = "cloud-ocr"

    override fun version(): String = "1.0.0"

    override fun backend_type(): OcrBackendType = OcrBackendType.Custom

    // --- capabilities ---
    override fun supported_languages(): List<String> = supportedLangs

    override fun supports_language(lang: String): Boolean = lang in supportedLangs

    override fun supports_table_detection(): Boolean = false

    override fun supports_document_processing(): Boolean = false

    // --- processing ---
    /** Recognizes [image_bytes] in the configured language and wraps the text in a plain-text result. */
    override fun process_image(image_bytes: ByteArray, config: OcrConfig): ExtractionResult {
        val recognized = callCloudApi(image_bytes, config.language())
        val emptyMetadata = Metadata.builder().build()
        return ExtractionResult.builder()
            .withContent(recognized)
            .withMimeType("text/plain")
            .withMetadata(emptyMetadata)
            .build()
    }

    /** Reads the whole file at [path] into memory and delegates to [process_image]. */
    override fun process_image_file(path: Path, config: OcrConfig): ExtractionResult =
        process_image(java.nio.file.Files.readAllBytes(path), config)

    /** Always throws: this backend does not process whole documents. */
    override fun process_document(_path: Path, _config: OcrConfig): ExtractionResult =
        throw UnsupportedOperationException("document processing not supported")

    /** Placeholder for the real service call; ignores its arguments. */
    private fun callCloudApi(image: ByteArray, language: String): String = "Extracted text"
}
```

View File

@@ -0,0 +1,18 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Enables image extraction, extracts `document.pdf`, and prints how many images were returned. */
fun main() {
    val imageConfig = ImageExtractionConfig.builder()
        .withExtractImages(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withImages(Optional.of(imageConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    // A null image list is reported as zero.
    val imageCount = extraction.images()?.size ?: 0
    println("Extracted images: $imageCount")
}
```

View File

@@ -0,0 +1,20 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Enables image extraction with a target DPI of 300 and a maximum image dimension of 4096,
 * extracts `document.pdf`, and prints how many images were returned.
 */
fun main() {
    val imageConfig = ImageExtractionConfig.builder()
        .withExtractImages(true)
        .withTargetDpi(300)
        .withMaxImageDimension(4096)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withImages(Optional.of(imageConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    // A null image list is reported as zero.
    val imageCount = extraction.images()?.size ?: 0
    println("Extracted images: $imageCount")
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` using the "easyocr" backend with language "en" and prints the text. */
fun main() {
    val easyOcr = OcrConfig.builder()
        .withBackend("easyocr")
        .withLanguage("en")
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(easyOcr))
        .build()

    val text = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig).content()
    println("Extracted text: $text")
}
```

View File

@@ -0,0 +1,31 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Runs the "paddleocr" backend on `scanned.pdf` with OCR elements included, then prints each
 * element's text, recognition confidence, geometry, and rotation (when present).
 */
fun main() {
    val elements = OcrElementConfig.builder()
        .withIncludeElements(true)
        .build()
    val paddle = OcrConfig.builder()
        .withBackend("paddleocr")
        .withLanguage("en")
        .withElementConfig(Optional.of(elements))
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(paddle))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extractionConfig)
    // A null element list prints nothing.
    for (element in extraction.ocrElements().orEmpty()) {
        println("Text: ${element.text()}")
        println("Confidence: ${element.confidence().recognition()}")
        println("Geometry: ${element.geometry()}")
        val rotation = element.rotation()
        if (rotation != null) {
            println("Rotation: ${rotation}")
        }
        // Blank line between elements.
        println()
    }
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `scanned.pdf` using the "tesseract" backend with language "eng" and prints the text. */
fun main() {
    val tesseract = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(tesseract))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extractionConfig)
    println(extraction.content())
}
```

View File

@@ -0,0 +1,20 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `document.pdf` with the "tesseract" backend (language "eng") and the force-OCR flag
 * set, then prints the text.
 */
fun main() {
    val tesseract = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(tesseract))
        .withForceOcr(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    println(extraction.content())
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `multilingual.pdf` using the "tesseract" backend with language "eng+deu" and prints the text. */
fun main() {
    val tesseract = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng+deu")
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(tesseract))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("multilingual.pdf"), null, extractionConfig)
    println(extraction.content())
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` using the "paddleocr" backend with language "en" and prints the text. */
fun main() {
    val paddle = OcrConfig.builder()
        .withBackend("paddleocr")
        .withLanguage("en")
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(paddle))
        .build()

    val text = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig).content()
    println("Extracted text: $text")
}
```

View File

@@ -0,0 +1,14 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
/**
 * Clears the registered post-processors, OCR backends, and validators, then prints a
 * confirmation line.
 */
fun clearAllPlugins() {
    // Note: there is no Kreuzberg.clearDocumentExtractors() — extractor
    // registration is not exposed through the Kotlin/Java plugin bridge.
    Kreuzberg.clearPostProcessors()
    Kreuzberg.clearOcrBackends()
    Kreuzberg.clearValidators()

    val confirmation = "All post-processors, OCR backends, and validators cleared"
    println(confirmation)
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Wraps a host-language embedding model so kreuzberg can call back into it during chunking and
 * standalone embed requests.
 *
 * This placeholder returns an all-zero vector of [dim] elements for every input text.
 *
 * @param dim number of elements in each returned vector (default 768)
 */
class MyEmbedder(private val dim: Long = 768L) : IEmbeddingBackend {
    override fun name(): String = "my-embedder"

    override fun version(): String = "1.0.0"

    override fun dimensions(): Long = dim

    override fun embed(texts: List<String>): List<List<Float>> {
        // Replace this with a real model invocation. Each inner list must
        // have exactly `dimensions()` elements — the bridge validates shape.
        val width = dim.toInt()
        return texts.map { _ -> List(width) { 0.0f } }
    }
}
/** Registers a [MyEmbedder] with its default dimension through [EmbeddingBackendBridge]. */
fun registerMyEmbedder() {
    val embedder = MyEmbedder()
    EmbeddingBackendBridge.registerEmbeddingBackend(embedder)
}
```

View File

@@ -0,0 +1,21 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
/**
 * Lists the registered document extractors, then extracts `document.pdf` through the facade and
 * prints the content length and MIME type.
 *
 * The Kotlin/Java plugin bridge does not expose an IDocumentExtractor interface — extractor
 * registration lives in the Rust core. From Kotlin you can list the extractors that are already
 * registered and route extraction through the existing facade.
 */
fun useRegisteredExtractors() {
    val extractorNames: List<String> = Kreuzberg.listDocumentExtractors()
    println("Available extractors: $extractorNames")

    val defaults = ExtractionConfig.builder().build()
    val source = java.nio.file.Path.of("document.pdf")
    val extraction: ExtractionResult = Kreuzberg.extractFileSync(source, null, defaults)
    println("Extracted ${extraction.content().length} characters via ${extraction.mimeType()}")
}
```

View File

@@ -0,0 +1,18 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
/** Prints the names of the registered extractors, post-processors, OCR backends, and validators. */
fun listAllPlugins() {
    val extractorNames: List<String> = Kreuzberg.listDocumentExtractors()
    println("Registered extractors: $extractorNames")

    val postProcessorNames: List<String> = Kreuzberg.listPostProcessors()
    println("Registered post-processors: $postProcessorNames")

    val ocrBackendNames: List<String> = Kreuzberg.listOcrBackends()
    println("Registered OCR backends: $ocrBackendNames")

    val validatorNames: List<String> = Kreuzberg.listValidators()
    println("Registered validators: $validatorNames")
}
```

View File

@@ -0,0 +1,28 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Validator named "min-length-validator" that rejects results whose content is shorter than
 * [minLength] characters.
 *
 * @param minLength minimum accepted content length, in characters
 */
class MinLengthValidator(private val minLength: Int) : IValidator {
    override fun name(): String = "min-length-validator"

    override fun version(): String = "1.0.0"

    override fun priority(): Int = 100

    /** Applies to every result. */
    override fun should_validate(_result: ExtractionResult, _config: ExtractionConfig): Boolean = true

    /** @throws IllegalStateException when the content length is below [minLength] */
    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        val length = result.content().length
        check(length >= minLength) { "Content too short: $length < $minLength characters" }
    }
}
/** Registers a [MinLengthValidator] with a 100-character minimum through [ValidatorBridge]. */
fun registerMinLengthValidator() {
    val validator = MinLengthValidator(minLength = 100)
    ValidatorBridge.registerValidator(validator)
}
```

View File

@@ -0,0 +1,38 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.util.concurrent.atomic.AtomicInteger
/**
 * Late-stage post-processor named "pdf-metadata-extractor" that, for PDF results only, counts
 * the result and prints its title and authors.
 */
class PdfMetadataExtractor : IPostProcessor {
    /** Number of PDF results seen so far. */
    private val pdfCounter = AtomicInteger(0)

    override fun name(): String = "pdf-metadata-extractor"

    override fun version(): String = "1.0.0"

    override fun processing_stage(): ProcessingStage = ProcessingStage.Late

    override fun priority(): Int = 25

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 2L

    override fun should_process(_result: ExtractionResult, _config: ExtractionConfig): Boolean {
        return _result.mimeType() == "application/pdf"
    }

    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        if (result.mimeType() == "application/pdf") {
            val count = pdfCounter.incrementAndGet()
            val metadata: Metadata = result.metadata()
            // Metadata is an immutable record — read PDF metadata fields rather
            // than mutate. Reporting via stdout/log keeps the snippet honest.
            println(
                "[pdf-metadata] #$count title=${metadata.title()} authors=${metadata.authors()}",
            )
        }
    }
}
/** Registers a new [PdfMetadataExtractor] through [PostProcessorBridge]. */
fun registerPdfMetadataExtractor() {
    val processor = PdfMetadataExtractor()
    PostProcessorBridge.registerPostProcessor(processor)
}
```

View File

@@ -0,0 +1,30 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Middle-stage post-processor named "pdf-only-processor" that prints the content length of PDF
 * results and ignores every other MIME type.
 */
class PdfOnlyProcessor : IPostProcessor {
    override fun name(): String = "pdf-only-processor"

    override fun version(): String = "1.0.0"

    override fun processing_stage(): ProcessingStage = ProcessingStage.Middle

    override fun priority(): Int = 50

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 5L

    override fun should_process(_result: ExtractionResult, _config: ExtractionConfig): Boolean {
        return _result.mimeType() == "application/pdf"
    }

    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        // Guard inside process() in addition to should_process() — the gate
        // saves the JSON roundtrip when this returns false.
        if (result.mimeType() == "application/pdf") {
            println("[pdf-only] processing PDF (${result.content().length} chars)")
        }
    }
}
/** Registers a new [PdfOnlyProcessor] through [PostProcessorBridge]. */
fun registerPdfOnlyProcessor() {
    val processor = PdfOnlyProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
```

View File

@@ -0,0 +1,33 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
import java.nio.file.Files
import java.nio.file.Path
/**
 * Thin wrapper over the suspending `Kreuzberg.extractBytes` / `Kreuzberg.extractFile` calls that
 * supplies a default [ExtractionConfig].
 *
 * The Kotlin/Java bindings expose plugin bridges for IPostProcessor, IValidator, IOcrBackend,
 * and IEmbeddingBackend. There is no IDocumentExtractor bridge — extractor selection happens
 * entirely in the Rust core based on MIME type. From Kotlin, the "extractor plugin" pattern is
 * to wrap Kreuzberg.extractBytes / extractFile and dispatch to the right extractor by MIME.
 */
class GenericExtractorClient {
    /** Extracts in-memory [content] declared as [mimeType]. */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig = ExtractionConfig.builder().build(),
    ): ExtractionResult {
        return Kreuzberg.extractBytes(content, mimeType, config)
    }

    /** Extracts the file at [path]; [mimeType] is passed through and may be null. */
    suspend fun extractFile(
        path: Path,
        mimeType: String? = null,
        config: ExtractionConfig = ExtractionConfig.builder().build(),
    ): ExtractionResult {
        return Kreuzberg.extractFile(path, mimeType, config)
    }
}
/** Reads `payload.json` into memory, extracts it as "application/json", and prints the content length. */
suspend fun extractCustomPayload() {
    val payload = Files.readAllBytes(Path.of("payload.json"))
    val extraction = GenericExtractorClient().extractBytes(payload, mimeType = "application/json")
    println("Extracted ${extraction.content().length} chars")
}
```

View File

@@ -0,0 +1,41 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.util.logging.Logger
/**
 * Late-stage post-processor named "logging-post-processor" that logs its own lifecycle events
 * and, for each result, the MIME type and content length (with a warning on empty content).
 */
class LoggingPostProcessor : IPostProcessor {
    private val log: Logger = Logger.getLogger(LoggingPostProcessor::class.java.name)

    override fun name(): String = "logging-post-processor"

    override fun version(): String = "1.0.0"

    override fun processing_stage(): ProcessingStage = ProcessingStage.Late

    override fun priority(): Int = 10

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L

    /** Applies to every result. */
    override fun should_process(_result: ExtractionResult, _config: ExtractionConfig): Boolean = true

    override fun initialize() = log.info("Initializing plugin: ${name()}")

    override fun shutdown() = log.info("Shutting down plugin: ${name()}")

    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        log.info("Processing ${result.mimeType()} (${result.content().length} chars)")
        val hasNoContent = result.content().isEmpty()
        if (hasNoContent) {
            log.warning("Extraction resulted in empty content")
        }
    }
}
/** Registers a new [LoggingPostProcessor] through [PostProcessorBridge]. */
fun registerLoggingPostProcessor() {
    val processor = LoggingPostProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
```

View File

@@ -0,0 +1,55 @@
<!-- snippet:skip reason="kotlin.test is not on the snippet-runner classpath; the plugin-testing pattern documented here cannot compile under the runner's lightweight Kotlin profile. Run these tests from a real Gradle build." -->
```kotlin title="Kotlin"
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertFailsWith
import kotlin.test.assertTrue
/** Unit tests for `MinLengthValidator`: accept/reject behavior, stable identity, and registry round-trip. */
class MinLengthValidatorTest {
    /** Default configuration shared by the calls below; built fresh on each access. */
    private val defaultConfig: ExtractionConfig
        get() = ExtractionConfig.builder().build()

    /**
     * Builds a minimal plain-text result holding [content].
     *
     * NOTE(review): other snippets build ExtractionResult with `with*` setters (withContent,
     * withMimeType, withMetadata) — confirm these un-prefixed setter names exist.
     */
    private fun makeResult(content: String): ExtractionResult {
        val emptyMetadata = Metadata.builder().build()
        return ExtractionResult.builder()
            .content(content)
            .mimeType("text/plain")
            .metadata(emptyMetadata)
            .tables(emptyList())
            .processingWarnings(emptyList())
            .build()
    }

    @Test
    fun `validate accepts content above minimum length`() {
        // Passing means validate() returns without throwing.
        MinLengthValidator(minLength = 5).validate(makeResult("hello world"), defaultConfig)
    }

    @Test
    fun `validate rejects content below minimum length`() {
        val strict = MinLengthValidator(minLength = 100)
        val shortResult = makeResult("too short")
        assertFailsWith<IllegalStateException> {
            strict.validate(shortResult, defaultConfig)
        }
    }

    @Test
    fun `priority and name are stable`() {
        val subject = MinLengthValidator(minLength = 1)
        assertEquals("min-length-validator", subject.name())
        assertEquals(100, subject.priority())
        assertTrue(subject.should_validate(makeResult(""), defaultConfig))
    }

    @Test
    fun `registration round-trip exposes the plugin in the listing`() {
        ValidatorBridge.registerValidator(MinLengthValidator(minLength = 1))
        try {
            val registered = Kreuzberg.listValidators()
            assertTrue("min-length-validator" in registered)
        } finally {
            // Always unregister so the registry is not left holding the test plugin.
            ValidatorBridge.unregisterValidator("min-length-validator")
        }
    }
}
```

View File

@@ -0,0 +1,44 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Generic validator pattern: every IValidator has the same shape.
 * `name()` keys the registry, `priority()` orders execution (higher = earlier),
 * `should_validate()` is a fast skip-check, and `validate()` throws on failure.
 *
 * @param pluginName value returned by [name]
 * @param pluginPriority value returned by [priority]
 * @param check callback invoked by [validate]; it signals failure by throwing
 */
class GenericValidator(
    private val pluginName: String,
    private val pluginPriority: Int,
    private val check: (ExtractionResult, ExtractionConfig) -> Unit,
) : IValidator {
    override fun name(): String = pluginName

    override fun version(): String = "1.0.0"

    override fun priority(): Int = pluginPriority

    /** Applies to every result. */
    override fun should_validate(_result: ExtractionResult, _config: ExtractionConfig): Boolean = true

    override fun initialize() {
        // Optional: open resources, load config files, etc.
    }

    override fun shutdown() {
        // Optional: release resources held in initialize().
    }

    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        // Explicit invoke makes it clear this is the constructor callback, not kotlin.check().
        check.invoke(result, config)
    }
}
/**
 * Registers a [GenericValidator] named "non-empty-content" (priority 200) whose check throws
 * when the extracted content is blank.
 */
fun registerGenericValidator() {
    val rejectBlankContent: (ExtractionResult, ExtractionConfig) -> Unit = { result, _ ->
        require(result.content().isNotBlank()) { "Extracted content is blank" }
    }
    ValidatorBridge.registerValidator(
        GenericValidator(
            pluginName = "non-empty-content",
            pluginPriority = 200,
            check = rejectBlankContent,
        ),
    )
}
```

View File

@@ -0,0 +1,28 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Validator named "quality-score-validator" that rejects results whose quality score is below
 * [threshold]. It only applies to results that carry a quality score.
 *
 * @param threshold minimum accepted quality score (default 0.5)
 */
class QualityScoreValidator(private val threshold: Double = 0.5) : IValidator {
    override fun name(): String = "quality-score-validator"

    override fun version(): String = "1.0.0"

    override fun priority(): Int = 50

    override fun should_validate(_result: ExtractionResult, _config: ExtractionConfig): Boolean {
        return _result.qualityScore() != null
    }

    /** @throws IllegalStateException when the score (0.0 if absent) is below [threshold] */
    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        val score = result.qualityScore() ?: 0.0
        if (score < threshold) {
            error("Quality score too low: %.2f < %.2f".format(score, threshold))
        }
    }
}
/** Registers a [QualityScoreValidator] with a 0.5 threshold through [ValidatorBridge]. */
fun registerQualityScoreValidator() {
    val validator = QualityScoreValidator(threshold = 0.5)
    ValidatorBridge.registerValidator(validator)
}
```

View File

@@ -0,0 +1,47 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicLong
/**
 * Middle-stage post-processor named "stateful-plugin" that keeps state across calls: an atomic
 * call counter and a concurrent map holding the last MIME type and call number.
 */
class StatefulPlugin : IPostProcessor {
    private val invocations = AtomicLong(0)
    private val lastSeen: ConcurrentHashMap<String, String> = ConcurrentHashMap()

    override fun name(): String = "stateful-plugin"

    override fun version(): String = "1.0.0"

    override fun processing_stage(): ProcessingStage = ProcessingStage.Middle

    override fun priority(): Int = 50

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L

    /** Applies to every result. */
    override fun should_process(_result: ExtractionResult, _config: ExtractionConfig): Boolean = true

    /** Resets the counter and the cache. */
    override fun initialize() {
        invocations.set(0)
        lastSeen.clear()
    }

    /** Prints the total call count, then clears the cache. */
    override fun shutdown() {
        println("Plugin called ${invocations.get()} times")
        lastSeen.clear()
    }

    /** Bumps the counter and records this call's MIME type and number. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        val callNumber = invocations.incrementAndGet()
        lastSeen["last_mime"] = result.mimeType()
        lastSeen["last_call"] = callNumber.toString()
    }

    /** Number of `process` calls since the last `initialize`. */
    fun callCount(): Long = invocations.get()

    /** MIME type recorded by the most recent `process` call, or null when none is cached. */
    fun lastMime(): String? = lastSeen["last_mime"]
}
/** Registers a new [StatefulPlugin] through [PostProcessorBridge]. */
fun registerStatefulPlugin() {
    val plugin = StatefulPlugin()
    PostProcessorBridge.registerPostProcessor(plugin)
}
```

View File

@@ -0,0 +1,12 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Unregisters one plugin of each kind by name: a post-processor, a validator, an OCR backend,
 * and an embedding backend.
 */
fun unregisterPlugins() {
    // Each plugin type has a static unregister helper on its bridge class.
    // The string argument is the name returned by the plugin's name() method.
    val postProcessorName = "word-count"
    val validatorName = "min-length-validator"
    val ocrBackendName = "my-ocr-backend"
    val embedderName = "my-embedder"

    PostProcessorBridge.unregisterPostProcessor(postProcessorName)
    ValidatorBridge.unregisterValidator(validatorName)
    OcrBackendBridge.unregisterOcrBackend(ocrBackendName)
    EmbeddingBackendBridge.unregisterEmbeddingBackend(embedderName)
}
```

View File

@@ -0,0 +1,30 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
/**
 * Early-stage post-processor named "word-count" that prints the number of
 * whitespace-separated words in each non-empty result.
 */
class WordCountProcessor : IPostProcessor {
    override fun name(): String = "word-count"

    override fun version(): String = "1.0.0"

    override fun processing_stage(): ProcessingStage = ProcessingStage.Early

    override fun priority(): Int = 50

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L

    /** Skips results with empty content. */
    override fun should_process(_result: ExtractionResult, _config: ExtractionConfig): Boolean {
        return _result.content().isNotEmpty()
    }

    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        // Splitting on whitespace runs can yield empty leading/trailing tokens; drop them.
        val words = result.content().split(Regex("\\s+")).filter { it.isNotEmpty() }
        val wordCount = words.size
        // ExtractionResult is an immutable record on the Java side; observe
        // and report rather than mutate.
        println("[word-count] ${result.mimeType()} -> $wordCount words")
    }
}
/** Registers a new [WordCountProcessor] through [PostProcessorBridge]. */
fun registerWordCountProcessor() {
    val processor = WordCountProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `document.pdf` with chunking (1500 max characters, 200 overlap) and prints the chunk count. */
fun main() {
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1500L)
        .withOverlap(200L)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    // A null chunk list is reported as zero.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
```

View File

@@ -0,0 +1,35 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `research_paper.pdf` with chunking (500 max characters, 50 overlap) and "balanced"
 * preset embeddings, then prints each chunk's position, byte range, a 100-character preview,
 * and its embedding size when present.
 */
fun main() {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .build()
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, extractionConfig)
    extraction.chunks().orEmpty().forEach { chunk ->
        val info = chunk.metadata()
        // chunkIndex() is shifted by one for a 1-based display.
        println("Chunk ${info.chunkIndex() + 1}/${info.totalChunks()}")
        println("Position: ${info.byteStart()}-${info.byteEnd()}")

        val preview = chunk.content().take(100)
        println("Content: $preview...")

        val vector = chunk.embedding()
        if (vector != null) {
            println("Embedding: ${vector.size} dimensions")
        }
    }
}
```

View File

@@ -0,0 +1,27 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `document.pdf` with chunking (1024 max characters, 100 overlap) and "balanced" preset
 * embeddings (batch size 32, download progress hidden), then prints the chunk count.
 */
fun main() {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .withShowDownloadProgress(false)
        .build()
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1024L)
        .withOverlap(100L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    // A null chunk list is reported as zero.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks with embeddings: $chunkCount")
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `research_paper.pdf` with YAKE keyword extraction (at most 10 keywords, minimum
 * score 0.3) and prints the keywords when present.
 */
fun main() {
    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.3f)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, extractionConfig)
    val keywords = extraction.extractedKeywords()
    if (keywords != null) {
        println("Keywords: $keywords")
    }
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `scanned_document.pdf` with quality processing enabled and prints the quality score,
 * worded as a warning when it is below 0.5. Prints nothing when no score is returned.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withEnableQualityProcessing(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned_document.pdf"), null, extractionConfig)
    val score = extraction.qualityScore() ?: return

    val report = if (score < 0.5) {
        "Warning: Low quality extraction (%.2f)".format(score)
    } else {
        "Quality score: %.2f".format(score)
    }
    println(report)
}
```

View File

@@ -0,0 +1,17 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.util.Optional
/**
 * Embeds two sample strings with the "balanced" preset (normalization on) and prints how many
 * vectors came back and the size of the first one.
 */
fun main() {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .build()

    val inputs = listOf("Hello, world!", "Kreuzberg is fast")
    val vectors = Kreuzberg.embedTexts(inputs, embeddingConfig)

    println("Texts embedded: ${vectors.size}")
    // Indexing the first vector fails if no vectors were returned.
    val firstVector = vectors[0]
    println("Dimensions: ${firstVector.size}")
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `document.pdf` with token reduction in "moderate" mode (preserving important words)
 * and prints the resulting text.
 */
fun main() {
    val reduction = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(reduction))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    println(extraction.content())
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts `verbose_document.pdf` with token reduction in "moderate" mode (preserving important
 * words) and prints the length of the resulting text.
 */
fun main() {
    val reduction = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(reduction))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("verbose_document.pdf"), null, extractionConfig)
    val reducedLength = extraction.content().length
    println("Reduced content length: $reducedLength")
}
```

View File

@@ -0,0 +1,52 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * One chunk prepared for a vector store.
 *
 * @property id record identifier
 * @property content text of the chunk
 * @property embedding embedding vector for [content]
 * @property metadata string key/value attributes attached to the record
 */
data class VectorRecord(
    val id: String,
    val content: String,
    val embedding: List<Float>,
    val metadata: Map<String, String>,
)
/**
 * Extracts [documentPath] with chunking (512 max characters, 50 overlap) and "balanced" preset
 * embeddings, then converts every chunk that carries an embedding into a [VectorRecord].
 *
 * @param documentPath path of the document to extract
 * @param documentId identifier used as the record-id prefix and stored under `document_id`
 * @return one record per embedded chunk, in chunk order; empty when the result has no chunks
 */
fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .build()
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(512L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get(documentPath), null, extractionConfig)
    val collected = mutableListOf<VectorRecord>()

    for ((index, chunk) in extraction.chunks().orEmpty().withIndex()) {
        // Chunks without an embedding are skipped; the index still counts them.
        val vector = chunk.embedding() ?: continue
        val attributes = mapOf(
            "document_id" to documentId,
            "chunk_index" to index.toString(),
            "content_length" to chunk.content().length.toString(),
        )
        collected.add(
            VectorRecord(
                id = "${documentId}_chunk_$index",
                content = chunk.content(),
                embedding = vector,
                metadata = attributes,
            ),
        )
    }
    return collected
}
```