Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/kotlin/advanced/chunk_page_mapping.md
+++ b/docs/snippets/kotlin/advanced/chunk_page_mapping.md
@@ -0,0 +1,32 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(500L)
+        .withOverlap(50L)
+        .build()
+
+    val pages = PageConfig.builder()
+        .withExtractPages(true)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .withPages(Optional.of(pages))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    for (chunk in result.chunks().orEmpty()) {
+        val first = chunk.metadata().firstPage()
+        val last = chunk.metadata().lastPage()
+        if (first != null && last != null) {
+            val pageRange = if (first == last) "Page $first" else "Pages $first-$last"
+            val preview = chunk.content().take(50)
+            println("Chunk: $preview... ($pageRange)")
+        }
+    }
+}
+```
--- a/docs/snippets/kotlin/advanced/chunking_config.md
+++ b/docs/snippets/kotlin/advanced/chunking_config.md
@@ -0,0 +1,60 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(1000L)
+        .withOverlap(200L)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println("Chunks: ${result.chunks()?.size ?: 0}")
+}
+```
+
+```kotlin title="Kotlin - Semantic"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val chunking = ChunkingConfig.builder()
+        .withChunkerType(ChunkerType.Semantic)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println("Chunks: ${result.chunks()?.size ?: 0}")
+}
+```
+
+```kotlin title="Kotlin - Prepend Heading Context"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(500L)
+        .withOverlap(50L)
+        .withChunkerType(ChunkerType.Markdown)
+        .withPrependHeadingContext(true)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.md"), null, config)
+    println("Chunks: ${result.chunks()?.size ?: 0}")
+}
+```
--- a/docs/snippets/kotlin/advanced/chunking_rag.md
+++ b/docs/snippets/kotlin/advanced/chunking_rag.md
@@ -0,0 +1,35 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val embedding = EmbeddingConfig.builder()
+        .withModel(EmbeddingModelType.Preset("balanced"))
+        .withNormalize(true)
+        .build()
+
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(500L)
+        .withOverlap(50L)
+        .withEmbedding(Optional.of(embedding))
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, config)
+    for (chunk in result.chunks().orEmpty()) {
+        val metadata = chunk.metadata()
+        println("Chunk ${metadata.chunkIndex() + 1}/${metadata.totalChunks()}")
+        println("Position: ${metadata.byteStart()}-${metadata.byteEnd()}")
+        val text = chunk.content()
+        val preview = text.take(100)
+        println("Content: $preview...")
+        chunk.embedding()?.let { embedding ->
+            println("Embedding: ${embedding.size} dimensions")
+        }
+    }
+}
+```
--- a/docs/snippets/kotlin/advanced/embedding_with_chunking.md
+++ b/docs/snippets/kotlin/advanced/embedding_with_chunking.md
@@ -0,0 +1,27 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val embedding = EmbeddingConfig.builder()
+        .withModel(EmbeddingModelType.Preset("balanced"))
+        .withNormalize(true)
+        .withBatchSize(32L)
+        .withShowDownloadProgress(false)
+        .build()
+
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(1024L)
+        .withOverlap(100L)
+        .withEmbedding(Optional.of(embedding))
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println("Chunks with embeddings: ${result.chunks()?.size ?: 0}")
+}
+```
--- a/docs/snippets/kotlin/advanced/keyword_extraction_config.md
+++ b/docs/snippets/kotlin/advanced/keyword_extraction_config.md
@@ -0,0 +1,22 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val keywords = KeywordConfig.builder()
+        .withAlgorithm(KeywordAlgorithm.Yake)
+        .withMaxKeywords(10L)
+        .withMinScore(0.3f)
+        .withNgramRange(listOf(1L, 3L))
+        .withLanguage(Optional.of("en"))
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withKeywords(Optional.of(keywords))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println("Keywords: ${result.extractedKeywords()}")
+}
+```
--- a/docs/snippets/kotlin/advanced/keyword_extraction_example.md
+++ b/docs/snippets/kotlin/advanced/keyword_extraction_example.md
@@ -0,0 +1,22 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val keywords = KeywordConfig.builder()
+        .withAlgorithm(KeywordAlgorithm.Yake)
+        .withMaxKeywords(10L)
+        .withMinScore(0.3f)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withKeywords(Optional.of(keywords))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, config)
+    result.extractedKeywords()?.let { extracted ->
+        println("Keywords: $extracted")
+    }
+}
+```
--- a/docs/snippets/kotlin/advanced/language_detection_config.md
+++ b/docs/snippets/kotlin/advanced/language_detection_config.md
@@ -0,0 +1,20 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val languageDetection = LanguageDetectionConfig.builder()
+        .withEnabled(true)
+        .withMinConfidence(0.8)
+        .withDetectMultiple(false)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withLanguageDetection(Optional.of(languageDetection))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println("Detected languages: ${result.detectedLanguages()}")
+}
+```
--- a/docs/snippets/kotlin/advanced/language_detection_multilingual.md
+++ b/docs/snippets/kotlin/advanced/language_detection_multilingual.md
@@ -0,0 +1,20 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val languageDetection = LanguageDetectionConfig.builder()
+        .withEnabled(true)
+        .withMinConfidence(0.8)
+        .withDetectMultiple(true)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withLanguageDetection(Optional.of(languageDetection))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("multilingual_document.pdf"), null, config)
+    println("Detected languages: ${result.detectedLanguages()}")
+}
+```
--- a/docs/snippets/kotlin/advanced/quality_processing_config.md
+++ b/docs/snippets/kotlin/advanced/quality_processing_config.md
@@ -0,0 +1,14 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val config = ExtractionConfig.builder()
+        .withEnableQualityProcessing(true)
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println("Quality score: ${result.qualityScore()}")
+}
+```
--- a/docs/snippets/kotlin/advanced/quality_processing_example.md
+++ b/docs/snippets/kotlin/advanced/quality_processing_example.md
@@ -0,0 +1,22 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val config = ExtractionConfig.builder()
+        .withEnableQualityProcessing(true)
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("scanned_document.pdf"), null, config)
+
+    val score = result.qualityScore()
+    if (score != null) {
+        if (score < 0.5) {
+            println("Warning: Low quality extraction (%.2f)".format(score))
+        } else {
+            println("Quality score: %.2f".format(score))
+        }
+    }
+}
+```
--- a/docs/snippets/kotlin/advanced/token_reduction_config.md
+++ b/docs/snippets/kotlin/advanced/token_reduction_config.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val tokenReduction = TokenReductionOptions.builder()
+        .withMode("moderate")
+        .withPreserveImportantWords(true)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withTokenReduction(Optional.of(tokenReduction))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println("Reduced content: ${result.content()}")
+}
+```
--- a/docs/snippets/kotlin/advanced/token_reduction_example.md
+++ b/docs/snippets/kotlin/advanced/token_reduction_example.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val tokenReduction = TokenReductionOptions.builder()
+        .withMode("moderate")
+        .withPreserveImportantWords(true)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withTokenReduction(Optional.of(tokenReduction))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("verbose_document.pdf"), null, config)
+    println("Reduced content length: ${result.content().length}")
+}
+```
--- a/docs/snippets/kotlin/advanced/vector_database_integration.md
+++ b/docs/snippets/kotlin/advanced/vector_database_integration.md
@@ -0,0 +1,52 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+data class VectorRecord(
+    val id: String,
+    val content: String,
+    val embedding: List<Float>,
+    val metadata: Map<String, String>
+)
+
+/**
+ * Extracts [documentPath] with chunking and embeddings enabled and returns one
+ * [VectorRecord] per chunk that carries an embedding (other chunks are skipped).
+ * Record ids have the form `<documentId>_chunk_<index>`, index being the chunk's position.
+ */
+fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
+    val embedding = EmbeddingConfig.builder()
+        .withModel(EmbeddingModelType.Preset("balanced"))
+        .withNormalize(true)
+        .withBatchSize(32L)
+        .build()
+
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(512L)
+        .withOverlap(50L)
+        .withEmbedding(Optional.of(embedding))
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get(documentPath), null, config)
+
+    return result.chunks().orEmpty().mapIndexedNotNull { index, chunk ->
+        chunk.embedding()?.let { vector ->
+            VectorRecord(
+                id = "${documentId}_chunk_$index",
+                content = chunk.content(),
+                embedding = vector,
+                metadata = mapOf(
+                    "document_id" to documentId,
+                    "chunk_index" to index.toString(),
+                    "content_length" to chunk.content().length.toString(),
+                ),
+            )
+        }
+    }
+}
+```
--- a/docs/snippets/kotlin/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/kotlin/api/batch_extract_bytes_sync.md
@@ -0,0 +1,16 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+
+fun main() {
+    val config = ExtractionConfig.builder().build()
+    val items = listOf(
+        BatchBytesItem("Hello, world!".toByteArray(), "text/plain", null),
+        BatchBytesItem("# Heading\n\nParagraph text.".toByteArray(), "text/markdown", null),
+    )
+    val results = Kreuzberg.batchExtractBytesSync(items, config)
+
+    results.forEachIndexed { index, result ->
+        println("Item $index: ${result.content().length} chars")
+    }
+}
+```
--- a/docs/snippets/kotlin/api/batch_extract_files_sync.md
+++ b/docs/snippets/kotlin/api/batch_extract_files_sync.md
@@ -0,0 +1,18 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+
+fun main() {
+    val config = ExtractionConfig.builder().build()
+    val items = listOf(
+        BatchFileItem(Paths.get("doc1.pdf"), null),
+        BatchFileItem(Paths.get("doc2.docx"), null),
+        BatchFileItem(Paths.get("report.pdf"), null),
+    )
+    val results = Kreuzberg.batchExtractFilesSync(items, config)
+
+    results.forEachIndexed { index, result ->
+        println("File $index: ${result.content().length} chars")
+    }
+}
+```
--- a/docs/snippets/kotlin/api/client_chunk_text.md
+++ b/docs/snippets/kotlin/api/client_chunk_text.md
@@ -0,0 +1,31 @@
+```kotlin title="Kotlin"
+import java.net.URI
+import java.net.http.HttpClient
+import java.net.http.HttpRequest
+import java.net.http.HttpResponse
+
+fun main() {
+    val client = HttpClient.newHttpClient()
+    val json = """
+        {
+          "text": "Your long text here...",
+          "chunker_type": "text",
+          "config": {
+            "max_characters": 1000,
+            "overlap": 50,
+            "trim": true
+          }
+        }
+    """.trimIndent()
+
+    val request = HttpRequest.newBuilder()
+        .uri(URI.create("http://localhost:8000/chunk"))
+        .header("Content-Type", "application/json")
+        .POST(HttpRequest.BodyPublishers.ofString(json))
+        .build()
+
+    val response = client.send(request, HttpResponse.BodyHandlers.ofString())
+    println("Status: ${response.statusCode()}")
+    println(response.body())
+}
+```
--- a/docs/snippets/kotlin/api/client_extract_single_file.md
+++ b/docs/snippets/kotlin/api/client_extract_single_file.md
@@ -0,0 +1,38 @@
+```kotlin title="Kotlin"
+import java.net.URI
+import java.net.http.HttpClient
+import java.net.http.HttpRequest
+import java.net.http.HttpResponse
+import java.nio.file.Files
+import java.nio.file.Paths
+
+fun main() {
+    val client = HttpClient.newHttpClient()
+    val path = Paths.get("document.pdf")
+    val bytes = Files.readAllBytes(path)
+    val fileName = path.fileName.toString()
+
+    val boundary = "----KreuzbergBoundary${System.currentTimeMillis()}"
+    val crlf = "\r\n"
+    val header = (
+        "--$boundary$crlf" +
+        "Content-Disposition: form-data; name=\"file\"; filename=\"$fileName\"$crlf" +
+        "Content-Type: application/pdf$crlf$crlf"
+    ).toByteArray()
+    val footer = "$crlf--$boundary--$crlf".toByteArray()
+
+    val body = ByteArray(header.size + bytes.size + footer.size)
+    System.arraycopy(header, 0, body, 0, header.size)
+    System.arraycopy(bytes, 0, body, header.size, bytes.size)
+    System.arraycopy(footer, 0, body, header.size + bytes.size, footer.size)
+
+    val request = HttpRequest.newBuilder()
+        .uri(URI.create("http://localhost:8000/extract"))
+        .header("Content-Type", "multipart/form-data; boundary=$boundary")
+        .POST(HttpRequest.BodyPublishers.ofByteArray(body))
+        .build()
+
+    val response = client.send(request, HttpResponse.BodyHandlers.ofString())
+    println(response.body())
+}
+```
--- a/docs/snippets/kotlin/api/combining_all_features.md
+++ b/docs/snippets/kotlin/api/combining_all_features.md
@@ -0,0 +1,45 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val ocr = OcrConfig.builder()
+        .withBackend("tesseract")
+        .withLanguage("eng")
+        .build()
+
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(800L)
+        .withOverlap(100L)
+        .withChunkerType(ChunkerType.Markdown)
+        .withPrependHeadingContext(true)
+        .build()
+
+    val images = ImageExtractionConfig.builder()
+        .withExtractImages(true)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withOcr(Optional.of(ocr))
+        .withForceOcr(false)
+        .withChunking(Optional.of(chunking))
+        .withOutputFormat(OutputFormat.Markdown)
+        .withIncludeDocumentStructure(true)
+        .withImages(Optional.of(images))
+        .withUseCache(true)
+        .withEnableQualityProcessing(true)
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("report.pdf"), null, config)
+
+    val content = result.content()
+    println("Content (${content.length} chars):")
+    println(content.take(200)) // preview only: first 200 characters
+
+    result.chunks()?.let { println("\nChunks: ${it.size}") } // optional fields print only when present
+    println("Tables: ${result.tables()?.size ?: 0}")
+    result.detectedLanguages()?.let { println("Languages: $it") }
+    result.extractionMethod()?.let { println("Extraction method: $it") }
+}
+```
--- a/docs/snippets/kotlin/api/error_handling.md
+++ b/docs/snippets/kotlin/api/error_handling.md
@@ -0,0 +1,17 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+
+fun main() {
+    val config = ExtractionConfig.builder().build()
+    try {
+        val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+        println(result.content())
+    } catch (e: KreuzbergRsException) {
+        System.err.println("Extraction failed: ${e.message}")
+        System.err.println("Error code: ${e.code}")
+    } catch (e: Exception) {
+        System.err.println("Unexpected error: ${e.message}")
+    }
+}
+```
--- a/docs/snippets/kotlin/api/error_handling_extract.md
+++ b/docs/snippets/kotlin/api/error_handling_extract.md
@@ -0,0 +1,28 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Files
+import java.nio.file.Paths
+
+fun extractText(bytes: ByteArray, mimeType: String): String {
+    val config = ExtractionConfig.builder().build()
+    val result = Kreuzberg.extractBytesSync(bytes, mimeType, config)
+    return result.content()
+}
+
+fun main() {
+    val bytes = try {
+        Files.readAllBytes(Paths.get("document.pdf"))
+    } catch (e: Exception) {
+        System.err.println("Failed to read document.pdf: ${e.message}")
+        return // nothing to extract; do not pass an empty buffer to the extractor
+    }
+    try {
+        val text = extractText(bytes, "application/pdf")
+        println("Extracted ${text.length} chars")
+    } catch (e: KreuzbergRsException) {
+        System.err.println("Extraction error (code=${e.code}): ${e.message}")
+    } catch (e: Exception) {
+        System.err.println("Unexpected error: ${e.message}")
+    }
+}
+```
--- a/docs/snippets/kotlin/api/extract_bytes_async.md
+++ b/docs/snippets/kotlin/api/extract_bytes_async.md
@@ -0,0 +1,16 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import dev.kreuzberg.kt.Kreuzberg
+import kotlinx.coroutines.runBlocking
+import java.nio.file.Files
+import java.nio.file.Paths
+
+fun main() = runBlocking {
+    val content = Files.readAllBytes(Paths.get("document.pdf"))
+    val config = ExtractionConfig.builder().build()
+    val result = Kreuzberg.extractBytes(content, "application/pdf", config)
+
+    println(result.content())
+    println("Tables: ${result.tables()?.size ?: 0}")
+}
+```
--- a/docs/snippets/kotlin/api/extract_bytes_sync.md
+++ b/docs/snippets/kotlin/api/extract_bytes_sync.md
@@ -0,0 +1,14 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Files
+import java.nio.file.Paths
+
+fun main() {
+    val content = Files.readAllBytes(Paths.get("document.pdf"))
+    val config = ExtractionConfig.builder().build()
+    val result = Kreuzberg.extractBytesSync(content, "application/pdf", config)
+
+    println(result.content())
+    println("Tables: ${result.tables()?.size ?: 0}")
+}
+```
--- a/docs/snippets/kotlin/api/extract_file_async.md
+++ b/docs/snippets/kotlin/api/extract_file_async.md
@@ -0,0 +1,15 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import dev.kreuzberg.kt.Kreuzberg
+import kotlinx.coroutines.runBlocking
+import java.nio.file.Paths
+
+fun main() = runBlocking {
+    val config = ExtractionConfig.builder().build()
+    val result = Kreuzberg.extractFile(Paths.get("document.pdf"), null, config)
+
+    println(result.content())
+    println("MIME type: ${result.mimeType()}")
+    println("Tables: ${result.tables()?.size ?: 0}")
+}
+```
--- a/docs/snippets/kotlin/api/extract_file_sync.md
+++ b/docs/snippets/kotlin/api/extract_file_sync.md
@@ -0,0 +1,13 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+
+fun main() {
+    val config = ExtractionConfig.builder().build()
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+
+    println(result.content())
+    println("MIME type: ${result.mimeType()}")
+    println("Tables: ${result.tables()?.size ?: 0}")
+}
+```
--- a/docs/snippets/kotlin/config/advanced_config.md
+++ b/docs/snippets/kotlin/config/advanced_config.md
@@ -0,0 +1,63 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val ocr = OcrConfig.builder()
+        .withBackend("tesseract")
+        .withLanguage("eng")
+        .build()
+
+    val embedding = EmbeddingConfig.builder()
+        .withModel(EmbeddingModelType.Preset("balanced"))
+        .withBatchSize(32L)
+        .withNormalize(true)
+        .build()
+
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(1000L)
+        .withOverlap(200L)
+        .withEmbedding(Optional.of(embedding))
+        .build()
+
+    val languageDetection = LanguageDetectionConfig.builder()
+        .withEnabled(true)
+        .withMinConfidence(0.8)
+        .withDetectMultiple(false)
+        .build()
+
+    val keywords = KeywordConfig.builder()
+        .withAlgorithm(KeywordAlgorithm.Yake)
+        .withMaxKeywords(10L)
+        .withMinScore(0.1f)
+        .withNgramRange(listOf(1L, 3L))
+        .withLanguage(Optional.of("en"))
+        .build()
+
+    val tokenReduction = TokenReductionOptions.builder()
+        .withMode("moderate")
+        .withPreserveImportantWords(true)
+        .build()
+
+    val postprocessor = PostProcessorConfig.builder()
+        .withEnabled(true)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withUseCache(true)
+        .withEnableQualityProcessing(true)
+        .withOcr(Optional.of(ocr))
+        .withChunking(Optional.of(chunking))
+        .withLanguageDetection(Optional.of(languageDetection))
+        .withKeywords(Optional.of(keywords))
+        .withTokenReduction(Optional.of(tokenReduction))
+        .withPostprocessor(Optional.of(postprocessor))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println("Content: ${result.content()}")
+    result.detectedLanguages()?.let { println("Languages: $it") }
+    println("Chunks: ${result.chunks()?.size ?: 0}")
+}
+```
--- a/docs/snippets/kotlin/config/chunking_config.md
+++ b/docs/snippets/kotlin/config/chunking_config.md
@@ -0,0 +1,81 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(1000L)
+        .withOverlap(200L)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    val chunks = result.chunks().orEmpty()
+    println("Chunks: ${chunks.size}")
+    for (chunk in chunks) {
+        println("Length: ${chunk.content().length}")
+    }
+}
+```
+
+```kotlin title="Kotlin - Markdown with Heading Context"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val sizing = ChunkSizing.Tokenizer("Xenova/gpt-4o", Optional.empty())
+
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(500L)
+        .withOverlap(50L)
+        .withChunkerType(ChunkerType.Markdown)
+        .withSizing(sizing)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.md"), null, config)
+    for (chunk in result.chunks().orEmpty()) {
+        chunk.metadata().headingContext()?.let { ctx -> // print the heading trail only when the chunk has one
+            for (heading in ctx.headings()) {
+                println("Heading L${heading.level()}: ${heading.text()}")
+            }
+        }
+        val text = chunk.content()
+        println("Content: ${text.take(100)}...")
+    }
+}
+```
+
+```kotlin title="Kotlin - Prepend Heading Context"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(500L)
+        .withOverlap(50L)
+        .withChunkerType(ChunkerType.Markdown)
+        .withPrependHeadingContext(true)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.md"), null, config)
+    for (chunk in result.chunks().orEmpty()) {
+        // Each chunk's content is prefixed with its heading breadcrumb
+        val text = chunk.content()
+        println("Content: ${text.take(100)}...")
+    }
+}
+```
--- a/docs/snippets/kotlin/config/config_basic.md
+++ b/docs/snippets/kotlin/config/config_basic.md
@@ -0,0 +1,15 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val config = ExtractionConfig.builder()
+        .withUseCache(true)
+        .withEnableQualityProcessing(true)
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println(result.content())
+}
+```
--- a/docs/snippets/kotlin/config/config_discover.md
+++ b/docs/snippets/kotlin/config/config_discover.md
@@ -0,0 +1,17 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // The Java/Kotlin bindings build the configuration explicitly via the builder.
+    // This stands in for ExtractionConfig::discover() in Rust: start from the
+    // defaults and override only the fields you need.
+    val config = ExtractionConfig.builder()
+        .withUseCache(true)
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println(result.content())
+}
+```
--- a/docs/snippets/kotlin/config/config_ocr.md
+++ b/docs/snippets/kotlin/config/config_ocr.md
@@ -0,0 +1,20 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val ocr = OcrConfig.builder()
+        .withBackend("tesseract")
+        .withLanguage("eng")
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withOcr(Optional.of(ocr))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, config)
+    println("Content length: ${result.content().length}")
+    println("Tables detected: ${result.tables()?.size ?: 0}")
+}
+```
--- a/docs/snippets/kotlin/config/config_programmatic.md
+++ b/docs/snippets/kotlin/config/config_programmatic.md
@@ -0,0 +1,32 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val tesseract = TesseractConfig.builder()
+        .withPsm(6)
+        .build()
+
+    val ocr = OcrConfig.builder()
+        .withBackend("tesseract")
+        .withLanguage("eng+deu")
+        .withTesseractConfig(Optional.of(tesseract))
+        .build()
+
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(1000L)
+        .withOverlap(200L)
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withUseCache(true)
+        .withOcr(Optional.of(ocr))
+        .withChunking(Optional.of(chunking))
+        .withEnableQualityProcessing(true)
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println("Content length: ${result.content().length}")
+}
+```
--- a/docs/snippets/kotlin/config/document_structure_config.md
+++ b/docs/snippets/kotlin/config/document_structure_config.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val config = ExtractionConfig.builder()
+        .withIncludeDocumentStructure(true)
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    val document = result.document()
+    if (document != null) {
+        for (node in document.nodes()) {
+            println(node)
+        }
+    }
+}
+```
--- a/docs/snippets/kotlin/config/element_based_output.md
+++ b/docs/snippets/kotlin/config/element_based_output.md
@@ -0,0 +1,32 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Configure element-based output (resultFormat controls Unified vs ElementBased)
+    val config = ExtractionConfig.builder()
+        .withResultFormat(ResultFormat.ElementBased)
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+
+    val elements = result.elements().orEmpty()
+    for (element in elements) {
+        println("Type: ${element.elementType()}")
+        val text = element.text()
+        println("Text: ${text.take(100)}")
+
+        element.metadata().pageNumber()?.let { page ->
+            println("Page: $page")
+        }
+        println("---")
+    }
+
+    // Filter by element type
+    val titles = elements.filter { it.elementType() == ElementType.Title }
+    for (title in titles) {
+        println("Title: ${title.text()}")
+    }
+}
+```
--- a/docs/snippets/kotlin/config/embedding_config.md
+++ b/docs/snippets/kotlin/config/embedding_config.md
@@ -0,0 +1,27 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val embedding = EmbeddingConfig.builder()
+        .withModel(EmbeddingModelType.Preset("balanced"))
+        .withBatchSize(16L)
+        .withNormalize(true)
+        .withShowDownloadProgress(true)
+        .build()
+
+    val chunking = ChunkingConfig.builder()
+        .withMaxCharacters(1000L)
+        .withOverlap(200L)
+        .withEmbedding(Optional.of(embedding))
+        .build()
+
+    val config = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunking))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+    println("Chunks with embeddings: ${result.chunks()?.size ?: 0}")
+}
+```
--- a/docs/snippets/kotlin/config/html_output.md
+++ b/docs/snippets/kotlin/config/html_output.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val htmlSettings = HtmlOutputConfig.builder()
+        .withTheme(HtmlTheme.GitHub)
+        .build()
+
+    // Switch the output format to HTML and attach the theme settings.
+    val extractionConfig = ExtractionConfig.builder()
+        .withOutputFormat(OutputFormat.Html)
+        .withHtmlOutput(Optional.of(htmlSettings))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+    val html = extraction.content()
+    println(html) // HTML with kb-* classes
+}
+```
--- a/docs/snippets/kotlin/config/keyword_extraction_config.md
+++ b/docs/snippets/kotlin/config/keyword_extraction_config.md
@@ -0,0 +1,22 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Keyword extraction settings: algorithm, result cap, score floor,
+    // n-gram range and language.
+    val keywordConfig = KeywordConfig.builder()
+        .withAlgorithm(KeywordAlgorithm.Yake)
+        .withMaxKeywords(10L)
+        .withMinScore(0.1f)
+        .withNgramRange(listOf(1L, 3L))
+        .withLanguage(Optional.of("en"))
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withKeywords(Optional.of(keywordConfig))
+        .build()
+
+    val source = Paths.get("document.pdf")
+    val extracted = Kreuzberg.extractFileSync(source, null, extractionConfig).extractedKeywords()
+    println("Keywords: $extracted")
+}
+```
--- a/docs/snippets/kotlin/config/language_detection_config.md
+++ b/docs/snippets/kotlin/config/language_detection_config.md
@@ -0,0 +1,20 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Enable language detection with a 0.8 confidence floor and allow
+    // multiple languages to be reported.
+    val detectionConfig = LanguageDetectionConfig.builder()
+        .withEnabled(true)
+        .withMinConfidence(0.8)
+        .withDetectMultiple(true)
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withLanguageDetection(Optional.of(detectionConfig))
+        .build()
+
+    val source = Paths.get("document.pdf")
+    val languages = Kreuzberg.extractFileSync(source, null, extractionConfig).detectedLanguages()
+    println("Detected languages: $languages")
+}
+```
--- a/docs/snippets/kotlin/config/ocr_dpi_config.md
+++ b/docs/snippets/kotlin/config/ocr_dpi_config.md
@@ -0,0 +1,23 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Image extraction with an explicit target DPI, a dimension cap and
+    // automatic DPI adjustment bounded by min/max values.
+    val imageConfig = ImageExtractionConfig.builder()
+        .withExtractImages(true)
+        .withTargetDpi(300)
+        .withMaxImageDimension(4096)
+        .withAutoAdjustDpi(true)
+        .withMinDpi(150)
+        .withMaxDpi(600)
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withImages(Optional.of(imageConfig))
+        .build()
+
+    val source = Paths.get("document.pdf")
+    val imageCount = Kreuzberg.extractFileSync(source, null, extractionConfig).images()?.size ?: 0
+    println("Extracted images: $imageCount")
+}
+```
--- a/docs/snippets/kotlin/config/pdf_config.md
+++ b/docs/snippets/kotlin/config/pdf_config.md
@@ -0,0 +1,26 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val hierarchyConfig = HierarchyConfig.builder()
+        .withEnabled(true)
+        .build()
+
+    // PDF options: image and metadata extraction, a password list and the
+    // hierarchy settings built above.
+    val pdfConfig = PdfConfig.builder()
+        .withExtractImages(true)
+        .withPasswords(Optional.of(listOf("password123")))
+        .withExtractMetadata(true)
+        .withHierarchy(Optional.of(hierarchyConfig))
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withPdfOptions(Optional.of(pdfConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("encrypted.pdf"), null, extractionConfig)
+    val title = extraction.metadata().title()
+    val authors = extraction.metadata().authors()
+    println("Title: $title")
+    println("Authors: $authors")
+}
+```
--- a/docs/snippets/kotlin/config/pdf_hierarchy_config.md
+++ b/docs/snippets/kotlin/config/pdf_hierarchy_config.md
@@ -0,0 +1,26 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Hierarchy settings: cluster count, bounding boxes and an OCR coverage threshold.
+    val hierarchyConfig = HierarchyConfig.builder()
+        .withEnabled(true)
+        .withKClusters(5L)
+        .withIncludeBbox(true)
+        .withOcrCoverageThreshold(Optional.of(0.8f))
+        .build()
+
+    val pdfConfig = PdfConfig.builder()
+        .withHierarchy(Optional.of(hierarchyConfig))
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withPdfOptions(Optional.of(pdfConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+    val pageCount = extraction.pages().orEmpty().size
+    println("Pages: $pageCount")
+}
+```
--- a/docs/snippets/kotlin/config/postprocessor_config.md
+++ b/docs/snippets/kotlin/config/postprocessor_config.md
@@ -0,0 +1,22 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Names of the post-processors to enable.
+    val enabledProcessors = listOf(
+        "whitespace_normalizer",
+        "unicode_normalizer",
+    )
+
+    val postprocessorConfig = PostProcessorConfig.builder()
+        .withEnabled(true)
+        .withEnabledProcessors(Optional.of(enabledProcessors))
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withPostprocessor(Optional.of(postprocessorConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+    val processed = extraction.content()
+    println("Processed content: $processed")
+}
+```
--- a/docs/snippets/kotlin/config/quality_processing_config.md
+++ b/docs/snippets/kotlin/config/quality_processing_config.md
@@ -0,0 +1,16 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Turn on quality processing and caching for this extraction.
+    val extractionConfig = ExtractionConfig.builder()
+        .withEnableQualityProcessing(true)
+        .withUseCache(true)
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+    val score = extraction.qualityScore()
+    println("Quality score: $score")
+    val warningCount = extraction.processingWarnings()?.size ?: 0
+    println("Warnings: $warningCount")
+}
+```
--- a/docs/snippets/kotlin/config/tesseract_config.md
+++ b/docs/snippets/kotlin/config/tesseract_config.md
@@ -0,0 +1,26 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // The same language string is passed to both the Tesseract-specific
+    // config and the OCR config.
+    val languages = "eng+deu"
+
+    val tesseractConfig = TesseractConfig.builder()
+        .withLanguage(languages)
+        .withPsm(6)
+        .withOem(3)
+        .build()
+
+    val ocrConfig = OcrConfig.builder()
+        .withBackend("tesseract")
+        .withLanguage(languages)
+        .withTesseractConfig(Optional.of(tesseractConfig))
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withOcr(Optional.of(ocrConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extractionConfig)
+    val text = extraction.content()
+    println("OCR text: $text")
+}
+```
--- a/docs/snippets/kotlin/config/token_reduction_config.md
+++ b/docs/snippets/kotlin/config/token_reduction_config.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Token reduction in "moderate" mode, preserving important words.
+    val reductionOptions = TokenReductionOptions.builder()
+        .withMode("moderate")
+        .withPreserveImportantWords(true)
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withTokenReduction(Optional.of(reductionOptions))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+    val reduced = extraction.content()
+    println("Reduced content: $reduced")
+}
+```
--- a/docs/snippets/kotlin/getting-started/basic_usage.md
+++ b/docs/snippets/kotlin/getting-started/basic_usage.md
@@ -0,0 +1,11 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+
+fun main() {
+    // Extract with the default configuration and print the text and MIME type.
+    val defaultConfig = ExtractionConfig.builder().build()
+    val source = Paths.get("document.pdf")
+    val extraction = Kreuzberg.extractFileSync(source, null, defaultConfig)
+
+    println(extraction.content())
+    val mimeType = extraction.mimeType()
+    println("MIME type: $mimeType")
+}
+```
--- a/docs/snippets/kotlin/getting-started/extract_file.md
+++ b/docs/snippets/kotlin/getting-started/extract_file.md
@@ -0,0 +1,13 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+
+fun main() {
+    // Extract with the default configuration, then print the text, the MIME
+    // type and the number of tables (0 when none were returned).
+    val defaultConfig = ExtractionConfig.builder().build()
+    val source = Paths.get("document.pdf")
+    val extraction = Kreuzberg.extractFileSync(source, null, defaultConfig)
+
+    println(extraction.content())
+    val mimeType = extraction.mimeType()
+    println("MIME type: $mimeType")
+    val tableCount = extraction.tables()?.size ?: 0
+    println("Tables: $tableCount")
+}
+```
--- a/docs/snippets/kotlin/getting-started/extract_with_ocr.md
+++ b/docs/snippets/kotlin/getting-started/extract_with_ocr.md
@@ -0,0 +1,21 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val ocrConfig = OcrConfig.builder()
+        .withBackend("tesseract")
+        .withLanguage("eng")
+        .build()
+
+    // Attach the OCR settings and force OCR for this extraction.
+    val extractionConfig = ExtractionConfig.builder()
+        .withOcr(Optional.of(ocrConfig))
+        .withForceOcr(true)
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extractionConfig)
+    println(extraction.content())
+
+    // Detected languages are only printed when the result carries them.
+    val languages = extraction.detectedLanguages()
+    if (languages != null) {
+        println("Detected languages: $languages")
+    }
+}
+```
--- a/docs/snippets/kotlin/getting-started/hello_world.md
+++ b/docs/snippets/kotlin/getting-started/hello_world.md
@@ -0,0 +1,11 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+
+fun main() {
+    println("Hello from Kreuzberg!")
+
+    // Extract with the default configuration and print the resulting text.
+    val defaultConfig = ExtractionConfig.builder().build()
+    val source = Paths.get("document.pdf")
+    val text = Kreuzberg.extractFileSync(source, null, defaultConfig).content()
+    println(text)
+}
+```
--- a/docs/snippets/kotlin/getting-started/install_verify.md
+++ b/docs/snippets/kotlin/getting-started/install_verify.md
@@ -0,0 +1,8 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+
+fun main() {
+    // Build a default config and report whether a non-null instance came back.
+    val loaded = ExtractionConfig.builder().build() != null
+    println("Kreuzberg loaded: $loaded")
+}
+```
--- a/docs/snippets/kotlin/getting-started/read_content.md
+++ b/docs/snippets/kotlin/getting-started/read_content.md
@@ -0,0 +1,17 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+
+fun main() {
+    val defaultConfig = ExtractionConfig.builder().build()
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, defaultConfig)
+
+    // Print tables with 1-based numbering; skipped when no table list is returned.
+    val tables = extraction.tables()
+    if (tables != null) {
+        for ((index, table) in tables.withIndex()) {
+            println("Table ${index + 1}: $table")
+        }
+    }
+
+    // Same for chunks.
+    val chunks = extraction.chunks()
+    if (chunks != null) {
+        for ((index, chunk) in chunks.withIndex()) {
+            println("Chunk ${index + 1}: $chunk")
+        }
+    }
+}
+```
--- a/docs/snippets/kotlin/llm/structured_extraction.md
+++ b/docs/snippets/kotlin/llm/structured_extraction.md
@@ -0,0 +1,38 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // JSON-schema-shaped map describing the object to extract: a title, a
+    // list of authors and a date, all required, with no extra properties.
+    val outputSchema = mapOf(
+        "type" to "object",
+        "properties" to mapOf(
+            "title" to mapOf("type" to "string"),
+            "authors" to mapOf("type" to "array", "items" to mapOf("type" to "string")),
+            "date" to mapOf("type" to "string")
+        ),
+        "required" to listOf("title", "authors", "date"),
+        "additionalProperties" to false
+    )
+
+    val llmConfig = LlmConfig.builder()
+        .withModel("openai/gpt-4o-mini")
+        .build()
+
+    val structuredConfig = StructuredExtractionConfig(
+        outputSchema,
+        "document",
+        null,
+        true,
+        null,
+        llmConfig
+    )
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withStructuredExtraction(Optional.of(structuredConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("paper.pdf"), null, extractionConfig)
+
+    // Only print the structured output when the result carries one.
+    val structuredOutput = extraction.structuredOutput()
+    if (structuredOutput != null) {
+        println(structuredOutput)
+    }
+}
+```
--- a/docs/snippets/kotlin/mcp/mcp_custom_client.md
+++ b/docs/snippets/kotlin/mcp/mcp_custom_client.md
@@ -0,0 +1,32 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.util.Optional
+import java.io.BufferedReader
+import java.io.BufferedWriter
+import java.io.InputStreamReader
+import java.io.OutputStreamWriter
+
+fun main() {
+    // Spawn `kreuzberg mcp` with stderr merged into stdout.
+    val process = ProcessBuilder("kreuzberg", "mcp")
+        .redirectErrorStream(true)
+        .start()
+
+    // NOTE(review): MCP messages are JSON-RPC 2.0; this payload carries no
+    // "jsonrpc"/"id" fields and no initialize handshake precedes it. Confirm
+    // the server accepts it as-is.
+    val request = """
+        {"method":"tools/call","params":{"name":"extract_file","arguments":{"path":"document.pdf","async":true}}}
+    """.trimIndent()
+
+    try {
+        // Use UTF-8 explicitly instead of the platform default charset, and
+        // let use {} close both pipes even when the exchange throws.
+        BufferedWriter(OutputStreamWriter(process.outputStream, Charsets.UTF_8)).use { stdin ->
+            BufferedReader(InputStreamReader(process.inputStream, Charsets.UTF_8)).use { stdout ->
+                // Send the request as a single line.
+                stdin.write(request)
+                stdin.newLine()
+                stdin.flush()
+
+                // Read and print one line of response.
+                val response = stdout.readLine()
+                println(response)
+            }
+        }
+    } finally {
+        // Always terminate the child process, including on failure.
+        process.destroy()
+    }
+}
+```
--- a/docs/snippets/kotlin/mcp/mcp_server_start.md
+++ b/docs/snippets/kotlin/mcp/mcp_server_start.md
@@ -0,0 +1,11 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.util.Optional
+
+fun main() {
+    // Launch `kreuzberg mcp` sharing this process's standard streams, then
+    // block until it exits.
+    val server = ProcessBuilder("kreuzberg", "mcp").inheritIO().start()
+    server.waitFor()
+}
+```
--- a/docs/snippets/kotlin/metadata/language_detection.md
+++ b/docs/snippets/kotlin/metadata/language_detection.md
@@ -0,0 +1,20 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Enable language detection with a 0.9 confidence floor and multiple
+    // detection switched off.
+    val detectionConfig = LanguageDetectionConfig.builder()
+        .withEnabled(true)
+        .withMinConfidence(0.9)
+        .withDetectMultiple(false)
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withLanguageDetection(Optional.of(detectionConfig))
+        .build()
+
+    val source = Paths.get("document.pdf")
+    val languages = Kreuzberg.extractFileSync(source, null, extractionConfig).detectedLanguages()
+    println("Detected languages: $languages")
+}
+```
--- a/docs/snippets/kotlin/metadata/language_detection_multilingual.md
+++ b/docs/snippets/kotlin/metadata/language_detection_multilingual.md
@@ -0,0 +1,25 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val detectionConfig = LanguageDetectionConfig.builder()
+        .withEnabled(true)
+        .withMinConfidence(0.8)
+        .withDetectMultiple(true)
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withLanguageDetection(Optional.of(detectionConfig))
+        .build()
+
+    val source = Paths.get("multilingual_document.pdf")
+    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)
+
+    // Fall back to an empty list when no languages were reported, then print
+    // the list followed by one bullet per language.
+    val languages = extraction.detectedLanguages().orEmpty()
+    println("Detected languages: $languages")
+    languages.forEach { println("  - $it") }
+}
+```
--- a/docs/snippets/kotlin/metadata/metadata.md
+++ b/docs/snippets/kotlin/metadata/metadata.md
@@ -0,0 +1,60 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val extractionConfig = ExtractionConfig.builder().build()
+    val pdfResult = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+
+    // Fields available on the generic metadata; each is printed only when present.
+    val documentMetadata = pdfResult.metadata()
+    documentMetadata.title()?.let { title -> println("Title: $title") }
+    documentMetadata.authors()?.let { authors -> println("Authors: ${authors.joinToString(", ")}") }
+
+    // Format-specific metadata via discriminated union
+    val pdfMetadata = documentMetadata.format()?.pdf()
+    if (pdfMetadata != null) {
+        pdfMetadata.pageCount()?.let { println("Pages: $it") }
+        pdfMetadata.producer()?.let { println("Producer: $it") }
+        pdfMetadata.pdfVersion()?.let { println("PDF Version: $it") }
+    }
+
+    // Access HTML metadata; nothing more to print when it is absent.
+    val htmlResult = Kreuzberg.extractFileSync(Paths.get("page.html"), null, extractionConfig)
+    val html = htmlResult.metadata().format()?.html() ?: return
+
+    html.title()?.let { println("Title: $it") }
+    html.description()?.let { println("Description: $it") }
+    html.canonicalUrl()?.let { println("Canonical URL: $it") }
+    html.language()?.let { println("Language: $it") }
+
+    // Access keywords list
+    println("Keywords: ${html.keywords()}")
+
+    // Open Graph fields are exposed as a Map<String, String>
+    val openGraph = html.openGraph()
+    openGraph["image"]?.let { println("Open Graph Image: $it") }
+    openGraph["title"]?.let { println("Open Graph Title: $it") }
+
+    // Twitter Card fields as a Map<String, String>
+    html.twitterCard()["card"]?.let { println("Twitter Card Type: $it") }
+
+    // Headers
+    html.headers().forEach { header ->
+        println("Header (level ${header.level()}): ${header.text()}")
+    }
+
+    // Links
+    html.links().forEach { link ->
+        println("Link: ${link.href()} (${link.text()})")
+    }
+
+    // Images
+    html.images().forEach { image ->
+        println("Image: ${image.src()}")
+    }
+
+    // Structured data
+    val structuredData = html.structuredData()
+    if (structuredData.isNotEmpty()) {
+        println("Structured data items: ${structuredData.size}")
+    }
+}
+```
--- a/docs/snippets/kotlin/metadata/page_boundaries.md
+++ b/docs/snippets/kotlin/metadata/page_boundaries.md
@@ -0,0 +1,24 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+
+fun main() {
+    val config = ExtractionConfig.builder().build()
+    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
+
+    // Stop early when the document carries no page structure or boundaries.
+    val pages = result.metadata().pages() ?: return
+    val boundaries = pages.boundaries() ?: return
+
+    // Boundaries are byte offsets, whereas Kotlin String indices count UTF-16
+    // code units. Slice the encoded bytes rather than the String so that
+    // non-ASCII content neither shifts the page ranges nor overruns them.
+    // NOTE(review): assumes the offsets refer to the UTF-8 encoding of
+    // content(); confirm against the library documentation.
+    val contentBytes = result.content().encodeToByteArray()
+
+    // Show the first three pages only.
+    for (boundary in boundaries.take(3)) {
+        val start = boundary.byteStart().toInt()
+        val end = boundary.byteEnd().toInt()
+        val pageText = contentBytes.decodeToString(start, end)
+
+        println("Page ${boundary.pageNumber()}:")
+        println("  Byte range: $start-$end")
+        // take(100) caps the preview at 100 characters without index math.
+        println("  Preview: ${pageText.take(100)}...")
+    }
+}
+```
--- a/docs/snippets/kotlin/metadata/page_tracking_basic.md
+++ b/docs/snippets/kotlin/metadata/page_tracking_basic.md
@@ -0,0 +1,25 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Ask the extractor to return per-page results.
+    val pageSettings = PageConfig.builder()
+        .withExtractPages(true)
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withPages(Optional.of(pageSettings))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+
+    // Nothing to report when no page list was returned.
+    val pages = extraction.pages() ?: return
+    pages.forEach { page ->
+        println("Page ${page.pageNumber()}:")
+        println("  Content: ${page.content().length} chars")
+        println("  Tables: ${page.tables().size}")
+        println("  Images: ${page.images().size}")
+    }
+}
+```
--- a/docs/snippets/kotlin/metadata/tables.md
+++ b/docs/snippets/kotlin/metadata/tables.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+
+fun main() {
+    val extractionConfig = ExtractionConfig.builder().build()
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+
+    // For each table: a summary line, its markdown rendering, then each row of cells.
+    extraction.tables().orEmpty().forEach { table ->
+        val rowCount = table.cells().size
+        println("Table on page ${table.pageNumber()} with $rowCount rows")
+        println(table.markdown())
+
+        table.cells().forEach { row -> println(row) }
+    }
+}
+```
--- a/docs/snippets/kotlin/metadata/vector_database_integration.md
+++ b/docs/snippets/kotlin/metadata/vector_database_integration.md
@@ -0,0 +1,57 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+/**
+ * One entry destined for a vector store.
+ *
+ * @property id identifier of the record
+ * @property content text the embedding was computed for
+ * @property embedding embedding vector
+ * @property metadata string key/value pairs stored alongside the vector
+ */
+data class VectorRecord(
+    val id: String,
+    val content: String,
+    val embedding: List<Float>,
+    val metadata: Map<String, String>,
+)
+
+/**
+ * Extracts [documentPath] with chunking and embeddings enabled and turns every
+ * chunk that has an embedding into a [VectorRecord].
+ *
+ * @param documentPath path of the document to extract
+ * @param documentId identifier used in record ids and stored in the metadata
+ * @return one record per embedded chunk; empty when no chunks are returned
+ */
+fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
+    val embeddingConfig = EmbeddingConfig.builder()
+        .withModel(EmbeddingModelType.Preset("balanced"))
+        .withNormalize(true)
+        .withBatchSize(32L)
+        .build()
+
+    val chunkingConfig = ChunkingConfig.builder()
+        .withMaxCharacters(512L)
+        .withOverlap(50L)
+        .withEmbedding(Optional.of(embeddingConfig))
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunkingConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get(documentPath), null, extractionConfig)
+
+    // The index is the chunk's position in the full chunk list, so ids stay
+    // stable even when chunks without an embedding are skipped.
+    return extraction.chunks().orEmpty().mapIndexedNotNull { index, chunk ->
+        val vector = chunk.embedding() ?: return@mapIndexedNotNull null
+        val chunkMetadata = mapOf(
+            "document_id" to documentId,
+            "chunk_index" to index.toString(),
+            "content_length" to chunk.content().length.toString(),
+        )
+        VectorRecord(
+            id = "${documentId}_chunk_$index",
+            content = chunk.content(),
+            embedding = vector,
+            metadata = chunkMetadata,
+        )
+    }
+}
+
+fun main() {
+    // Vectorize a sample document and report how many records were produced.
+    val recordCount = extractAndVectorize("document.pdf", "doc-001").size
+    println("Generated $recordCount vector records")
+}
+```
--- a/docs/snippets/kotlin/ocr/cloud_ocr_backend.md
+++ b/docs/snippets/kotlin/ocr/cloud_ocr_backend.md
@@ -0,0 +1,45 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Path
+
+/**
+ * Example [IOcrBackend] implementation that delegates image OCR to a
+ * placeholder cloud call.
+ *
+ * @param apiKey credential for the cloud service (not read by the placeholder call)
+ * @param supportedLangs language codes this backend reports as supported
+ */
+class CloudOcrBackend(
+    private val apiKey: String,
+    private val supportedLangs: List<String>,
+) : IOcrBackend {
+
+    override fun name(): String {
+        return "cloud-ocr"
+    }
+
+    override fun version(): String {
+        return "1.0.0"
+    }
+
+    /** Runs the cloud call on the image bytes and wraps the text in a plain-text result. */
+    override fun process_image(image_bytes: ByteArray, config: OcrConfig): ExtractionResult {
+        val recognized = callCloudApi(image_bytes, config.language())
+        val emptyMetadata = Metadata.builder().build()
+        return ExtractionResult.builder()
+            .withContent(recognized)
+            .withMimeType("text/plain")
+            .withMetadata(emptyMetadata)
+            .build()
+    }
+
+    /** Reads the whole file into memory and delegates to [process_image]. */
+    override fun process_image_file(path: Path, config: OcrConfig): ExtractionResult {
+        val bytes = java.nio.file.Files.readAllBytes(path)
+        return process_image(bytes, config)
+    }
+
+    override fun supports_language(lang: String): Boolean {
+        return lang in supportedLangs
+    }
+
+    override fun backend_type(): OcrBackendType {
+        return OcrBackendType.Custom
+    }
+
+    override fun supported_languages(): List<String> {
+        return supportedLangs
+    }
+
+    override fun supports_table_detection(): Boolean {
+        return false
+    }
+
+    override fun supports_document_processing(): Boolean {
+        return false
+    }
+
+    /** Always throws: this backend only handles single images. */
+    override fun process_document(_path: Path, _config: OcrConfig): ExtractionResult {
+        throw UnsupportedOperationException("document processing not supported")
+    }
+
+    // Placeholder for the real service call; returns a fixed string.
+    private fun callCloudApi(image: ByteArray, language: String): String = "Extracted text"
+}
+```
--- a/docs/snippets/kotlin/ocr/image_extraction.md
+++ b/docs/snippets/kotlin/ocr/image_extraction.md
@@ -0,0 +1,18 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Enable image extraction with otherwise default settings.
+    val imageConfig = ImageExtractionConfig.builder()
+        .withExtractImages(true)
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withImages(Optional.of(imageConfig))
+        .build()
+
+    val source = Paths.get("document.pdf")
+    val imageCount = Kreuzberg.extractFileSync(source, null, extractionConfig).images()?.size ?: 0
+    println("Extracted images: $imageCount")
+}
+```
--- a/docs/snippets/kotlin/ocr/image_preprocessing.md
+++ b/docs/snippets/kotlin/ocr/image_preprocessing.md
@@ -0,0 +1,20 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Image extraction with an explicit target DPI and a dimension cap.
+    val imageConfig = ImageExtractionConfig.builder()
+        .withExtractImages(true)
+        .withTargetDpi(300)
+        .withMaxImageDimension(4096)
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withImages(Optional.of(imageConfig))
+        .build()
+
+    val source = Paths.get("document.pdf")
+    val imageCount = Kreuzberg.extractFileSync(source, null, extractionConfig).images()?.size ?: 0
+    println("Extracted images: $imageCount")
+}
+```
--- a/docs/snippets/kotlin/ocr/ocr_easyocr.md
+++ b/docs/snippets/kotlin/ocr/ocr_easyocr.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Select the "easyocr" backend with language "en".
+    val ocrConfig = OcrConfig.builder()
+        .withBackend("easyocr")
+        .withLanguage("en")
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withOcr(Optional.of(ocrConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+    val text = extraction.content()
+    println("Extracted text: $text")
+}
+```
--- a/docs/snippets/kotlin/ocr/ocr_elements.md
+++ b/docs/snippets/kotlin/ocr/ocr_elements.md
@@ -0,0 +1,31 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Ask the OCR backend to include per-element output.
+    val elementSettings = OcrElementConfig.builder()
+        .withIncludeElements(true)
+        .build()
+
+    val ocrConfig = OcrConfig.builder()
+        .withBackend("paddleocr")
+        .withLanguage("en")
+        .withElementConfig(Optional.of(elementSettings))
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withOcr(Optional.of(ocrConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extractionConfig)
+
+    // Nothing to print when no OCR elements were returned.
+    val ocrElements = extraction.ocrElements() ?: return
+    for (element in ocrElements) {
+        println("Text: ${element.text()}")
+        println("Confidence: ${element.confidence().recognition()}")
+        println("Geometry: ${element.geometry()}")
+
+        // Rotation is optional, so it is printed only when present.
+        val rotation = element.rotation()
+        if (rotation != null) {
+            println("Rotation: $rotation")
+        }
+        println()
+    }
+}
+```
--- a/docs/snippets/kotlin/ocr/ocr_extraction.md
+++ b/docs/snippets/kotlin/ocr/ocr_extraction.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Select the "tesseract" backend with language "eng".
+    val ocrConfig = OcrConfig.builder()
+        .withBackend("tesseract")
+        .withLanguage("eng")
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withOcr(Optional.of(ocrConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extractionConfig)
+    val text = extraction.content()
+    println(text)
+}
+```
--- a/docs/snippets/kotlin/ocr/ocr_force_all_pages.md
+++ b/docs/snippets/kotlin/ocr/ocr_force_all_pages.md
@@ -0,0 +1,20 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val ocrConfig = OcrConfig.builder()
+        .withBackend("tesseract")
+        .withLanguage("eng")
+        .build()
+
+    // Attach the OCR settings and force OCR for this extraction.
+    val extractionConfig = ExtractionConfig.builder()
+        .withOcr(Optional.of(ocrConfig))
+        .withForceOcr(true)
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+    val text = extraction.content()
+    println(text)
+}
+```
--- a/docs/snippets/kotlin/ocr/ocr_multi_language.md
+++ b/docs/snippets/kotlin/ocr/ocr_multi_language.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Several languages are passed to the backend as one "+"-joined string.
+    val ocrConfig = OcrConfig.builder()
+        .withBackend("tesseract")
+        .withLanguage("eng+deu")
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withOcr(Optional.of(ocrConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("multilingual.pdf"), null, extractionConfig)
+    val text = extraction.content()
+    println(text)
+}
+```
--- a/docs/snippets/kotlin/ocr/ocr_paddleocr.md
+++ b/docs/snippets/kotlin/ocr/ocr_paddleocr.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Select the "paddleocr" backend with language "en".
+    val ocrConfig = OcrConfig.builder()
+        .withBackend("paddleocr")
+        .withLanguage("en")
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withOcr(Optional.of(ocrConfig))
+        .build()
+
+    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
+    val text = extraction.content()
+    println("Extracted text: $text")
+}
+```
--- a/docs/snippets/kotlin/plugins/clear_plugins.md
+++ b/docs/snippets/kotlin/plugins/clear_plugins.md
@@ -0,0 +1,14 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import dev.kreuzberg.kt.Kreuzberg
+
+/** Clears the registered post-processors, OCR backends and validators, then prints a confirmation. */
+fun clearAllPlugins() {
+    // Note: there is no Kreuzberg.clearDocumentExtractors() — extractor
+    // registration is not exposed through the Kotlin/Java plugin bridge.
+    Kreuzberg.clearPostProcessors()
+    Kreuzberg.clearOcrBackends()
+    Kreuzberg.clearValidators()
+
+    val confirmation = "All post-processors, OCR backends, and validators cleared"
+    println(confirmation)
+}
+```
--- a/docs/snippets/kotlin/plugins/embedding_backend.md
+++ b/docs/snippets/kotlin/plugins/embedding_backend.md
@@ -0,0 +1,22 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+
+// Wrap a host-language embedding model so kreuzberg can call back into it
+// during chunking and standalone embed requests.
+/**
+ * Placeholder [IEmbeddingBackend] that returns all-zero vectors.
+ *
+ * @param dim number of elements in every vector returned by [embed]
+ */
+class MyEmbedder(private val dim: Long = 768L) : IEmbeddingBackend {
+    override fun name(): String {
+        return "my-embedder"
+    }
+
+    override fun version(): String {
+        return "1.0.0"
+    }
+
+    override fun dimensions(): Long {
+        return dim
+    }
+
+    // Replace this with a real model invocation. Each inner list must
+    // have exactly `dimensions()` elements — the bridge validates shape.
+    override fun embed(texts: List<String>): List<List<Float>> =
+        texts.map { List(dim.toInt()) { 0.0f } }
+}
+
+/** Registers a [MyEmbedder] with default dimensions through the embedding bridge. */
+fun registerMyEmbedder() {
+    val backend = MyEmbedder()
+    EmbeddingBackendBridge.registerEmbeddingBackend(backend)
+}
+```
--- a/docs/snippets/kotlin/plugins/extractor_registration.md
+++ b/docs/snippets/kotlin/plugins/extractor_registration.md
@@ -0,0 +1,21 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import dev.kreuzberg.kt.Kreuzberg
+
+// The Kotlin/Java plugin bridge does not expose an IDocumentExtractor interface
+// — extractor registration lives in the Rust core. From Kotlin you can list
+// the extractors that are already registered and route extraction through the
+// existing facade.
+fun useRegisteredExtractors() {
+    // List the extractors that are already registered.
+    println("Available extractors: ${Kreuzberg.listDocumentExtractors()}")
+
+    // Extract through the facade with the default configuration.
+    val defaultConfig = ExtractionConfig.builder().build()
+    val source = java.nio.file.Path.of("document.pdf")
+    val extraction = Kreuzberg.extractFileSync(source, null, defaultConfig)
+
+    val characterCount = extraction.content().length
+    println("Extracted $characterCount characters via ${extraction.mimeType()}")
+}
+```
--- a/docs/snippets/kotlin/plugins/list_plugins.md
+++ b/docs/snippets/kotlin/plugins/list_plugins.md
@@ -0,0 +1,18 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import dev.kreuzberg.kt.Kreuzberg
+
+/** Prints the registered extractors, post-processors, OCR backends and validators. */
+fun listAllPlugins() {
+    println("Registered extractors: ${Kreuzberg.listDocumentExtractors()}")
+    println("Registered post-processors: ${Kreuzberg.listPostProcessors()}")
+    println("Registered OCR backends: ${Kreuzberg.listOcrBackends()}")
+    println("Registered validators: ${Kreuzberg.listValidators()}")
+}
+```
--- a/docs/snippets/kotlin/plugins/min_length_validator.md
+++ b/docs/snippets/kotlin/plugins/min_length_validator.md
@@ -0,0 +1,28 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+
+/**
+ * [IValidator] that rejects results whose content is shorter than [minLength] characters.
+ *
+ * @param minLength minimum accepted content length, in characters
+ */
+class MinLengthValidator(private val minLength: Int) : IValidator {
+    override fun name(): String {
+        return "min-length-validator"
+    }
+
+    override fun version(): String {
+        return "1.0.0"
+    }
+
+    /** Throws [IllegalStateException] when the content is shorter than [minLength]. */
+    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
+        val length = result.content().length
+        // check() raises IllegalStateException with the lazily built message.
+        check(length >= minLength) { "Content too short: $length < $minLength characters" }
+    }
+
+    /** Applies to every result. */
+    override fun should_validate(
+        _result: ExtractionResult,
+        _config: ExtractionConfig,
+    ): Boolean {
+        return true
+    }
+
+    override fun priority(): Int {
+        return 100
+    }
+}
+
+/** Registers a [MinLengthValidator] requiring at least 100 characters. */
+fun registerMinLengthValidator() {
+    val validator = MinLengthValidator(minLength = 100)
+    ValidatorBridge.registerValidator(validator)
+}
+```
--- a/docs/snippets/kotlin/plugins/pdf_metadata_extractor.md
+++ b/docs/snippets/kotlin/plugins/pdf_metadata_extractor.md
@@ -0,0 +1,38 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.util.concurrent.atomic.AtomicInteger
+
+/** Post-processor that prints the title and authors of each PDF result, with a running counter. */
+class PdfMetadataExtractor : IPostProcessor {
+    // Number of PDF results handled so far.
+    private val processed = AtomicInteger(0)
+
+    override fun name(): String = "pdf-metadata-extractor"
+    override fun version(): String = "1.0.0"
+    override fun priority(): Int = 25
+    override fun processing_stage(): ProcessingStage = ProcessingStage.Late
+    override fun estimated_duration_ms(_result: ExtractionResult): Long = 2L
+
+    override fun should_process(
+        _result: ExtractionResult,
+        _config: ExtractionConfig,
+    ): Boolean = _result.mimeType() == "application/pdf"
+
+    override fun process(result: ExtractionResult, config: ExtractionConfig) {
+        // Same MIME test as should_process(), repeated in case process() is called directly.
+        if (result.mimeType() == "application/pdf") {
+            val count = processed.incrementAndGet()
+            val metadata: Metadata = result.metadata()
+            // Metadata is an immutable record, so the fields are read and
+            // reported on stdout; nothing on the result is modified.
+            println(
+                "[pdf-metadata] #$count title=${metadata.title()} authors=${metadata.authors()}",
+            )
+        }
+    }
+}
+
+fun registerPdfMetadataExtractor() {
+    PdfMetadataExtractor().also { PostProcessorBridge.registerPostProcessor(it) }
+}
+```
--- a/docs/snippets/kotlin/plugins/pdf_only_processor.md
+++ b/docs/snippets/kotlin/plugins/pdf_only_processor.md
@@ -0,0 +1,30 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+
+/** Post-processor that reports the content length of PDF results and ignores everything else. */
+class PdfOnlyProcessor : IPostProcessor {
+    override fun name(): String = "pdf-only-processor"
+    override fun version(): String = "1.0.0"
+    override fun priority(): Int = 50
+    override fun processing_stage(): ProcessingStage = ProcessingStage.Middle
+    override fun estimated_duration_ms(_result: ExtractionResult): Long = 5L
+
+    // Gate: when should_process() returns false the JSON roundtrip is skipped.
+    override fun should_process(
+        _result: ExtractionResult,
+        _config: ExtractionConfig,
+    ): Boolean = _result.mimeType() == "application/pdf"
+
+    override fun process(result: ExtractionResult, config: ExtractionConfig) {
+        // Repeat the MIME guard here in addition to the should_process() gate.
+        val isPdf = result.mimeType() == "application/pdf"
+        if (isPdf) {
+            println("[pdf-only] processing PDF (${result.content().length} chars)")
+        }
+    }
+}
+
+fun registerPdfOnlyProcessor() {
+    PdfOnlyProcessor().also { PostProcessorBridge.registerPostProcessor(it) }
+}
+```
--- a/docs/snippets/kotlin/plugins/plugin_extractor.md
+++ b/docs/snippets/kotlin/plugins/plugin_extractor.md
@@ -0,0 +1,33 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import dev.kreuzberg.kt.Kreuzberg
+import java.nio.file.Files
+import java.nio.file.Path
+
+/**
+ * "Extractor plugin" pattern for Kotlin. The bindings bridge IPostProcessor, IValidator,
+ * IOcrBackend, and IEmbeddingBackend, but not IDocumentExtractor: the Rust core selects the
+ * extractor from the MIME type, so Kotlin code wraps Kreuzberg.extractBytes / extractFile.
+ */
+class GenericExtractorClient {
+    /** Extracts from a file; a null [mimeType] is handed to Kreuzberg.extractFile as-is. */
+    suspend fun extractFile(
+        path: Path,
+        mimeType: String? = null,
+        config: ExtractionConfig = ExtractionConfig.builder().build(),
+    ): ExtractionResult = Kreuzberg.extractFile(path, mimeType, config)
+    /** Extracts from in-memory bytes whose MIME type the caller states explicitly. */
+    suspend fun extractBytes(
+        content: ByteArray,
+        mimeType: String,
+        config: ExtractionConfig = ExtractionConfig.builder().build(),
+    ): ExtractionResult = Kreuzberg.extractBytes(content, mimeType, config)
+}
+
+suspend fun extractCustomPayload() {
+    // Read the whole file up front, then extract it as JSON with the client's default config.
+    val payload = Files.readAllBytes(Path.of("payload.json"))
+    val result = GenericExtractorClient().extractBytes(payload, mimeType = "application/json")
+    println("Extracted ${result.content().length} chars")
+}
+```
--- a/docs/snippets/kotlin/plugins/plugin_logging.md
+++ b/docs/snippets/kotlin/plugins/plugin_logging.md
@@ -0,0 +1,41 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.util.logging.Logger
+
+/** Post-processor that logs lifecycle events and a summary line for every result it receives. */
+class LoggingPostProcessor : IPostProcessor {
+    private val log: Logger = Logger.getLogger(LoggingPostProcessor::class.java.name)
+
+    override fun name(): String = "logging-post-processor"
+    override fun version(): String = "1.0.0"
+    override fun priority(): Int = 10
+    override fun processing_stage(): ProcessingStage = ProcessingStage.Late
+    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L
+
+    // Lifecycle hooks only announce themselves in the log.
+    override fun initialize() {
+        log.info("Initializing plugin: ${name()}")
+    }
+
+    override fun shutdown() {
+        log.info("Shutting down plugin: ${name()}")
+    }
+
+    /** Accepts every result; there is no MIME or content filter. */
+    override fun should_process(
+        _result: ExtractionResult,
+        _config: ExtractionConfig,
+    ): Boolean = true
+
+    /** Logs MIME type and content length at INFO, plus a WARNING when the content is empty. */
+    override fun process(result: ExtractionResult, config: ExtractionConfig) {
+        log.info("Processing ${result.mimeType()} (${result.content().length} chars)")
+        // Empty content gets its own WARNING entry.
+        if (result.content().isEmpty()) log.warning("Extraction resulted in empty content")
+    }
+}
+
+fun registerLoggingPostProcessor() {
+    LoggingPostProcessor().also { PostProcessorBridge.registerPostProcessor(it) }
+}
+```
--- a/docs/snippets/kotlin/plugins/plugin_testing.md
+++ b/docs/snippets/kotlin/plugins/plugin_testing.md
@@ -0,0 +1,55 @@
+<!-- snippet:skip reason="kotlin.test is not on the snippet-runner classpath; the plugin-testing pattern documented here cannot compile under the runner's lightweight Kotlin profile. Run these tests from a real Gradle build." -->
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import dev.kreuzberg.kt.Kreuzberg
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertFailsWith
+import kotlin.test.assertTrue
+
+class MinLengthValidatorTest {
+    private fun defaultConfig(): ExtractionConfig = ExtractionConfig.builder().build()
+
+    private fun makeResult(content: String): ExtractionResult =
+        ExtractionResult.builder()
+            .content(content)
+            .mimeType("text/plain")
+            .metadata(Metadata.builder().build())
+            .tables(emptyList())
+            .processingWarnings(emptyList())
+            .build()
+
+    @Test
+    fun `validate accepts content above minimum length`() {
+        // Passes as long as validate() returns without throwing.
+        MinLengthValidator(minLength = 5).validate(makeResult("hello world"), defaultConfig())
+    }
+
+    @Test
+    fun `validate rejects content below minimum length`() {
+        val strict = MinLengthValidator(minLength = 100)
+        val tooShort = makeResult("too short")
+        assertFailsWith<IllegalStateException> {
+            strict.validate(tooShort, defaultConfig())
+        }
+    }
+
+    @Test
+    fun `priority and name are stable`() {
+        val probe = MinLengthValidator(minLength = 1)
+        assertEquals("min-length-validator", probe.name())
+        assertEquals(100, probe.priority())
+        assertTrue(probe.should_validate(makeResult(""), defaultConfig()))
+    }
+
+    @Test
+    fun `registration round-trip exposes the plugin in the listing`() {
+        ValidatorBridge.registerValidator(MinLengthValidator(minLength = 1))
+        try {
+            assertTrue(Kreuzberg.listValidators().contains("min-length-validator"))
+        } finally {
+            ValidatorBridge.unregisterValidator("min-length-validator")
+        }
+    }
+}
+```
--- a/docs/snippets/kotlin/plugins/plugin_validator.md
+++ b/docs/snippets/kotlin/plugins/plugin_validator.md
@@ -0,0 +1,44 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+
+// Generic validator pattern: every IValidator has the same shape.
+// `name()` keys the registry, `priority()` orders execution (higher = earlier),
+// `should_validate()` is a fast skip-check, and `validate()` throws on failure.
+class GenericValidator(
+    private val pluginName: String,
+    private val pluginPriority: Int,
+    private val check: (ExtractionResult, ExtractionConfig) -> Unit,
+) : IValidator {
+    override fun name(): String = pluginName
+    override fun version(): String = "1.0.0"
+    override fun priority(): Int = pluginPriority
+
+    override fun initialize() {
+        // Optional: open resources, load config files, etc.
+    }
+
+    override fun shutdown() {
+        // Optional: release resources held in initialize().
+    }
+
+    override fun should_validate(
+        _result: ExtractionResult,
+        _config: ExtractionConfig,
+    ): Boolean = true
+
+    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
+        // Explicit invoke(): a bare check(...) reads like the stdlib kotlin.check() precondition.
+        check.invoke(result, config)
+    }
+}
+
+fun registerGenericValidator() {
+    // require() throws IllegalArgumentException with this message when the content is blank.
+    val rejectBlank: (ExtractionResult, ExtractionConfig) -> Unit = { result, _ ->
+        require(result.content().isNotBlank()) { "Extracted content is blank" }
+    }
+    ValidatorBridge.registerValidator(
+        GenericValidator(pluginName = "non-empty-content", pluginPriority = 200, check = rejectBlank),
+    )
+}
+```
--- a/docs/snippets/kotlin/plugins/quality_score_validator.md
+++ b/docs/snippets/kotlin/plugins/quality_score_validator.md
@@ -0,0 +1,28 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+
+/** Fails results scoring below [threshold]; results without a quality score are skipped. */
+class QualityScoreValidator(private val threshold: Double = 0.5) : IValidator {
+    override fun name(): String = "quality-score-validator"
+    override fun version(): String = "1.0.0"
+    override fun priority(): Int = 50
+
+    override fun should_validate(
+        _result: ExtractionResult,
+        _config: ExtractionConfig,
+    ): Boolean = _result.qualityScore() != null
+
+    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
+        // No score means nothing to judge: mirror should_validate() instead of
+        // inventing a 0.0 that would be reported as a "too low" score.
+        val score = result.qualityScore() ?: return
+        if (score < threshold) {
+            throw IllegalStateException("Quality score too low: %.2f < %.2f".format(score, threshold))
+        }
+    }
+}
+
+fun registerQualityScoreValidator() {
+    QualityScoreValidator(threshold = 0.5).also { ValidatorBridge.registerValidator(it) }
+}
+```
--- a/docs/snippets/kotlin/plugins/stateful_plugin.md
+++ b/docs/snippets/kotlin/plugins/stateful_plugin.md
@@ -0,0 +1,47 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.util.concurrent.ConcurrentHashMap
+import java.util.concurrent.atomic.AtomicLong
+
+/** Post-processor that counts its calls and remembers the MIME type of the latest result. */
+class StatefulPlugin : IPostProcessor {
+    private val callCount = AtomicLong(0)
+    private val cache: ConcurrentHashMap<String, String> = ConcurrentHashMap()
+
+    override fun name(): String = "stateful-plugin"
+    override fun version(): String = "1.0.0"
+    override fun priority(): Int = 50
+    override fun processing_stage(): ProcessingStage = ProcessingStage.Middle
+    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L
+
+    fun callCount(): Long = callCount.get()
+    fun lastMime(): String? = cache["last_mime"]
+
+    override fun initialize() {
+        callCount.set(0)
+        cache.clear()
+    }
+
+    /** Prints the call count reached so far, then empties the cache. */
+    override fun shutdown() {
+        println("Plugin called ${callCount.get()} times")
+        cache.clear()
+    }
+
+    override fun should_process(
+        _result: ExtractionResult,
+        _config: ExtractionConfig,
+    ): Boolean = true
+
+    /** Bumps the counter and records the result's MIME type and the new count. */
+    override fun process(result: ExtractionResult, config: ExtractionConfig) {
+        val count = callCount.incrementAndGet()
+        cache.put("last_mime", result.mimeType())
+        cache.put("last_call", count.toString())
+    }
+}
+
+fun registerStatefulPlugin() {
+    StatefulPlugin().also { PostProcessorBridge.registerPostProcessor(it) }
+}
+```
--- a/docs/snippets/kotlin/plugins/unregister_plugins.md
+++ b/docs/snippets/kotlin/plugins/unregister_plugins.md
@@ -0,0 +1,12 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+
+fun unregisterPlugins() {
+    // Every plugin kind is removed through the static unregister helper of its bridge class.
+    // The argument is the string that the plugin returns from its name() method.
+    PostProcessorBridge.unregisterPostProcessor("word-count")
+    ValidatorBridge.unregisterValidator("min-length-validator")
+    OcrBackendBridge.unregisterOcrBackend("my-ocr-backend")
+    EmbeddingBackendBridge.unregisterEmbeddingBackend("my-embedder")
+}
+```
--- a/docs/snippets/kotlin/plugins/word_count_processor.md
+++ b/docs/snippets/kotlin/plugins/word_count_processor.md
@@ -0,0 +1,30 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+
+/** Post-processor that prints how many whitespace-separated words a result contains. */
+class WordCountProcessor : IPostProcessor {
+    override fun name(): String = "word-count"
+    override fun version(): String = "1.0.0"
+    override fun priority(): Int = 50
+    override fun processing_stage(): ProcessingStage = ProcessingStage.Early
+    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L
+
+    /** Skips results whose content is empty. */
+    override fun should_process(
+        _result: ExtractionResult,
+        _config: ExtractionConfig,
+    ): Boolean = _result.content().isNotEmpty()
+
+    override fun process(result: ExtractionResult, config: ExtractionConfig) {
+        // Leading/trailing whitespace yields empty tokens from split(); filter them out.
+        val wordCount = result.content().split(Regex("\\s+")).filter { it.isNotEmpty() }.size
+        // ExtractionResult is an immutable record on the Java side, so the count
+        // is only reported; the result itself is left untouched.
+        println("[word-count] ${result.mimeType()} -> $wordCount words")
+    }
+}
+
+fun registerWordCountProcessor() {
+    WordCountProcessor().also { PostProcessorBridge.registerPostProcessor(it) }
+}
+```
--- a/docs/snippets/kotlin/utils/chunking.md
+++ b/docs/snippets/kotlin/utils/chunking.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Chunking settings used below: max 1500 characters, overlap 200.
+    val chunkingOptions = ChunkingConfig.builder().withMaxCharacters(1500L).withOverlap(200L).build()
+    val extractionConfig = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunkingOptions))
+        .build()
+
+    val document = Paths.get("document.pdf")
+    val result = Kreuzberg.extractFileSync(document, null, extractionConfig)
+    // chunks() may be null, in which case the count is reported as 0.
+    val chunkCount = result.chunks()?.size ?: 0
+    println("Chunks: $chunkCount")
+}
+```
--- a/docs/snippets/kotlin/utils/chunking_rag.md
+++ b/docs/snippets/kotlin/utils/chunking_rag.md
@@ -0,0 +1,35 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Build the config inside-out: embedding -> chunking -> extraction.
+    val embeddingConfig = EmbeddingConfig.builder()
+        .withModel(EmbeddingModelType.Preset("balanced"))
+        .withNormalize(true)
+        .build()
+    val chunkingConfig = ChunkingConfig.builder()
+        .withMaxCharacters(500L)
+        .withOverlap(50L)
+        .withEmbedding(Optional.of(embeddingConfig))
+        .build()
+    val extractionConfig = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunkingConfig))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, extractionConfig)
+    // Walk the chunks; a null chunk list is treated as empty.
+    result.chunks().orEmpty().forEach { chunk ->
+        val info = chunk.metadata()
+        println("Chunk ${info.chunkIndex() + 1}/${info.totalChunks()}")
+        println("Position: ${info.byteStart()}-${info.byteEnd()}")
+        println("Content: ${chunk.content().take(100)}...")
+        // Only chunks that actually carry a vector print the dimension line.
+        val vector = chunk.embedding()
+        if (vector != null) {
+            println("Embedding: ${vector.size} dimensions")
+        }
+    }
+}
+```
--- a/docs/snippets/kotlin/utils/embedding_with_chunking.md
+++ b/docs/snippets/kotlin/utils/embedding_with_chunking.md
@@ -0,0 +1,27 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // Embeddings are configured first because the chunking config wraps them.
+    val embeddingConfig = EmbeddingConfig.builder()
+        .withModel(EmbeddingModelType.Preset("balanced"))
+        .withNormalize(true)
+        .withBatchSize(32L)
+        .withShowDownloadProgress(false)
+        .build()
+    val chunkingConfig = ChunkingConfig.builder()
+        .withMaxCharacters(1024L)
+        .withOverlap(100L)
+        .withEmbedding(Optional.of(embeddingConfig))
+        .build()
+    val extractionConfig = ExtractionConfig.builder().withChunking(Optional.of(chunkingConfig)).build()
+
+    val source = Paths.get("document.pdf")
+    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)
+    // A null chunk list is reported as zero chunks.
+    val chunkCount = result.chunks()?.size ?: 0
+    println("Chunks with embeddings: $chunkCount")
+}
+```
--- a/docs/snippets/kotlin/utils/keyword_extraction_example.md
+++ b/docs/snippets/kotlin/utils/keyword_extraction_example.md
@@ -0,0 +1,22 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // YAKE algorithm, at most 10 keywords, minimum score 0.3.
+    val keywordConfig = KeywordConfig.builder()
+        .withAlgorithm(KeywordAlgorithm.Yake)
+        .withMaxKeywords(10L)
+        .withMinScore(0.3f)
+        .build()
+    val extractionConfig = ExtractionConfig.builder()
+        .withKeywords(Optional.of(keywordConfig))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, extractionConfig)
+    // Nothing is printed when extractedKeywords() is null.
+    val extracted = result.extractedKeywords() ?: return
+    println("Keywords: $extracted")
+}
+```
--- a/docs/snippets/kotlin/utils/quality_processing_example.md
+++ b/docs/snippets/kotlin/utils/quality_processing_example.md
@@ -0,0 +1,22 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val qualityConfig = ExtractionConfig.builder()
+        .withEnableQualityProcessing(true)
+        .build()
+    val scanned = Paths.get("scanned_document.pdf")
+    val result = Kreuzberg.extractFileSync(scanned, null, qualityConfig)
+
+    // Without a score there is nothing to report.
+    val score = result.qualityScore() ?: return
+    // Scores under 0.5 get the warning wording; everything else the plain one.
+    val report = when {
+        score < 0.5 -> "Warning: Low quality extraction (%.2f)".format(score)
+        else -> "Quality score: %.2f".format(score)
+    }
+    println(report)
+}
+```
--- a/docs/snippets/kotlin/utils/standalone_embed.md
+++ b/docs/snippets/kotlin/utils/standalone_embed.md
@@ -0,0 +1,17 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.util.Optional
+
+fun main() {
+    val embeddingConfig = EmbeddingConfig.builder()
+        .withModel(EmbeddingModelType.Preset("balanced"))
+        .withNormalize(true)
+        .build()
+
+    // No document involved: the two strings are embedded directly.
+    val embeddings = Kreuzberg.embedTexts(listOf("Hello, world!", "Kreuzberg is fast"), embeddingConfig)
+
+    println("Texts embedded: ${embeddings.size}")
+    println("Dimensions: ${embeddings[0].size}")
+}
+```
--- a/docs/snippets/kotlin/utils/token_reduction.md
+++ b/docs/snippets/kotlin/utils/token_reduction.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    // "moderate" reduction mode, with the preserve-important-words flag switched on.
+    val reductionOptions = TokenReductionOptions.builder()
+        .withMode("moderate")
+        .withPreserveImportantWords(true)
+        .build()
+    val extractionConfig = ExtractionConfig.builder()
+        .withTokenReduction(Optional.of(reductionOptions))
+        .build()
+
+    val extractedText = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig).content()
+    println(extractedText)
+}
+```
--- a/docs/snippets/kotlin/utils/token_reduction_example.md
+++ b/docs/snippets/kotlin/utils/token_reduction_example.md
@@ -0,0 +1,19 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+fun main() {
+    val reductionOptions = TokenReductionOptions.builder()
+        .withMode("moderate")
+        .withPreserveImportantWords(true)
+        .build()
+    val extractionConfig = ExtractionConfig.builder()
+        .withTokenReduction(Optional.of(reductionOptions))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get("verbose_document.pdf"), null, extractionConfig)
+    // Report only the size of the extracted text, not the text itself.
+    println("Reduced content length: ${result.content().length}")
+}
+```
--- a/docs/snippets/kotlin/utils/vector_database_integration.md
+++ b/docs/snippets/kotlin/utils/vector_database_integration.md
@@ -0,0 +1,52 @@
+```kotlin title="Kotlin"
+import dev.kreuzberg.*
+import java.nio.file.Paths
+import java.util.Optional
+
+data class VectorRecord( // one row destined for a vector store
+    val id: String, // extractAndVectorize builds it as "<documentId>_chunk_<index>"
+    val content: String, // the chunk text
+    val embedding: List<Float>, // the chunk's embedding vector
+    val metadata: Map<String, String> // extractAndVectorize fills document_id, chunk_index, content_length
+)
+
+/**
+ * Extracts [documentPath] with chunking and embeddings enabled and turns every chunk that
+ * carries an embedding into a [VectorRecord] tagged with [documentId].
+ */
+fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
+    val embeddingConfig = EmbeddingConfig.builder()
+        .withModel(EmbeddingModelType.Preset("balanced"))
+        .withNormalize(true)
+        .withBatchSize(32L)
+        .build()
+
+    val chunkingConfig = ChunkingConfig.builder()
+        .withMaxCharacters(512L)
+        .withOverlap(50L)
+        .withEmbedding(Optional.of(embeddingConfig))
+        .build()
+
+    val extractionConfig = ExtractionConfig.builder()
+        .withChunking(Optional.of(chunkingConfig))
+        .build()
+
+    val result = Kreuzberg.extractFileSync(Paths.get(documentPath), null, extractionConfig)
+
+    // Chunks without an embedding are dropped; `index` is still the position in the full list.
+    return result.chunks().orEmpty().mapIndexedNotNull { index, chunk ->
+        chunk.embedding()?.let { vector ->
+            VectorRecord(
+                id = "${documentId}_chunk_$index",
+                content = chunk.content(),
+                embedding = vector,
+                metadata = mapOf(
+                    "document_id" to documentId,
+                    "chunk_index" to index.toString(),
+                    "content_length" to chunk.content().length.toString(),
+                ),
+            )
+        }
+    }
+}
+```