This commit is contained in:
32
docs/snippets/kotlin/advanced/chunk_page_mapping.md
Normal file
32
docs/snippets/kotlin/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with chunking and page extraction enabled and prints each chunk's page range. */
fun main() {
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .build()
    val pageConfig = PageConfig.builder()
        .withExtractPages(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .withPages(Optional.of(pageConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    for (chunk in extraction.chunks().orEmpty()) {
        val firstPage = chunk.metadata().firstPage()
        val lastPage = chunk.metadata().lastPage()
        // Only chunks that report both page bounds are printed.
        if (firstPage != null && lastPage != null) {
            val pageRange = if (firstPage != lastPage) "Pages $firstPage-$lastPage" else "Page $firstPage"
            println("Chunk: ${chunk.content().take(50)}... ($pageRange)")
        }
    }
}
|
||||
```
|
||||
60
docs/snippets/kotlin/advanced/chunking_config.md
Normal file
60
docs/snippets/kotlin/advanced/chunking_config.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with chunking enabled (max characters 1000, overlap 200) and prints the chunk count. */
fun main() {
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    // A missing chunk list is reported as zero chunks.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
|
||||
```
|
||||
|
||||
```kotlin title="Kotlin - Semantic"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with the semantic chunker selected and prints the chunk count. */
fun main() {
    val semanticChunking = ChunkingConfig.builder()
        .withChunkerType(ChunkerType.Semantic)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(semanticChunking))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    // A missing chunk list is reported as zero chunks.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
|
||||
```
|
||||
|
||||
```kotlin title="Kotlin - Prepend Heading Context"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.md` with the Markdown chunker and `prependHeadingContext` enabled, then prints the chunk count. */
fun main() {
    val markdownChunking = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withChunkerType(ChunkerType.Markdown)
        .withPrependHeadingContext(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(markdownChunking))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.md"), null, extractionConfig)
    // A missing chunk list is reported as zero chunks.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
|
||||
```
|
||||
35
docs/snippets/kotlin/advanced/chunking_rag.md
Normal file
35
docs/snippets/kotlin/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `research_paper.pdf` with chunking and embeddings enabled, then prints the
 * index, byte range, a 100-character preview and (when present) the embedding size of each chunk.
 */
fun main() {
    val embedding = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .build()

    val chunking = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embedding))
        .build()

    val config = ExtractionConfig.builder()
        .withChunking(Optional.of(chunking))
        .build()

    val result = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, config)
    for (chunk in result.chunks().orEmpty()) {
        val metadata = chunk.metadata()
        // chunkIndex() is printed one-based for readability.
        println("Chunk ${metadata.chunkIndex() + 1}/${metadata.totalChunks()}")
        println("Position: ${metadata.byteStart()}-${metadata.byteEnd()}")

        val preview = chunk.content().take(100)
        println("Content: $preview...")

        // The lambda parameter is named `vector` so it does not shadow the `embedding` config above.
        chunk.embedding()?.let { vector ->
            println("Embedding: ${vector.size} dimensions")
        }
    }
}
|
||||
```
|
||||
27
docs/snippets/kotlin/advanced/embedding_with_chunking.md
Normal file
27
docs/snippets/kotlin/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with chunking plus an embedding configuration and prints the chunk count. */
fun main() {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .withShowDownloadProgress(false)
        .build()
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1024L)
        .withOverlap(100L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    // A missing chunk list is reported as zero chunks.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks with embeddings: $chunkCount")
}
|
||||
```
|
||||
22
docs/snippets/kotlin/advanced/keyword_extraction_config.md
Normal file
22
docs/snippets/kotlin/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with a fully specified keyword configuration and prints the extracted keywords. */
fun main() {
    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.3f)
        .withNgramRange(listOf(1L, 3L))
        .withLanguage(Optional.of("en"))
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val extractedKeywords = extraction.extractedKeywords()
    println("Keywords: $extractedKeywords")
}
|
||||
```
|
||||
22
docs/snippets/kotlin/advanced/keyword_extraction_example.md
Normal file
22
docs/snippets/kotlin/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `research_paper.pdf` with keyword extraction and prints the keywords when any are returned. */
fun main() {
    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.3f)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, extractionConfig)
    // Nothing is printed when the result carries no keywords.
    val extracted = extraction.extractedKeywords()
    if (extracted != null) {
        println("Keywords: $extracted")
    }
}
|
||||
```
|
||||
20
docs/snippets/kotlin/advanced/language_detection_config.md
Normal file
20
docs/snippets/kotlin/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with language detection enabled (single language) and prints the detected languages. */
fun main() {
    val detectionConfig = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(false)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(detectionConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val detected = extraction.detectedLanguages()
    println("Detected languages: $detected")
}
|
||||
```
|
||||
@@ -0,0 +1,20 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `multilingual_document.pdf` with multi-language detection enabled and prints the detected languages. */
fun main() {
    val detectionConfig = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(detectionConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("multilingual_document.pdf"), null, extractionConfig)
    val detected = extraction.detectedLanguages()
    println("Detected languages: $detected")
}
|
||||
```
|
||||
14
docs/snippets/kotlin/advanced/quality_processing_config.md
Normal file
14
docs/snippets/kotlin/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with quality processing enabled and prints the reported quality score. */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withEnableQualityProcessing(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val qualityScore = extraction.qualityScore()
    println("Quality score: $qualityScore")
}
|
||||
```
|
||||
22
docs/snippets/kotlin/advanced/quality_processing_example.md
Normal file
22
docs/snippets/kotlin/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `scanned_document.pdf` with quality processing enabled and prints the score,
 * prefixed with a warning when it is below 0.5. Prints nothing when no score is reported.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withEnableQualityProcessing(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned_document.pdf"), null, extractionConfig)

    // No score available: there is nothing to report.
    val qualityScore = extraction.qualityScore() ?: return
    val message = if (qualityScore < 0.5) {
        "Warning: Low quality extraction (%.2f)".format(qualityScore)
    } else {
        "Quality score: %.2f".format(qualityScore)
    }
    println(message)
}
|
||||
```
|
||||
19
docs/snippets/kotlin/advanced/token_reduction_config.md
Normal file
19
docs/snippets/kotlin/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with token reduction in "moderate" mode and prints the resulting content. */
fun main() {
    val reductionOptions = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(reductionOptions))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val reducedContent = extraction.content()
    println("Reduced content: $reducedContent")
}
|
||||
```
|
||||
19
docs/snippets/kotlin/advanced/token_reduction_example.md
Normal file
19
docs/snippets/kotlin/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `verbose_document.pdf` with token reduction in "moderate" mode and prints the content length. */
fun main() {
    val reductionOptions = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(reductionOptions))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("verbose_document.pdf"), null, extractionConfig)
    val reducedLength = extraction.content().length
    println("Reduced content length: $reducedLength")
}
|
||||
```
|
||||
52
docs/snippets/kotlin/advanced/vector_database_integration.md
Normal file
52
docs/snippets/kotlin/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * One chunk of a document prepared for insertion into a vector store.
 *
 * @property id record identifier
 * @property content text of the chunk
 * @property embedding embedding vector of the chunk
 * @property metadata string key/value pairs stored alongside the vector
 */
data class VectorRecord(
    val id: String,
    val content: String,
    val embedding: List<Float>,
    val metadata: Map<String, String>,
)
|
||||
|
||||
/**
 * Extracts [documentPath] with chunking and embeddings enabled and converts every chunk
 * that carries an embedding into a [VectorRecord].
 *
 * @param documentPath path of the document to extract
 * @param documentId identifier used in record ids (`<documentId>_chunk_<index>`) and in the metadata
 * @return one record per chunk that has an embedding; chunks without one are skipped,
 *   but still consume an index
 */
fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
    val embedding = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .build()

    val chunking = ChunkingConfig.builder()
        .withMaxCharacters(512L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embedding))
        .build()

    val config = ExtractionConfig.builder()
        .withChunking(Optional.of(chunking))
        .build()

    val result = Kreuzberg.extractFileSync(Paths.get(documentPath), null, config)

    // `index` is the chunk's position among all chunks, so ids stay stable even when
    // some chunks have no embedding and are dropped.
    return result.chunks().orEmpty().mapIndexedNotNull { index, chunk ->
        chunk.embedding()?.let { vector ->
            VectorRecord(
                id = "${documentId}_chunk_$index",
                content = chunk.content(),
                embedding = vector,
                metadata = mapOf(
                    "document_id" to documentId,
                    "chunk_index" to index.toString(),
                    "content_length" to chunk.content().length.toString(),
                ),
            )
        }
    }
}
|
||||
```
|
||||
16
docs/snippets/kotlin/api/batch_extract_bytes_sync.md
Normal file
16
docs/snippets/kotlin/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/** Extracts two in-memory documents in one batch call and prints the content length of each result. */
fun main() {
    val extractionConfig = ExtractionConfig.builder().build()
    val plainText = BatchBytesItem("Hello, world!".toByteArray(), "text/plain", null)
    val markdown = BatchBytesItem("# Heading\n\nParagraph text.".toByteArray(), "text/markdown", null)

    val extractions = Kreuzberg.batchExtractBytesSync(listOf(plainText, markdown), extractionConfig)

    for ((position, extraction) in extractions.withIndex()) {
        println("Item $position: ${extraction.content().length} chars")
    }
}
|
||||
```
|
||||
18
docs/snippets/kotlin/api/batch_extract_files_sync.md
Normal file
18
docs/snippets/kotlin/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/** Extracts three files in one batch call and prints the content length of each result. */
fun main() {
    val extractionConfig = ExtractionConfig.builder().build()
    val batch = listOf("doc1.pdf", "doc2.docx", "report.pdf")
        .map { fileName -> BatchFileItem(Paths.get(fileName), null) }

    val extractions = Kreuzberg.batchExtractFilesSync(batch, extractionConfig)

    for ((position, extraction) in extractions.withIndex()) {
        println("File $position: ${extraction.content().length} chars")
    }
}
|
||||
```
|
||||
31
docs/snippets/kotlin/api/client_chunk_text.md
Normal file
31
docs/snippets/kotlin/api/client_chunk_text.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```kotlin title="Kotlin"
|
||||
import java.net.URI
|
||||
import java.net.http.HttpClient
|
||||
import java.net.http.HttpRequest
|
||||
import java.net.http.HttpResponse
|
||||
|
||||
/** Posts a chunking request as JSON to `http://localhost:8000/chunk` and prints the status code and body. */
fun main() {
    val payload = """
        {
          "text": "Your long text here...",
          "chunker_type": "text",
          "config": {
            "max_characters": 1000,
            "overlap": 50,
            "trim": true
          }
        }
    """.trimIndent()

    val chunkRequest = HttpRequest.newBuilder()
        .uri(URI.create("http://localhost:8000/chunk"))
        .header("Content-Type", "application/json")
        .POST(HttpRequest.BodyPublishers.ofString(payload))
        .build()

    // Blocking send; the response body is read fully as a String.
    val httpClient = HttpClient.newHttpClient()
    val chunkResponse = httpClient.send(chunkRequest, HttpResponse.BodyHandlers.ofString())
    println("Status: ${chunkResponse.statusCode()}")
    println(chunkResponse.body())
}
|
||||
```
|
||||
38
docs/snippets/kotlin/api/client_extract_single_file.md
Normal file
38
docs/snippets/kotlin/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```kotlin title="Kotlin"
|
||||
import java.net.URI
|
||||
import java.net.http.HttpClient
|
||||
import java.net.http.HttpRequest
|
||||
import java.net.http.HttpResponse
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Paths
|
||||
|
||||
/**
 * Uploads `document.pdf` to `http://localhost:8000/extract` as a single-part
 * `multipart/form-data` request and prints the response body.
 */
fun main() {
    val client = HttpClient.newHttpClient()
    val path = Paths.get("document.pdf")
    val bytes = Files.readAllBytes(path)
    val fileName = path.fileName.toString()

    // The boundary only has to be a token that does not occur in the payload.
    val boundary = "----KreuzbergBoundary${System.currentTimeMillis()}"
    val crlf = "\r\n"
    val header = (
        "--$boundary$crlf" +
            "Content-Disposition: form-data; name=\"file\"; filename=\"$fileName\"$crlf" +
            "Content-Type: application/pdf$crlf$crlf"
        ).toByteArray()
    val footer = "$crlf--$boundary--$crlf".toByteArray()

    // ByteArray `+` concatenates, replacing manual offset arithmetic with System.arraycopy.
    val body = header + bytes + footer

    val request = HttpRequest.newBuilder()
        .uri(URI.create("http://localhost:8000/extract"))
        .header("Content-Type", "multipart/form-data; boundary=$boundary")
        .POST(HttpRequest.BodyPublishers.ofByteArray(body))
        .build()

    val response = client.send(request, HttpResponse.BodyHandlers.ofString())
    println(response.body())
}
|
||||
```
|
||||
45
docs/snippets/kotlin/api/combining_all_features.md
Normal file
45
docs/snippets/kotlin/api/combining_all_features.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `report.pdf` with OCR, Markdown chunking, image extraction, document structure,
 * caching and quality processing configured together, then prints a summary of the result.
 */
fun main() {
    val ocr = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()

    val chunking = ChunkingConfig.builder()
        .withMaxCharacters(800L)
        .withOverlap(100L)
        // Same constant spelling as the other chunking snippets (ChunkerType.Markdown / ChunkerType.Semantic).
        .withChunkerType(ChunkerType.Markdown)
        .withPrependHeadingContext(true)
        .build()

    val images = ImageExtractionConfig.builder()
        .withExtractImages(true)
        .build()

    val config = ExtractionConfig.builder()
        .withOcr(Optional.of(ocr))
        .withForceOcr(false)
        .withChunking(Optional.of(chunking))
        .withOutputFormat(OutputFormat.MARKDOWN)
        .withIncludeDocumentStructure(true)
        .withImages(Optional.of(images))
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .build()

    val result = Kreuzberg.extractFileSync(Paths.get("report.pdf"), null, config)

    val content = result.content()
    println("Content (${content.length} chars):")
    println(content.take(200))

    // Optional parts of the result are printed only when present; tables default to a count of 0.
    result.chunks()?.let { println("\nChunks: ${it.size}") }
    println("Tables: ${result.tables()?.size ?: 0}")
    result.detectedLanguages()?.let { println("Languages: $it") }
    result.extractionMethod()?.let { println("Extraction method: $it") }
}
|
||||
```
|
||||
17
docs/snippets/kotlin/api/error_handling.md
Normal file
17
docs/snippets/kotlin/api/error_handling.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/**
 * Extracts `document.pdf` and prints its content; a [KreuzbergRsException] is reported with its
 * message and code, any other exception with its message only.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder().build()
    try {
        val content = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig).content()
        println(content)
    } catch (extractionError: KreuzbergRsException) {
        // Library failure: report both the message and the error code.
        System.err.println("Extraction failed: ${extractionError.message}")
        System.err.println("Error code: ${extractionError.code}")
    } catch (unexpected: Exception) {
        System.err.println("Unexpected error: ${unexpected.message}")
    }
}
|
||||
```
|
||||
28
docs/snippets/kotlin/api/error_handling_extract.md
Normal file
28
docs/snippets/kotlin/api/error_handling_extract.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Paths
|
||||
|
||||
/**
 * Extracts the text content of an in-memory document using a default [ExtractionConfig].
 *
 * @param bytes raw document bytes
 * @param mimeType MIME type passed to the extractor
 * @return the extracted content
 */
fun extractText(bytes: ByteArray, mimeType: String): String =
    Kreuzberg.extractBytesSync(bytes, mimeType, ExtractionConfig.builder().build()).content()
|
||||
|
||||
/**
 * Reads `document.pdf`, extracts its text with [extractText] and prints the character count.
 * A failure to read the file is reported and ends the program; extraction failures are
 * reported with their error code.
 */
fun main() {
    val bytes = try {
        Files.readAllBytes(Paths.get("document.pdf"))
    } catch (e: Exception) {
        // Report the read failure instead of continuing with an empty payload, which would
        // only surface later as a misleading extraction error.
        System.err.println("Failed to read document.pdf: ${e.message}")
        return
    }

    try {
        val text = extractText(bytes, "application/pdf")
        println("Extracted ${text.length} chars")
    } catch (e: KreuzbergRsException) {
        System.err.println("Extraction error (code=${e.code}): ${e.message}")
    } catch (e: Exception) {
        System.err.println("Unexpected error: ${e.message}")
    }
}
|
||||
```
|
||||
16
docs/snippets/kotlin/api/extract_bytes_async.md
Normal file
16
docs/snippets/kotlin/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Paths
|
||||
|
||||
/** Reads `document.pdf` into memory, extracts it through the suspending API and prints content and table count. */
fun main() = runBlocking {
    val pdfBytes = Files.readAllBytes(Paths.get("document.pdf"))
    val extraction = Kreuzberg.extractBytes(pdfBytes, "application/pdf", ExtractionConfig.builder().build())

    println(extraction.content())
    // A missing table list is reported as zero tables.
    val tableCount = extraction.tables()?.size ?: 0
    println("Tables: $tableCount")
}
|
||||
```
|
||||
14
docs/snippets/kotlin/api/extract_bytes_sync.md
Normal file
14
docs/snippets/kotlin/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Paths
|
||||
|
||||
/** Reads `document.pdf` into memory, extracts it synchronously and prints content and table count. */
fun main() {
    val pdfBytes = Files.readAllBytes(Paths.get("document.pdf"))
    val extraction = Kreuzberg.extractBytesSync(pdfBytes, "application/pdf", ExtractionConfig.builder().build())

    println(extraction.content())
    // A missing table list is reported as zero tables.
    val tableCount = extraction.tables()?.size ?: 0
    println("Tables: $tableCount")
}
|
||||
```
|
||||
15
docs/snippets/kotlin/api/extract_file_async.md
Normal file
15
docs/snippets/kotlin/api/extract_file_async.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import java.nio.file.Paths
|
||||
|
||||
/** Extracts `document.pdf` through the suspending API and prints content, MIME type and table count. */
fun main() = runBlocking {
    val extraction = Kreuzberg.extractFile(Paths.get("document.pdf"), null, ExtractionConfig.builder().build())

    println(extraction.content())
    println("MIME type: ${extraction.mimeType()}")
    // A missing table list is reported as zero tables.
    val tableCount = extraction.tables()?.size ?: 0
    println("Tables: $tableCount")
}
|
||||
```
|
||||
13
docs/snippets/kotlin/api/extract_file_sync.md
Normal file
13
docs/snippets/kotlin/api/extract_file_sync.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/** Extracts `document.pdf` synchronously and prints content, MIME type and table count. */
fun main() {
    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, ExtractionConfig.builder().build())

    println(extraction.content())
    println("MIME type: ${extraction.mimeType()}")
    // A missing table list is reported as zero tables.
    val tableCount = extraction.tables()?.size ?: 0
    println("Tables: $tableCount")
}
|
||||
```
|
||||
63
docs/snippets/kotlin/config/advanced_config.md
Normal file
63
docs/snippets/kotlin/config/advanced_config.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Builds an extraction configuration that combines OCR, chunking with embeddings, language
 * detection, keyword extraction, token reduction and post-processing, extracts `document.pdf`
 * and prints the content, detected languages (when present) and chunk count.
 */
fun main() {
    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()

    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withBatchSize(32L)
        .withNormalize(true)
        .build()

    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()

    val detectionConfig = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(false)
        .build()

    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.1f)
        .withNgramRange(listOf(1L, 3L))
        .withLanguage(Optional.of("en"))
        .build()

    val reductionOptions = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()

    val postProcessorConfig = PostProcessorConfig.builder()
        .withEnabled(true)
        .build()

    // Every sub-configuration is attached to the top-level config wrapped in an Optional.
    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .withOcr(Optional.of(ocrConfig))
        .withChunking(Optional.of(chunkingConfig))
        .withLanguageDetection(Optional.of(detectionConfig))
        .withKeywords(Optional.of(keywordConfig))
        .withTokenReduction(Optional.of(reductionOptions))
        .withPostprocessor(Optional.of(postProcessorConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    println("Content: ${extraction.content()}")

    val languages = extraction.detectedLanguages()
    if (languages != null) {
        println("Languages: $languages")
    }

    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
|
||||
```
|
||||
81
docs/snippets/kotlin/config/chunking_config.md
Normal file
81
docs/snippets/kotlin/config/chunking_config.md
Normal file
@@ -0,0 +1,81 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with chunking enabled and prints the chunk count followed by each chunk's length. */
fun main() {
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    // A missing chunk list is treated as empty.
    val chunkList = extraction.chunks().orEmpty()
    println("Chunks: ${chunkList.size}")
    chunkList.forEach { chunk ->
        println("Length: ${chunk.content().length}")
    }
}
|
||||
```
|
||||
|
||||
```kotlin title="Kotlin - Markdown with Heading Context"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.md` with the Markdown chunker and tokenizer-based sizing, then prints the
 * heading context (when present) and a 100-character preview of every chunk.
 */
fun main() {
    val tokenizerSizing = ChunkSizing.Tokenizer("Xenova/gpt-4o", Optional.empty())

    val markdownChunking = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withChunkerType(ChunkerType.Markdown)
        .withSizing(tokenizerSizing)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(markdownChunking))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.md"), null, extractionConfig)
    for (chunk in extraction.chunks().orEmpty()) {
        // Heading lines are printed only for chunks that carry a heading context.
        val headingContext = chunk.metadata()?.headingContext()
        if (headingContext != null) {
            for (heading in headingContext.headings()) {
                println("Heading L${heading.level()}: ${heading.text()}")
            }
        }
        val preview = chunk.content().take(100)
        println("Content: $preview...")
    }
}
|
||||
```
|
||||
|
||||
```kotlin title="Kotlin - Prepend Heading Context"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.md` with `prependHeadingContext` enabled and prints a 100-character preview of each chunk. */
fun main() {
    val markdownChunking = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withChunkerType(ChunkerType.Markdown)
        .withPrependHeadingContext(true)
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(markdownChunking))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.md"), null, extractionConfig)
    extraction.chunks().orEmpty().forEach { chunk ->
        // Each chunk's content is prefixed with its heading breadcrumb
        val preview = chunk.content().take(100)
        println("Content: $preview...")
    }
}
|
||||
```
|
||||
15
docs/snippets/kotlin/config/config_basic.md
Normal file
15
docs/snippets/kotlin/config/config_basic.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with caching and quality processing enabled and prints the content. */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val content = extraction.content()
    println(content)
}
|
||||
```
|
||||
17
docs/snippets/kotlin/config/config_discover.md
Normal file
17
docs/snippets/kotlin/config/config_discover.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Builds a configuration from the builder defaults with caching enabled, extracts `document.pdf` and prints the content. */
fun main() {
    // Java/Kotlin bindings construct configuration explicitly via the builder.
    // This is the counterpart of ExtractionConfig::discover() in Rust: start from
    // the defaults and override only the fields you need.
    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val content = extraction.content()
    println(content)
}
|
||||
```
|
||||
20
docs/snippets/kotlin/config/config_ocr.md
Normal file
20
docs/snippets/kotlin/config/config_ocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `scanned.pdf` with the "tesseract" OCR backend (language "eng") and prints content length and table count. */
fun main() {
    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()
    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extractionConfig)
    val contentLength = extraction.content().length
    // A missing table list is reported as zero tables.
    val tableCount = extraction.tables()?.size ?: 0
    println("Content length: $contentLength")
    println("Tables detected: $tableCount")
}
|
||||
```
|
||||
32
docs/snippets/kotlin/config/config_programmatic.md
Normal file
32
docs/snippets/kotlin/config/config_programmatic.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Builds a configuration entirely in code (Tesseract options, OCR, chunking, caching and
 * quality processing), extracts `document.pdf` and prints the content length.
 */
fun main() {
    val tesseractConfig = TesseractConfig.builder()
        .withPsm(6)
        .build()

    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng+deu")
        .withTesseractConfig(Optional.of(tesseractConfig))
        .build()

    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .withOcr(Optional.of(ocrConfig))
        .withChunking(Optional.of(chunkingConfig))
        .withEnableQualityProcessing(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val contentLength = extraction.content().length
    println("Content length: $contentLength")
}
|
||||
```
|
||||
19
docs/snippets/kotlin/config/document_structure_config.md
Normal file
19
docs/snippets/kotlin/config/document_structure_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with document-structure output enabled and prints
 * every node of the returned document, if one is present.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withIncludeDocumentStructure(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)

    // document() may be null; nothing is printed in that case.
    extraction.document()?.let { structure ->
        structure.nodes().forEach { node -> println(node) }
    }
}
|
||||
```
|
||||
32
docs/snippets/kotlin/config/element_based_output.md
Normal file
32
docs/snippets/kotlin/config/element_based_output.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` in element-based result format, prints each element
 * (type, first 100 characters of text, page number when available), then
 * prints the text of every element whose type is `ElementType.Title`.
 */
fun main() {
    // resultFormat selects between Unified and ElementBased output.
    val extractionConfig = ExtractionConfig.builder()
        .withResultFormat(ResultFormat.ElementBased)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val allElements = extraction.elements().orEmpty()

    allElements.forEach { item ->
        println("Type: ${item.elementType()}")
        println("Text: ${item.text().take(100)}")

        // The page number is optional metadata; skip the line when it is absent.
        val page = item.metadata().pageNumber()
        if (page != null) {
            println("Page: $page")
        }
        println("---")
    }

    // Second pass: only the title elements.
    allElements
        .filter { it.elementType() == ElementType.Title }
        .forEach { heading -> println("Title: ${heading.text()}") }
}
|
||||
```
|
||||
27
docs/snippets/kotlin/config/embedding_config.md
Normal file
27
docs/snippets/kotlin/config/embedding_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configures chunking with an embedding model preset (`balanced`), extracts
 * `document.pdf`, and prints how many chunks were returned.
 */
fun main() {
    val embeddingSettings = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withBatchSize(16L)
        .withNormalize(true)
        .withShowDownloadProgress(true)
        .build()

    // The embedding config is attached to the chunking config, which is in
    // turn attached to the extraction config.
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(
            Optional.of(
                ChunkingConfig.builder()
                    .withMaxCharacters(1000L)
                    .withOverlap(200L)
                    .withEmbedding(Optional.of(embeddingSettings))
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks with embeddings: $chunkCount")
}
|
||||
```
|
||||
19
docs/snippets/kotlin/config/html_output.md
Normal file
19
docs/snippets/kotlin/config/html_output.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with HTML output format and the `GitHub` HTML theme,
 * then prints the resulting content.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withOutputFormat(OutputFormat.Html)
        .withHtmlOutput(
            Optional.of(
                HtmlOutputConfig.builder()
                    .withTheme(HtmlTheme.GitHub)
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)

    // HTML with kb-* classes
    println(extraction.content())
}
|
||||
```
|
||||
22
docs/snippets/kotlin/config/keyword_extraction_config.md
Normal file
22
docs/snippets/kotlin/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configures YAKE keyword extraction, extracts `document.pdf`, and prints the
 * extracted keywords.
 */
fun main() {
    // Passed to withNgramRange as a two-element list.
    val ngramRange = listOf(1L, 3L)

    val keywordSettings = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.1f)
        .withNgramRange(ngramRange)
        .withLanguage(Optional.of("en"))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)
    println("Keywords: ${extraction.extractedKeywords()}")
}
|
||||
```
|
||||
20
docs/snippets/kotlin/config/language_detection_config.md
Normal file
20
docs/snippets/kotlin/config/language_detection_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Enables language detection (minimum confidence 0.8, multiple languages),
 * extracts `document.pdf`, and prints the detected languages.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withLanguageDetection(
            Optional.of(
                LanguageDetectionConfig.builder()
                    .withEnabled(true)
                    .withMinConfidence(0.8)
                    .withDetectMultiple(true)
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val languages = extraction.detectedLanguages()
    println("Detected languages: $languages")
}
|
||||
```
|
||||
23
docs/snippets/kotlin/config/ocr_dpi_config.md
Normal file
23
docs/snippets/kotlin/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configures image extraction with DPI settings (target 300, auto-adjust
 * between 150 and 600, max dimension 4096), extracts `document.pdf`, and
 * prints the number of extracted images.
 */
fun main() {
    val imageSettings = ImageExtractionConfig.builder()
        .withExtractImages(true)
        .withTargetDpi(300)
        .withMaxImageDimension(4096)
        .withAutoAdjustDpi(true)
        .withMinDpi(150)
        .withMaxDpi(600)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withImages(Optional.of(imageSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // images() may be null; report zero in that case.
    val imageCount = extraction.images()?.size ?: 0
    println("Extracted images: $imageCount")
}
|
||||
```
|
||||
26
docs/snippets/kotlin/config/pdf_config.md
Normal file
26
docs/snippets/kotlin/config/pdf_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configures PDF options (image extraction, a password list, metadata
 * extraction, hierarchy enabled), extracts `encrypted.pdf`, and prints the
 * title and authors from the result metadata.
 */
fun main() {
    val candidatePasswords = listOf("password123")

    val pdfSettings = PdfConfig.builder()
        .withExtractImages(true)
        .withPasswords(Optional.of(candidatePasswords))
        .withExtractMetadata(true)
        .withHierarchy(Optional.of(HierarchyConfig.builder().withEnabled(true).build()))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withPdfOptions(Optional.of(pdfSettings))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("encrypted.pdf"), null, extractionConfig)

    val documentMetadata = extraction.metadata()
    println("Title: ${documentMetadata.title()}")
    println("Authors: ${documentMetadata.authors()}")
}
|
||||
```
|
||||
26
docs/snippets/kotlin/config/pdf_hierarchy_config.md
Normal file
26
docs/snippets/kotlin/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configures PDF hierarchy detection (5 clusters, bounding boxes, OCR coverage
 * threshold 0.8), extracts `document.pdf`, and prints the number of pages in
 * the result.
 */
fun main() {
    val hierarchySettings = HierarchyConfig.builder()
        .withEnabled(true)
        .withKClusters(5L)
        .withIncludeBbox(true)
        .withOcrCoverageThreshold(Optional.of(0.8f))
        .build()

    // Hierarchy settings live under the PDF options of the extraction config.
    val extractionConfig = ExtractionConfig.builder()
        .withPdfOptions(
            Optional.of(
                PdfConfig.builder()
                    .withHierarchy(Optional.of(hierarchySettings))
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val pageList = extraction.pages().orEmpty()
    println("Pages: ${pageList.size}")
}
|
||||
```
|
||||
22
docs/snippets/kotlin/config/postprocessor_config.md
Normal file
22
docs/snippets/kotlin/config/postprocessor_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Enables post-processing restricted to two named processors, extracts
 * `document.pdf`, and prints the processed content.
 */
fun main() {
    // Names of the processors passed to withEnabledProcessors.
    val processorNames = listOf(
        "whitespace_normalizer",
        "unicode_normalizer",
    )

    val postprocessorSettings = PostProcessorConfig.builder()
        .withEnabled(true)
        .withEnabledProcessors(Optional.of(processorNames))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withPostprocessor(Optional.of(postprocessorSettings))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    println("Processed content: ${extraction.content()}")
}
|
||||
```
|
||||
16
docs/snippets/kotlin/config/quality_processing_config.md
Normal file
16
docs/snippets/kotlin/config/quality_processing_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with quality processing and caching enabled, then
 * prints the quality score and the number of processing warnings.
 */
fun main() {
    val extraction = Kreuzberg.extractFileSync(
        Paths.get("document.pdf"),
        null,
        ExtractionConfig.builder()
            .withEnableQualityProcessing(true)
            .withUseCache(true)
            .build(),
    )

    println("Quality score: ${extraction.qualityScore()}")

    // processingWarnings() may be null; report zero in that case.
    val warningCount = extraction.processingWarnings()?.size ?: 0
    println("Warnings: $warningCount")
}
|
||||
```
|
||||
26
docs/snippets/kotlin/config/tesseract_config.md
Normal file
26
docs/snippets/kotlin/config/tesseract_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configures the `tesseract` backend with a Tesseract-specific config
 * (language `eng+deu`, PSM 6, OEM 3), extracts `scanned.pdf`, and prints the
 * OCR text.
 */
fun main() {
    // The same language string is set on both the Tesseract config and the OCR config.
    val languages = "eng+deu"

    val tesseractSettings = TesseractConfig.builder()
        .withLanguage(languages)
        .withPsm(6)
        .withOem(3)
        .build()

    val ocrSettings = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage(languages)
        .withTesseractConfig(Optional.of(tesseractSettings))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrSettings))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extractionConfig)
    println("OCR text: ${extraction.content()}")
}
|
||||
```
|
||||
19
docs/snippets/kotlin/config/token_reduction_config.md
Normal file
19
docs/snippets/kotlin/config/token_reduction_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Enables token reduction in `moderate` mode (preserving important words),
 * extracts `document.pdf`, and prints the resulting content.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(
            Optional.of(
                TokenReductionOptions.builder()
                    .withMode("moderate")
                    .withPreserveImportantWords(true)
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    println("Reduced content: ${extraction.content()}")
}
|
||||
```
|
||||
11
docs/snippets/kotlin/getting-started/basic_usage.md
Normal file
11
docs/snippets/kotlin/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/** Extracts `document.pdf` with a default config and prints its content and MIME type. */
fun main() {
    val extraction = dev.kreuzberg.Kreuzberg.extractFileSync(
        Paths.get("document.pdf"),
        null,
        ExtractionConfig.builder().build(),
    )

    println(extraction.content())
    println("MIME type: ${extraction.mimeType()}")
}
|
||||
```
|
||||
13
docs/snippets/kotlin/getting-started/extract_file.md
Normal file
13
docs/snippets/kotlin/getting-started/extract_file.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/**
 * Extracts `document.pdf` with a default config and prints its content, MIME
 * type, and table count.
 */
fun main() {
    val defaultConfig = ExtractionConfig.builder().build()
    val source = Paths.get("document.pdf")
    val extraction = dev.kreuzberg.Kreuzberg.extractFileSync(source, null, defaultConfig)

    println(extraction.content())
    println("MIME type: ${extraction.mimeType()}")

    // tables() may be null; report zero in that case.
    val tableCount = extraction.tables()?.size ?: 0
    println("Tables: $tableCount")
}
|
||||
```
|
||||
21
docs/snippets/kotlin/getting-started/extract_with_ocr.md
Normal file
21
docs/snippets/kotlin/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `scanned.pdf` with the `tesseract` backend and forced OCR, prints
 * the content, and prints the detected languages when they are present.
 */
fun main() {
    val ocrSettings = OcrConfig.builder().withBackend("tesseract").withLanguage("eng").build()

    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrSettings))
        .withForceOcr(true)
        .build()

    val extraction = dev.kreuzberg.Kreuzberg.extractFileSync(
        Paths.get("scanned.pdf"),
        null,
        extractionConfig,
    )
    println(extraction.content())

    // detectedLanguages() may be null; the line is skipped in that case.
    val languages = extraction.detectedLanguages()
    if (languages != null) {
        println("Detected languages: $languages")
    }
}
|
||||
```
|
||||
11
docs/snippets/kotlin/getting-started/hello_world.md
Normal file
11
docs/snippets/kotlin/getting-started/hello_world.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/** Prints a greeting, then extracts `document.pdf` with a default config and prints its content. */
fun main() {
    println("Hello from Kreuzberg!")

    val extraction = dev.kreuzberg.Kreuzberg.extractFileSync(
        Paths.get("document.pdf"),
        null,
        ExtractionConfig.builder().build(),
    )
    println(extraction.content())
}
|
||||
```
|
||||
8
docs/snippets/kotlin/getting-started/install_verify.md
Normal file
8
docs/snippets/kotlin/getting-started/install_verify.md
Normal file
@@ -0,0 +1,8 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * Installation smoke check: builds a default `ExtractionConfig` and prints
 * whether the resulting object is non-null.
 */
fun main() {
    val loaded = ExtractionConfig.builder().build() != null
    println("Kreuzberg loaded: $loaded")
}
|
||||
```
|
||||
17
docs/snippets/kotlin/getting-started/read_content.md
Normal file
17
docs/snippets/kotlin/getting-started/read_content.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/**
 * Extracts `document.pdf` with a default config and prints every table and
 * every chunk of the result, numbered from 1.
 */
fun main() {
    val defaultConfig = ExtractionConfig.builder().build()
    val extraction = dev.kreuzberg.Kreuzberg.extractFileSync(
        Paths.get("document.pdf"),
        null,
        defaultConfig,
    )

    // A null list is treated as empty, so nothing is printed for it.
    for ((position, table) in extraction.tables().orEmpty().withIndex()) {
        println("Table ${position + 1}: $table")
    }

    for ((position, chunk) in extraction.chunks().orEmpty().withIndex()) {
        println("Chunk ${position + 1}: $chunk")
    }
}
|
||||
```
|
||||
38
docs/snippets/kotlin/llm/structured_extraction.md
Normal file
38
docs/snippets/kotlin/llm/structured_extraction.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Runs structured extraction on `paper.pdf` against a JSON-schema-shaped map
 * (title, authors, date) using the `openai/gpt-4o-mini` model, and prints the
 * structured output when one is returned.
 */
fun main() {
    // Schema pieces are named separately, then combined into the final map.
    val stringType = mapOf("type" to "string")
    val schemaProperties = mapOf(
        "title" to mapOf("type" to "string"),
        "authors" to mapOf("type" to "array", "items" to stringType),
        "date" to mapOf("type" to "string"),
    )
    val requiredFields = listOf("title", "authors", "date")

    val outputSchema = mapOf(
        "type" to "object",
        "properties" to schemaProperties,
        "required" to requiredFields,
        "additionalProperties" to false,
    )

    val llmSettings = LlmConfig.builder()
        .withModel("openai/gpt-4o-mini")
        .build()

    // Positional constructor arguments, in the order the snippet supplies them;
    // see StructuredExtractionConfig for the meaning of each slot.
    val structuredSettings = StructuredExtractionConfig(
        outputSchema,
        "document",
        null,
        true,
        null,
        llmSettings,
    )

    val extractionConfig = ExtractionConfig.builder()
        .withStructuredExtraction(Optional.of(structuredSettings))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("paper.pdf"), null, extractionConfig)

    // structuredOutput() may be null; nothing is printed in that case.
    val structured = extraction.structuredOutput()
    if (structured != null) {
        println(structured)
    }
}
|
||||
```
|
||||
32
docs/snippets/kotlin/mcp/mcp_custom_client.md
Normal file
32
docs/snippets/kotlin/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.util.Optional
|
||||
import java.io.BufferedReader
|
||||
import java.io.BufferedWriter
|
||||
import java.io.InputStreamReader
|
||||
import java.io.OutputStreamWriter
|
||||
|
||||
/**
 * Minimal MCP client over stdio: spawns `kreuzberg mcp`, performs the MCP
 * initialization handshake, issues one `tools/call` request for the
 * `extract_file` tool, and prints the raw response line.
 *
 * Every request carries `"jsonrpc":"2.0"` and an `id`. A JSON-RPC message
 * without an `id` is a notification, which the server never answers, so the
 * read below would block indefinitely.
 */
fun main() {
    val process = ProcessBuilder("kreuzberg", "mcp")
        // Keep the server's stderr out of stdout: stdout must carry only
        // protocol messages, otherwise a log line could be read as a response.
        .redirectError(ProcessBuilder.Redirect.INHERIT)
        .start()

    try {
        // `use` closes both streams even if a write or read throws.
        BufferedWriter(OutputStreamWriter(process.outputStream, Charsets.UTF_8)).use { stdin ->
            BufferedReader(InputStreamReader(process.inputStream, Charsets.UTF_8)).use { stdout ->
                // One JSON message per line, flushed immediately so the server sees it.
                fun send(message: String) {
                    stdin.write(message)
                    stdin.newLine()
                    stdin.flush()
                }

                // MCP lifecycle: `initialize` must be the first request, followed by
                // the `notifications/initialized` notification once the server replied.
                send(
                    """{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"kotlin-example","version":"1.0.0"}}}""",
                )
                // NOTE(review): assumes the first line the server writes is the initialize result.
                stdout.readLine()
                send("""{"jsonrpc":"2.0","method":"notifications/initialized"}""")

                send(
                    """{"jsonrpc":"2.0","id":2,"method":"tools/call","params":{"name":"extract_file","arguments":{"path":"document.pdf","async":true}}}""",
                )

                // readLine() returns null if the server closed its stdout without answering.
                val response = stdout.readLine()
                println(response)
            }
        }
    } finally {
        // Always terminate the child process, including on failure.
        process.destroy()
    }
}
|
||||
```
|
||||
11
docs/snippets/kotlin/mcp/mcp_server_start.md
Normal file
11
docs/snippets/kotlin/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Launches `kreuzberg mcp` as a child process that inherits this process's
 * standard streams, and blocks until it exits.
 */
fun main() {
    val launcher = ProcessBuilder("kreuzberg", "mcp")
    launcher.inheritIO()

    val server = launcher.start()
    server.waitFor()
}
|
||||
```
|
||||
20
docs/snippets/kotlin/metadata/language_detection.md
Normal file
20
docs/snippets/kotlin/metadata/language_detection.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Enables language detection (minimum confidence 0.9, single language),
 * extracts `document.pdf`, and prints the detected languages.
 */
fun main() {
    val detectionSettings = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.9)
        .withDetectMultiple(false)
        .build()

    val extraction = Kreuzberg.extractFileSync(
        Paths.get("document.pdf"),
        null,
        ExtractionConfig.builder()
            .withLanguageDetection(Optional.of(detectionSettings))
            .build(),
    )

    println("Detected languages: ${extraction.detectedLanguages()}")
}
|
||||
```
|
||||
@@ -0,0 +1,25 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Enables multi-language detection (minimum confidence 0.8), extracts
 * `multilingual_document.pdf`, and prints the detected languages as a list
 * followed by one line per language.
 */
fun main() {
    val detectionSettings = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(detectionSettings))
        .build()

    val source = Paths.get("multilingual_document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // A null result is treated as "no languages detected".
    val languages = extraction.detectedLanguages().orEmpty()
    println("Detected languages: $languages")
    languages.forEach { code -> println(" - $code") }
}
|
||||
```
|
||||
60
docs/snippets/kotlin/metadata/metadata.md
Normal file
60
docs/snippets/kotlin/metadata/metadata.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Prints general and format-specific metadata: first for `document.pdf`
 * (title, authors, PDF page count / producer / version), then for `page.html`
 * (HTML head fields, Open Graph and Twitter Card entries, headers, links,
 * images, and the structured-data item count).
 */
fun main() {
    // Prints "<label>: <value>" only when the value is present.
    fun report(label: String, value: Any?) {
        if (value != null) {
            println("$label: $value")
        }
    }

    val defaultConfig = ExtractionConfig.builder().build()
    val pdfResult = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, defaultConfig)

    val general = pdfResult.metadata()
    report("Title", general.title())
    report("Authors", general.authors()?.joinToString(", "))

    // Format-specific metadata via discriminated union
    val pdfMeta = general.format()?.pdf()
    if (pdfMeta != null) {
        report("Pages", pdfMeta.pageCount())
        report("Producer", pdfMeta.producer())
        report("PDF Version", pdfMeta.pdfVersion())
    }

    // Access HTML metadata
    val htmlResult = Kreuzberg.extractFileSync(Paths.get("page.html"), null, defaultConfig)
    val htmlMeta = htmlResult.metadata().format()?.html()
    if (htmlMeta != null) {
        report("Title", htmlMeta.title())
        report("Description", htmlMeta.description())
        report("Canonical URL", htmlMeta.canonicalUrl())
        report("Language", htmlMeta.language())

        // Access keywords list
        println("Keywords: ${htmlMeta.keywords()}")

        // Open Graph fields are exposed as a Map<String, String>
        val openGraph = htmlMeta.openGraph()
        report("Open Graph Image", openGraph["image"])
        report("Open Graph Title", openGraph["title"])

        // Twitter Card fields as a Map<String, String>
        report("Twitter Card Type", htmlMeta.twitterCard()["card"])

        // Headers
        htmlMeta.headers().forEach { heading ->
            println("Header (level ${heading.level()}): ${heading.text()}")
        }

        // Links
        htmlMeta.links().forEach { anchor ->
            println("Link: ${anchor.href()} (${anchor.text()})")
        }

        // Images
        htmlMeta.images().forEach { picture ->
            println("Image: ${picture.src()}")
        }

        // Structured data
        val structuredItems = htmlMeta.structuredData()
        if (structuredItems.isNotEmpty()) {
            println("Structured data items: ${structuredItems.size}")
        }
    }
}
|
||||
```
|
||||
24
docs/snippets/kotlin/metadata/page_boundaries.md
Normal file
24
docs/snippets/kotlin/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/**
 * Extracts `document.pdf` and, for the first three page boundaries reported in
 * the metadata, prints the page number, its byte range, and a preview of up to
 * 100 characters of that page's text.
 *
 * Returns silently when the metadata carries no page information or no
 * boundaries.
 */
fun main() {
    val config = ExtractionConfig.builder().build()
    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)

    val pages = result.metadata().pages() ?: return
    val boundaries = pages.boundaries() ?: return

    // byteStart()/byteEnd() are byte offsets, whereas String.substring takes
    // UTF-16 char indices. The two only coincide for pure-ASCII text, so the
    // page is sliced out of the encoded bytes and decoded afterwards.
    // NOTE(review): assumes the offsets index the UTF-8 encoding of content() — confirm.
    val contentBytes = result.content().toByteArray(Charsets.UTF_8)

    for (boundary in boundaries.take(3)) {
        val start = boundary.byteStart().toInt()
        val end = boundary.byteEnd().toInt()
        val pageText = contentBytes.decodeToString(start, end)

        println("Page ${boundary.pageNumber()}:")
        println(" Byte range: $start-$end")
        // take() caps the preview at 100 chars without index arithmetic.
        println(" Preview: ${pageText.take(100)}...")
    }
}
|
||||
```
|
||||
25
docs/snippets/kotlin/metadata/page_tracking_basic.md
Normal file
25
docs/snippets/kotlin/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with per-page extraction enabled and prints, for
 * each page, its number, content length, table count, and image count.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withPages(
            Optional.of(
                PageConfig.builder()
                    .withExtractPages(true)
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)

    // pages() may be null; nothing is printed in that case.
    extraction.pages()?.forEach { sheet ->
        println("Page ${sheet.pageNumber()}:")
        println(" Content: ${sheet.content().length} chars")
        println(" Tables: ${sheet.tables().size}")
        println(" Images: ${sheet.images().size}")
    }
}
|
||||
```
|
||||
19
docs/snippets/kotlin/metadata/tables.md
Normal file
19
docs/snippets/kotlin/metadata/tables.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/**
 * Extracts `document.pdf` with a default config and prints every table: a
 * summary line (page number and row count), its markdown rendering, and then
 * each row of cells.
 */
fun main() {
    val defaultConfig = ExtractionConfig.builder().build()
    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, defaultConfig)

    // A null table list is treated as empty.
    extraction.tables().orEmpty().forEach { grid ->
        val rows = grid.cells()
        println("Table on page ${grid.pageNumber()} with ${rows.size} rows")
        println(grid.markdown())

        rows.forEach { cellsInRow -> println(cellsInRow) }
    }
}
|
||||
```
|
||||
57
docs/snippets/kotlin/metadata/vector_database_integration.md
Normal file
57
docs/snippets/kotlin/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,57 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * One row destined for a vector store: a chunk of text, its embedding, and
 * string-valued metadata describing where it came from.
 *
 * @property id identifier of the record.
 * @property content text of the chunk.
 * @property embedding embedding vector for [content].
 * @property metadata additional string key/value pairs stored with the record.
 */
data class VectorRecord(
    val id: String,
    val content: String,
    val embedding: List<Float>,
    val metadata: Map<String, String>,
)
|
||||
|
||||
/**
 * Extracts [documentPath] with chunking and embeddings enabled and converts
 * every chunk that has an embedding into a [VectorRecord].
 *
 * @param documentPath path of the file to extract.
 * @param documentId identifier used in record ids (`<documentId>_chunk_<index>`)
 *   and stored under the `document_id` metadata key.
 * @return one record per chunk with an embedding; empty when the result has no
 *   chunks. Chunk indices refer to the position in the full chunk list, so
 *   skipped chunks leave gaps.
 */
fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
    val embeddingSettings = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .build()

    val chunkSettings = ChunkingConfig.builder()
        .withMaxCharacters(512L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embeddingSettings))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkSettings))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get(documentPath), null, extractionConfig)

    // Chunks without an embedding map to null and are dropped by mapIndexedNotNull.
    return extraction.chunks().orEmpty().mapIndexedNotNull { position, piece ->
        piece.embedding()?.let { vector ->
            VectorRecord(
                id = "${documentId}_chunk_$position",
                content = piece.content(),
                embedding = vector,
                metadata = mapOf(
                    "document_id" to documentId,
                    "chunk_index" to position.toString(),
                    "content_length" to piece.content().length.toString(),
                ),
            )
        }
    }
}
|
||||
|
||||
/** Vectorizes `document.pdf` under the id `doc-001` and prints how many records were produced. */
fun main() {
    val recordCount = extractAndVectorize("document.pdf", "doc-001").size
    println("Generated $recordCount vector records")
}
|
||||
```
|
||||
45
docs/snippets/kotlin/ocr/cloud_ocr_backend.md
Normal file
45
docs/snippets/kotlin/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,45 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Path
|
||||
|
||||
/**
 * Example custom OCR backend that identifies itself as `cloud-ocr`.
 *
 * Image OCR is delegated to [callCloudApi], which in this snippet is a stub
 * returning a fixed string. Table detection and document processing are
 * reported as unsupported.
 *
 * @property apiKey credential accepted by the constructor; not read anywhere in this snippet.
 * @property supportedLangs language codes this backend reports as supported.
 */
class CloudOcrBackend(
    private val apiKey: String,
    private val supportedLangs: List<String>,
) : IOcrBackend {

    override fun name(): String {
        return "cloud-ocr"
    }

    override fun version(): String {
        return "1.0.0"
    }

    /** Runs [callCloudApi] on the image bytes and wraps the text in a plain-text result. */
    override fun process_image(image_bytes: ByteArray, config: OcrConfig): ExtractionResult {
        val recognized = callCloudApi(image_bytes, config.language())
        val emptyMetadata = Metadata.builder().build()
        return ExtractionResult.builder()
            .withContent(recognized)
            .withMimeType("text/plain")
            .withMetadata(emptyMetadata)
            .build()
    }

    /** Reads the whole file into memory and forwards it to [process_image]. */
    override fun process_image_file(path: Path, config: OcrConfig): ExtractionResult =
        process_image(java.nio.file.Files.readAllBytes(path), config)

    override fun supports_language(lang: String): Boolean {
        return lang in supportedLangs
    }

    override fun backend_type(): OcrBackendType {
        return OcrBackendType.Custom
    }

    override fun supported_languages(): List<String> {
        return supportedLangs
    }

    override fun supports_table_detection(): Boolean {
        return false
    }

    override fun supports_document_processing(): Boolean {
        return false
    }

    /** Always throws: this backend does not process whole documents. */
    override fun process_document(_path: Path, _config: OcrConfig): ExtractionResult =
        throw UnsupportedOperationException("document processing not supported")

    // Placeholder for the remote call: ignores both arguments and returns a constant.
    private fun callCloudApi(image: ByteArray, language: String): String = "Extracted text"
}
|
||||
```
|
||||
18
docs/snippets/kotlin/ocr/image_extraction.md
Normal file
18
docs/snippets/kotlin/ocr/image_extraction.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` with image extraction enabled and prints the number of extracted images. */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withImages(
            Optional.of(
                ImageExtractionConfig.builder()
                    .withExtractImages(true)
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)

    // images() may be null; report zero in that case.
    val imageCount = extraction.images()?.size ?: 0
    println("Extracted images: $imageCount")
}
|
||||
```
|
||||
20
docs/snippets/kotlin/ocr/image_preprocessing.md
Normal file
20
docs/snippets/kotlin/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with image extraction at a target DPI of 300 and a
 * maximum image dimension of 4096, then prints the number of extracted images.
 */
fun main() {
    val imageSettings = ImageExtractionConfig.builder()
        .withExtractImages(true)
        .withTargetDpi(300)
        .withMaxImageDimension(4096)
        .build()

    val extraction = Kreuzberg.extractFileSync(
        Paths.get("document.pdf"),
        null,
        ExtractionConfig.builder()
            .withImages(Optional.of(imageSettings))
            .build(),
    )

    val imageCount = extraction.images()?.size ?: 0
    println("Extracted images: $imageCount")
}
|
||||
```
|
||||
19
docs/snippets/kotlin/ocr/ocr_easyocr.md
Normal file
19
docs/snippets/kotlin/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `document.pdf` using the `easyocr` backend with language `en` and prints the text. */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withOcr(
            Optional.of(
                OcrConfig.builder()
                    .withBackend("easyocr")
                    .withLanguage("en")
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    println("Extracted text: ${extraction.content()}")
}
|
||||
```
|
||||
31
docs/snippets/kotlin/ocr/ocr_elements.md
Normal file
31
docs/snippets/kotlin/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `scanned.pdf` with the `paddleocr` backend and OCR elements
 * included, then prints text, recognition confidence, geometry, and (when
 * present) rotation for each OCR element, followed by a blank line.
 */
fun main() {
    val ocrSettings = OcrConfig.builder()
        .withBackend("paddleocr")
        .withLanguage("en")
        .withElementConfig(
            Optional.of(OcrElementConfig.builder().withIncludeElements(true).build()),
        )
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrSettings))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, extractionConfig)

    // A null element list is treated as empty.
    for (item in extraction.ocrElements().orEmpty()) {
        println("Text: ${item.text()}")
        println("Confidence: ${item.confidence().recognition()}")
        println("Geometry: ${item.geometry()}")

        val rotation = item.rotation()
        if (rotation != null) {
            println("Rotation: $rotation")
        }
        println()
    }
}
|
||||
```
|
||||
19
docs/snippets/kotlin/ocr/ocr_extraction.md
Normal file
19
docs/snippets/kotlin/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/** Extracts `scanned.pdf` using the `tesseract` backend with language `eng` and prints the content. */
fun main() {
    val ocrSettings = OcrConfig.builder().withBackend("tesseract").withLanguage("eng").build()
    val extractionConfig = ExtractionConfig.builder().withOcr(Optional.of(ocrSettings)).build()

    val source = Paths.get("scanned.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)
    println(extraction.content())
}
|
||||
```
|
||||
20
docs/snippets/kotlin/ocr/ocr_force_all_pages.md
Normal file
20
docs/snippets/kotlin/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with the `tesseract` backend (language `eng`) and
 * the force-OCR flag set, then prints the content.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withOcr(
            Optional.of(
                OcrConfig.builder()
                    .withBackend("tesseract")
                    .withLanguage("eng")
                    .build(),
            ),
        )
        .withForceOcr(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    println(extraction.content())
}
|
||||
```
|
||||
19
docs/snippets/kotlin/ocr/ocr_multi_language.md
Normal file
19
docs/snippets/kotlin/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `multilingual.pdf` using the "tesseract" backend with the combined
 * language string "eng+deu" and prints the extracted content.
 */
fun main() {
    // Two languages are requested in a single "+"-joined language string.
    val multiLanguageOcr = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng+deu")
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(multiLanguageOcr))
        .build()

    val source = Paths.get("multilingual.pdf")
    println(Kreuzberg.extractFileSync(source, null, extractionConfig).content())
}
|
||||
```
|
||||
19
docs/snippets/kotlin/ocr/ocr_paddleocr.md
Normal file
19
docs/snippets/kotlin/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` through the "paddleocr" backend (language "en")
 * and prints the extracted text with an "Extracted text: " prefix.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withOcr(
            Optional.of(
                OcrConfig.builder()
                    .withBackend("paddleocr")
                    .withLanguage("en")
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val text = extraction.content()
    println("Extracted text: $text")
}
|
||||
```
|
||||
14
docs/snippets/kotlin/plugins/clear_plugins.md
Normal file
14
docs/snippets/kotlin/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
|
||||
/**
 * Clears every plugin registry that the facade exposes a `clear*` call for:
 * post-processors, OCR backends, and validators, in that order.
 */
fun clearAllPlugins() {
    // Document extractors are deliberately absent: there is no
    // Kreuzberg.clearDocumentExtractors() because extractor registration is
    // not exposed through the Kotlin/Java plugin bridge.
    Kreuzberg.clearPostProcessors()
    Kreuzberg.clearOcrBackends()
    Kreuzberg.clearValidators()

    val summary = "All post-processors, OCR backends, and validators cleared"
    println(summary)
}
|
||||
```
|
||||
22
docs/snippets/kotlin/plugins/embedding_backend.md
Normal file
22
docs/snippets/kotlin/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
// Wrap a host-language embedding model so kreuzberg can call back into it
|
||||
// during chunking and standalone embed requests.
|
||||
/**
 * Placeholder embedding backend that returns an all-zero vector per input text.
 *
 * @param dim length of every embedding vector; must fit in an `Int` because
 *   Kotlin lists are `Int`-sized.
 * @throws IllegalArgumentException if [dim] is negative or exceeds [Int.MAX_VALUE].
 */
class MyEmbedder(private val dim: Long = 768L) : IEmbeddingBackend {
    init {
        // Fail fast: `dim.toInt()` would silently wrap for values above
        // Int.MAX_VALUE, making embed() disagree with dimensions().
        require(dim in 0L..Int.MAX_VALUE.toLong()) {
            "dim must be between 0 and ${Int.MAX_VALUE}, got $dim"
        }
    }

    override fun name(): String = "my-embedder"

    override fun version(): String = "1.0.0"

    /** Reports the vector length configured at construction. */
    override fun dimensions(): Long = dim

    /**
     * Produces one vector per entry of [texts].
     *
     * Replace this with a real model invocation. Each inner list must have
     * exactly `dimensions()` elements — the bridge validates shape.
     */
    override fun embed(texts: List<String>): List<List<Float>> {
        val width = dim.toInt() // lossless: range-checked in init
        return texts.map { List(width) { 0.0f } }
    }
}
|
||||
|
||||
/** Registers a default-constructed [MyEmbedder] through [EmbeddingBackendBridge]. */
fun registerMyEmbedder() {
    val embedder = MyEmbedder()
    EmbeddingBackendBridge.registerEmbeddingBackend(embedder)
}
|
||||
```
|
||||
21
docs/snippets/kotlin/plugins/extractor_registration.md
Normal file
21
docs/snippets/kotlin/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
|
||||
// The Kotlin/Java plugin bridge does not expose an IDocumentExtractor interface
|
||||
// — extractor registration lives in the Rust core. From Kotlin you can list
|
||||
// the extractors that are already registered and route extraction through the
|
||||
// existing facade.
|
||||
/**
 * Prints the names of the registered document extractors, then extracts
 * `document.pdf` with a default config and prints the content length and
 * MIME type reported by the result.
 */
fun useRegisteredExtractors() {
    val registered: List<String> = Kreuzberg.listDocumentExtractors()
    println("Available extractors: $registered")

    val pdfPath = java.nio.file.Path.of("document.pdf")
    val defaultConfig = ExtractionConfig.builder().build()
    val extraction: ExtractionResult = Kreuzberg.extractFileSync(pdfPath, null, defaultConfig)

    val charCount = extraction.content().length
    println("Extracted $charCount characters via ${extraction.mimeType()}")
}
|
||||
```
|
||||
18
docs/snippets/kotlin/plugins/list_plugins.md
Normal file
18
docs/snippets/kotlin/plugins/list_plugins.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
|
||||
/**
 * Prints the registered plugin names for each registry the facade can list:
 * document extractors, post-processors, OCR backends, and validators.
 */
fun listAllPlugins() {
    // Shared formatter: each registry is reported as "<heading>: <names>".
    fun report(heading: String, names: List<String>) = println("$heading: $names")

    report("Registered extractors", Kreuzberg.listDocumentExtractors())
    report("Registered post-processors", Kreuzberg.listPostProcessors())
    report("Registered OCR backends", Kreuzberg.listOcrBackends())
    report("Registered validators", Kreuzberg.listValidators())
}
|
||||
```
|
||||
28
docs/snippets/kotlin/plugins/min_length_validator.md
Normal file
28
docs/snippets/kotlin/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * Validator that rejects extraction results whose content is shorter than
 * [minLength] characters.
 */
class MinLengthValidator(private val minLength: Int) : IValidator {
    override fun name(): String = "min-length-validator"

    override fun version(): String = "1.0.0"

    override fun priority(): Int = 100

    /** Always opts in: every result is validated. */
    override fun should_validate(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = true

    /**
     * Checks the content length of [result].
     *
     * @throws IllegalStateException when the content has fewer than [minLength] characters.
     */
    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        val length = result.content().length
        // check() raises IllegalStateException with the lazily built message.
        check(length >= minLength) { "Content too short: $length < $minLength characters" }
    }
}
|
||||
|
||||
/** Registers a [MinLengthValidator] with a 100-character minimum through [ValidatorBridge]. */
fun registerMinLengthValidator() {
    val validator = MinLengthValidator(minLength = 100)
    ValidatorBridge.registerValidator(validator)
}
|
||||
```
|
||||
38
docs/snippets/kotlin/plugins/pdf_metadata_extractor.md
Normal file
38
docs/snippets/kotlin/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.util.concurrent.atomic.AtomicInteger
|
||||
|
||||
/**
 * Post-processor that prints the title and authors from the metadata of
 * each PDF result it sees, numbered by a running counter.
 */
class PdfMetadataExtractor : IPostProcessor {
    // Number of PDF results handled so far; used only in the printed line.
    private val processed = AtomicInteger(0)

    override fun name(): String = "pdf-metadata-extractor"

    override fun version(): String = "1.0.0"

    override fun priority(): Int = 25

    override fun processing_stage(): ProcessingStage = ProcessingStage.Late

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 2L

    /** Opts in only for results whose MIME type is "application/pdf". */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = _result.mimeType() == "application/pdf"

    /**
     * Reports the metadata of [result] on stdout; results with any other
     * MIME type are ignored and do not advance the counter.
     */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        if (result.mimeType() == "application/pdf") {
            val sequence = processed.incrementAndGet()
            // Metadata is an immutable record, so fields are read and
            // reported rather than mutated.
            val info: Metadata = result.metadata()
            println("[pdf-metadata] #$sequence title=${info.title()} authors=${info.authors()}")
        }
    }
}
|
||||
|
||||
/** Registers a new [PdfMetadataExtractor] through [PostProcessorBridge]. */
fun registerPdfMetadataExtractor() {
    val processor = PdfMetadataExtractor()
    PostProcessorBridge.registerPostProcessor(processor)
}
|
||||
```
|
||||
30
docs/snippets/kotlin/plugins/pdf_only_processor.md
Normal file
30
docs/snippets/kotlin/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * Post-processor that acts only on results whose MIME type is
 * "application/pdf" and prints their content length.
 */
class PdfOnlyProcessor : IPostProcessor {
    override fun name(): String = "pdf-only-processor"

    override fun version(): String = "1.0.0"

    override fun priority(): Int = 50

    override fun processing_stage(): ProcessingStage = ProcessingStage.Middle

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 5L

    /** Gate: returns true only for "application/pdf" results. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = _result.mimeType() == "application/pdf"

    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        // The MIME check is repeated here as a guard in addition to
        // should_process() — that gate saves the JSON roundtrip when it
        // returns false.
        if (result.mimeType() == "application/pdf") {
            val charCount = result.content().length
            println("[pdf-only] processing PDF ($charCount chars)")
        }
    }
}
|
||||
|
||||
/** Registers a new [PdfOnlyProcessor] through [PostProcessorBridge]. */
fun registerPdfOnlyProcessor() {
    val processor = PdfOnlyProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
|
||||
```
|
||||
33
docs/snippets/kotlin/plugins/plugin_extractor.md
Normal file
33
docs/snippets/kotlin/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Path
|
||||
|
||||
// The Kotlin/Java bindings expose plugin bridges for IPostProcessor,
|
||||
// IValidator, IOcrBackend, and IEmbeddingBackend. There is no
|
||||
// IDocumentExtractor bridge — extractor selection happens entirely in the
|
||||
// Rust core based on MIME type. From Kotlin, the "extractor plugin" pattern
|
||||
// is to wrap Kreuzberg.extractBytes / extractFile and dispatch to the right
|
||||
// extractor by MIME.
|
||||
/**
 * Thin wrapper over the suspending `Kreuzberg.extractBytes` and
 * `Kreuzberg.extractFile` calls, supplying a default [ExtractionConfig].
 */
class GenericExtractorClient {
    /**
     * Extracts in-memory [content] declared as [mimeType].
     *
     * @param config extraction settings; defaults to a freshly built default config.
     */
    suspend fun extractBytes(
        content: ByteArray,
        mimeType: String,
        config: ExtractionConfig = ExtractionConfig.builder().build(),
    ): ExtractionResult {
        return Kreuzberg.extractBytes(content, mimeType, config)
    }

    /**
     * Extracts the file at [path].
     *
     * @param mimeType optional MIME type forwarded as-is; null by default.
     * @param config extraction settings; defaults to a freshly built default config.
     */
    suspend fun extractFile(
        path: Path,
        mimeType: String? = null,
        config: ExtractionConfig = ExtractionConfig.builder().build(),
    ): ExtractionResult {
        return Kreuzberg.extractFile(path, mimeType, config)
    }
}
|
||||
|
||||
/**
 * Reads `payload.json` into memory, extracts it as "application/json"
 * through [GenericExtractorClient], and prints the extracted length.
 */
suspend fun extractCustomPayload() {
    val extractor = GenericExtractorClient()
    val payload = Files.readAllBytes(Path.of("payload.json"))
    val extraction = extractor.extractBytes(payload, mimeType = "application/json")
    val charCount = extraction.content().length
    println("Extracted $charCount chars")
}
|
||||
```
|
||||
41
docs/snippets/kotlin/plugins/plugin_logging.md
Normal file
41
docs/snippets/kotlin/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.util.logging.Logger
|
||||
|
||||
/**
 * Post-processor that logs lifecycle events and a one-line summary of each
 * result through `java.util.logging`.
 */
class LoggingPostProcessor : IPostProcessor {
    // Logger is named after this class.
    private val log: Logger = Logger.getLogger(LoggingPostProcessor::class.java.name)

    override fun name(): String = "logging-post-processor"

    override fun version(): String = "1.0.0"

    override fun priority(): Int = 10

    override fun processing_stage(): ProcessingStage = ProcessingStage.Late

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L

    /** Always opts in: every result is logged. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = true

    override fun initialize() {
        log.info("Initializing plugin: ${name()}")
    }

    override fun shutdown() {
        log.info("Shutting down plugin: ${name()}")
    }

    /** Logs MIME type and content length, plus a warning when the content is empty. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        val mime = result.mimeType()
        val text = result.content()
        log.info("Processing $mime (${text.length} chars)")
        if (text.isEmpty()) {
            log.warning("Extraction resulted in empty content")
        }
    }
}
|
||||
|
||||
/** Registers a new [LoggingPostProcessor] through [PostProcessorBridge]. */
fun registerLoggingPostProcessor() {
    val processor = LoggingPostProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
|
||||
```
|
||||
55
docs/snippets/kotlin/plugins/plugin_testing.md
Normal file
55
docs/snippets/kotlin/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,55 @@
|
||||
<!-- snippet:skip reason="kotlin.test is not on the snippet-runner classpath; the plugin-testing pattern documented here cannot compile under the runner's lightweight Kotlin profile. Run these tests from a real Gradle build." -->
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import dev.kreuzberg.kt.Kreuzberg
|
||||
import kotlin.test.Test
|
||||
import kotlin.test.assertEquals
|
||||
import kotlin.test.assertFailsWith
|
||||
import kotlin.test.assertTrue
|
||||
|
||||
/**
 * Unit tests for `MinLengthValidator`: accept/reject behaviour, stable
 * identity values, and a register/list/unregister round-trip.
 */
class MinLengthValidatorTest {

    /** Builds a fresh default config for each use. */
    private fun defaultConfig(): ExtractionConfig = ExtractionConfig.builder().build()

    /** Builds a minimal "text/plain" result carrying [content]. */
    private fun makeResult(content: String): ExtractionResult {
        val emptyMetadata = Metadata.builder().build()
        return ExtractionResult.builder()
            .content(content)
            .mimeType("text/plain")
            .metadata(emptyMetadata)
            .tables(emptyList())
            .processingWarnings(emptyList())
            .build()
    }

    @Test
    fun `validate accepts content above minimum length`() {
        val longEnough = makeResult("hello world")
        // Passing means validate() returns without throwing.
        MinLengthValidator(minLength = 5).validate(longEnough, defaultConfig())
    }

    @Test
    fun `validate rejects content below minimum length`() {
        val strict = MinLengthValidator(minLength = 100)
        val tooShort = makeResult("too short")
        assertFailsWith<IllegalStateException> {
            strict.validate(tooShort, defaultConfig())
        }
    }

    @Test
    fun `priority and name are stable`() {
        val subject = MinLengthValidator(minLength = 1)
        assertEquals("min-length-validator", subject.name())
        assertEquals(100, subject.priority())
        val optsIn = subject.should_validate(makeResult(""), defaultConfig())
        assertTrue(optsIn)
    }

    @Test
    fun `registration round-trip exposes the plugin in the listing`() {
        ValidatorBridge.registerValidator(MinLengthValidator(minLength = 1))
        try {
            val listed = Kreuzberg.listValidators()
            assertTrue(listed.contains("min-length-validator"))
        } finally {
            // Always unregister so the validator does not stay in the registry.
            ValidatorBridge.unregisterValidator("min-length-validator")
        }
    }
}
|
||||
```
|
||||
44
docs/snippets/kotlin/plugins/plugin_validator.md
Normal file
44
docs/snippets/kotlin/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
// Generic validator pattern: every IValidator has the same shape.
|
||||
// `name()` keys the registry, `priority()` orders execution (higher = earlier),
|
||||
// `should_validate()` is a fast skip-check, and `validate()` throws on failure.
|
||||
/**
 * Validator whose name, priority, and validation logic are supplied by the
 * caller.
 *
 * @param pluginName value returned by [name].
 * @param pluginPriority value returned by [priority].
 * @param check callback run by [validate]; it signals failure by throwing.
 */
class GenericValidator(
    private val pluginName: String,
    private val pluginPriority: Int,
    private val check: (ExtractionResult, ExtractionConfig) -> Unit,
) : IValidator {
    override fun name(): String = pluginName

    override fun version(): String = "1.0.0"

    override fun priority(): Int = pluginPriority

    /** Always opts in; the supplied callback decides the outcome. */
    override fun should_validate(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = true

    override fun initialize() {
        // Optional: open resources, load config files, etc.
    }

    override fun shutdown() {
        // Optional: release resources held in initialize().
    }

    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        // Explicit invoke: this is the constructor-supplied callback, not
        // the standard-library check() precondition helper.
        check.invoke(result, config)
    }
}
|
||||
|
||||
/**
 * Registers a [GenericValidator] named "non-empty-content" (priority 200)
 * that fails with `IllegalArgumentException` when the content is blank.
 */
fun registerGenericValidator() {
    val nonEmptyContent = GenericValidator(
        pluginName = "non-empty-content",
        pluginPriority = 200,
        check = { extraction, _ ->
            require(extraction.content().isNotBlank()) { "Extracted content is blank" }
        },
    )
    ValidatorBridge.registerValidator(nonEmptyContent)
}
|
||||
```
|
||||
28
docs/snippets/kotlin/plugins/quality_score_validator.md
Normal file
28
docs/snippets/kotlin/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * Validator that rejects results whose quality score is below [threshold].
 *
 * [should_validate] opts out when the result has no quality score; if
 * [validate] is nevertheless called on such a result, the score is treated
 * as 0.0.
 */
class QualityScoreValidator(private val threshold: Double = 0.5) : IValidator {
    override fun name(): String = "quality-score-validator"

    override fun version(): String = "1.0.0"

    override fun priority(): Int = 50

    /** Opts in only when the result carries a quality score. */
    override fun should_validate(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = _result.qualityScore() != null

    /**
     * @throws IllegalStateException when the score is lower than [threshold].
     */
    override fun validate(result: ExtractionResult, config: ExtractionConfig) {
        val score = result.qualityScore() ?: 0.0
        // error() throws IllegalStateException with the formatted message.
        if (score < threshold) {
            error("Quality score too low: %.2f < %.2f".format(score, threshold))
        }
    }
}
|
||||
|
||||
/** Registers a [QualityScoreValidator] with a 0.5 threshold through [ValidatorBridge]. */
fun registerQualityScoreValidator() {
    val validator = QualityScoreValidator(threshold = 0.5)
    ValidatorBridge.registerValidator(validator)
}
|
||||
```
|
||||
47
docs/snippets/kotlin/plugins/stateful_plugin.md
Normal file
47
docs/snippets/kotlin/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.util.concurrent.ConcurrentHashMap
|
||||
import java.util.concurrent.atomic.AtomicLong
|
||||
|
||||
/**
 * Post-processor that keeps state across calls: an invocation counter and
 * a small string map recording the last MIME type and last call number.
 */
class StatefulPlugin : IPostProcessor {
    // State lives in an AtomicLong and a ConcurrentHashMap.
    private val invocations = AtomicLong(0)
    private val lastSeen: ConcurrentHashMap<String, String> = ConcurrentHashMap()

    override fun name(): String = "stateful-plugin"

    override fun version(): String = "1.0.0"

    override fun priority(): Int = 50

    override fun processing_stage(): ProcessingStage = ProcessingStage.Middle

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L

    /** Always opts in: every result is counted. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = true

    /** Resets the counter and clears the recorded values. */
    override fun initialize() {
        invocations.set(0)
        lastSeen.clear()
    }

    /** Prints the total call count, then clears the recorded values. */
    override fun shutdown() {
        println("Plugin called ${invocations.get()} times")
        lastSeen.clear()
    }

    /** Bumps the counter and records the MIME type and call number of [result]. */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        val callNumber = invocations.incrementAndGet()
        lastSeen.put("last_mime", result.mimeType())
        lastSeen.put("last_call", callNumber.toString())
    }

    /** Number of `process` calls since the last `initialize`. */
    fun callCount(): Long = invocations.get()

    /** MIME type recorded by the most recent `process` call, or null if none is recorded. */
    fun lastMime(): String? = lastSeen["last_mime"]
}
|
||||
|
||||
/** Registers a new [StatefulPlugin] through [PostProcessorBridge]. */
fun registerStatefulPlugin() {
    val plugin = StatefulPlugin()
    PostProcessorBridge.registerPostProcessor(plugin)
}
|
||||
```
|
||||
12
docs/snippets/kotlin/plugins/unregister_plugins.md
Normal file
12
docs/snippets/kotlin/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * Unregisters one plugin of each kind (post-processor, validator, OCR
 * backend, embedding backend) by name.
 */
fun unregisterPlugins() {
    // Every bridge class has its own static unregister helper. The string
    // passed in is whatever the plugin's name() method returns.
    val postProcessorName = "word-count"
    val validatorName = "min-length-validator"
    val ocrBackendName = "my-ocr-backend"
    val embedderName = "my-embedder"

    PostProcessorBridge.unregisterPostProcessor(postProcessorName)
    ValidatorBridge.unregisterValidator(validatorName)
    OcrBackendBridge.unregisterOcrBackend(ocrBackendName)
    EmbeddingBackendBridge.unregisterEmbeddingBackend(embedderName)
}
|
||||
```
|
||||
30
docs/snippets/kotlin/plugins/word_count_processor.md
Normal file
30
docs/snippets/kotlin/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
|
||||
/**
 * Post-processor that counts whitespace-separated words in the extracted
 * content and prints the count next to the result's MIME type.
 */
class WordCountProcessor : IPostProcessor {
    override fun name(): String = "word-count"

    override fun version(): String = "1.0.0"

    /**
     * Counts the non-empty tokens produced by splitting the content on runs
     * of whitespace and prints the result.
     */
    override fun process(result: ExtractionResult, config: ExtractionConfig) {
        val wordCount = result.content().split(WHITESPACE).count { it.isNotEmpty() }
        // ExtractionResult is an immutable record on the Java side; observe
        // and report rather than mutate.
        println("[word-count] ${result.mimeType()} -> $wordCount words")
    }

    override fun processing_stage(): ProcessingStage = ProcessingStage.Early

    /** Skips results with empty content. */
    override fun should_process(
        _result: ExtractionResult,
        _config: ExtractionConfig,
    ): Boolean = _result.content().isNotEmpty()

    override fun estimated_duration_ms(_result: ExtractionResult): Long = 1L

    override fun priority(): Int = 50

    private companion object {
        // Compiled once and shared, instead of recompiling the pattern on
        // every process() call.
        val WHITESPACE = Regex("\\s+")
    }
}
|
||||
|
||||
/** Registers a new [WordCountProcessor] through [PostProcessorBridge]. */
fun registerWordCountProcessor() {
    val processor = WordCountProcessor()
    PostProcessorBridge.registerPostProcessor(processor)
}
|
||||
```
|
||||
19
docs/snippets/kotlin/utils/chunking.md
Normal file
19
docs/snippets/kotlin/utils/chunking.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with chunking enabled (max 1500 characters,
 * overlap 200) and prints how many chunks the result contains.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(
            Optional.of(
                ChunkingConfig.builder()
                    .withMaxCharacters(1500L)
                    .withOverlap(200L)
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    // chunks() may be null; report 0 in that case.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
|
||||
```
|
||||
35
docs/snippets/kotlin/utils/chunking_rag.md
Normal file
35
docs/snippets/kotlin/utils/chunking_rag.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `research_paper.pdf` into 500-character chunks (overlap 50) with
 * embeddings from the "balanced" preset, then prints each chunk's position,
 * a 100-character preview, and the embedding size when one is present.
 */
fun main() {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .build()

    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, extractionConfig)

    // A null chunk list is treated as empty.
    extraction.chunks().orEmpty().forEach { chunk ->
        val info = chunk.metadata()
        // chunkIndex() is shifted by one for display.
        println("Chunk ${info.chunkIndex() + 1}/${info.totalChunks()}")
        println("Position: ${info.byteStart()}-${info.byteEnd()}")
        println("Content: ${chunk.content().take(100)}...")

        val vector = chunk.embedding()
        if (vector != null) {
            println("Embedding: ${vector.size} dimensions")
        }
    }
}
|
||||
```
|
||||
27
docs/snippets/kotlin/utils/embedding_with_chunking.md
Normal file
27
docs/snippets/kotlin/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` into 1024-character chunks (overlap 100), each
 * configured for embedding with the "balanced" preset, and prints the
 * resulting chunk count.
 */
fun main() {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .withShowDownloadProgress(false)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(
            Optional.of(
                ChunkingConfig.builder()
                    .withMaxCharacters(1024L)
                    .withOverlap(100L)
                    .withEmbedding(Optional.of(embeddingConfig))
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    // chunks() may be null; report 0 in that case.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks with embeddings: $chunkCount")
}
|
||||
```
|
||||
22
docs/snippets/kotlin/utils/keyword_extraction_example.md
Normal file
22
docs/snippets/kotlin/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `research_paper.pdf` with keyword extraction enabled (Yake, at
 * most 10 keywords, minimum score 0.3) and prints the keywords when the
 * result provides them.
 */
fun main() {
    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.3f)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, extractionConfig)

    // Nothing is printed when the result carries no keywords.
    val extracted = extraction.extractedKeywords()
    if (extracted != null) {
        println("Keywords: $extracted")
    }
}
|
||||
```
|
||||
22
docs/snippets/kotlin/utils/quality_processing_example.md
Normal file
22
docs/snippets/kotlin/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `scanned_document.pdf` with quality processing enabled and
 * prints the quality score, flagging it as low when it is below 0.5.
 * Nothing is printed when the result has no score.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withEnableQualityProcessing(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned_document.pdf"), null, extractionConfig)

    // No score available: nothing to report.
    val score = extraction.qualityScore() ?: return

    val template = if (score < 0.5) {
        "Warning: Low quality extraction (%.2f)"
    } else {
        "Quality score: %.2f"
    }
    println(template.format(score))
}
|
||||
```
|
||||
17
docs/snippets/kotlin/utils/standalone_embed.md
Normal file
17
docs/snippets/kotlin/utils/standalone_embed.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Embeds two sample strings with the "balanced" preset (normalization on)
 * and prints how many vectors came back and the size of the first one.
 */
fun main() {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .build()

    val samples = listOf("Hello, world!", "Kreuzberg is fast")
    val vectors = Kreuzberg.embedTexts(samples, embeddingConfig)

    println("Texts embedded: ${vectors.size}")
    // The first vector's size is reported as the dimension count.
    val dimensions = vectors[0].size
    println("Dimensions: $dimensions")
}
|
||||
```
|
||||
19
docs/snippets/kotlin/utils/token_reduction.md
Normal file
19
docs/snippets/kotlin/utils/token_reduction.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with token reduction in "moderate" mode
 * (preserving important words) and prints the resulting content.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(
            Optional.of(
                TokenReductionOptions.builder()
                    .withMode("moderate")
                    .withPreserveImportantWords(true)
                    .build(),
            ),
        )
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    println(extraction.content())
}
|
||||
```
|
||||
19
docs/snippets/kotlin/utils/token_reduction_example.md
Normal file
19
docs/snippets/kotlin/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `verbose_document.pdf` with token reduction in "moderate" mode
 * (preserving important words) and prints the length of the resulting content.
 */
fun main() {
    val reduction = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(reduction))
        .build()

    val source = Paths.get("verbose_document.pdf")
    val reducedLength = Kreuzberg.extractFileSync(source, null, extractionConfig).content().length
    println("Reduced content length: $reducedLength")
}
|
||||
```
|
||||
52
docs/snippets/kotlin/utils/vector_database_integration.md
Normal file
52
docs/snippets/kotlin/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * One row destined for a vector store: a chunk of text together with its
 * embedding and string-valued metadata.
 *
 * @property id identifier of the record.
 * @property content text of the chunk.
 * @property embedding embedding vector for [content].
 * @property metadata additional key/value attributes.
 */
data class VectorRecord(
    val id: String,
    val content: String,
    val embedding: List<Float>,
    val metadata: Map<String, String>,
)
|
||||
|
||||
/**
 * Extracts the document at [documentPath] into 512-character chunks
 * (overlap 50) with embeddings from the "balanced" preset, and converts
 * every chunk that has an embedding into a [VectorRecord].
 *
 * @param documentPath path of the file to extract.
 * @param documentId identifier used in each record's id and metadata.
 * @return one record per embedded chunk; chunks without an embedding are skipped.
 */
fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .build()

    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(512L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get(documentPath), null, extractionConfig)

    // The index is the chunk's position in the full chunk list, so skipped
    // chunks leave gaps in the record ids.
    return extraction.chunks().orEmpty().mapIndexedNotNull { index, chunk ->
        chunk.embedding()?.let { vector ->
            VectorRecord(
                id = "${documentId}_chunk_$index",
                content = chunk.content(),
                embedding = vector,
                metadata = mapOf(
                    "document_id" to documentId,
                    "chunk_index" to index.toString(),
                    "content_length" to chunk.content().length.toString(),
                ),
            )
        }
    }
}
|
||||
```
|
||||
Reference in New Issue
Block a user