This commit is contained in:
63
docs/snippets/kotlin/config/advanced_config.md
Normal file
63
docs/snippets/kotlin/config/advanced_config.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Assembles an [ExtractionConfig] from several nested configs (OCR, chunking with
 * embeddings, language detection, keywords, token reduction, post-processing),
 * extracts `document.pdf`, and prints the content, detected languages and chunk count.
 */
fun main() {
    // OCR settings: "tesseract" backend, language "eng".
    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()

    // Embedding settings, attached to the chunking config below.
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withBatchSize(32L)
        .withNormalize(true)
        .build()

    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()

    val languageDetectionConfig = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(false)
        .build()

    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.1f)
        .withNgramRange(listOf(1L, 3L))
        .withLanguage(Optional.of("en"))
        .build()

    val tokenReductionOptions = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()

    val postProcessorConfig = PostProcessorConfig.builder()
        .withEnabled(true)
        .build()

    // Top-level config: every nested config is passed wrapped in an Optional.
    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .withOcr(Optional.of(ocrConfig))
        .withChunking(Optional.of(chunkingConfig))
        .withLanguageDetection(Optional.of(languageDetectionConfig))
        .withKeywords(Optional.of(keywordConfig))
        .withTokenReduction(Optional.of(tokenReductionOptions))
        .withPostprocessor(Optional.of(postProcessorConfig))
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Content: ${result.content()}")

    // Languages are printed only when the result carries them.
    val languages = result.detectedLanguages()
    if (languages != null) {
        println("Languages: $languages")
    }

    println("Chunks: ${result.chunks()?.size ?: 0}")
}
|
||||
```
|
||||
81
docs/snippets/kotlin/config/chunking_config.md
Normal file
81
docs/snippets/kotlin/config/chunking_config.md
Normal file
@@ -0,0 +1,81 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with chunking configured (max 1000 characters, overlap 200)
 * and prints the number of chunks followed by the length of each chunk's content.
 */
fun main() {
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // A missing chunk list is treated as empty.
    val chunks = result.chunks().orEmpty()
    println("Chunks: ${chunks.size}")
    chunks.forEach { chunk ->
        println("Length: ${chunk.content().length}")
    }
}
|
||||
```
|
||||
|
||||
```kotlin title="Kotlin - Markdown with Heading Context"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Chunks `document.md` with the Markdown chunker and tokenizer-based sizing, then
 * prints each chunk's heading context (when present) and the first 100 characters
 * of its content.
 */
fun main() {
    // Tokenizer-based sizing; the second argument is left empty.
    val tokenizerSizing = ChunkSizing.Tokenizer("Xenova/gpt-4o", Optional.empty())

    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withChunkerType(ChunkerType.Markdown)
        .withSizing(tokenizerSizing)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val source = Paths.get("document.md")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    result.chunks().orEmpty().forEach { chunk ->
        // Heading context is optional: skip the heading listing when it is absent.
        val headingContext = chunk.metadata()?.headingContext()
        if (headingContext != null) {
            headingContext.headings().forEach { heading ->
                println("Heading L${heading.level()}: ${heading.text()}")
            }
        }

        println("Content: ${chunk.content().take(100)}...")
    }
}
|
||||
```
|
||||
|
||||
```kotlin title="Kotlin - Prepend Heading Context"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Chunks `document.md` with the Markdown chunker and `prependHeadingContext` enabled,
 * then prints the first 100 characters of every chunk.
 */
fun main() {
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withChunkerType(ChunkerType.Markdown)
        .withPrependHeadingContext(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val source = Paths.get("document.md")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    result.chunks().orEmpty().forEach { chunk ->
        // With this option, each chunk's content starts with its heading breadcrumb.
        println("Content: ${chunk.content().take(100)}...")
    }
}
|
||||
```
|
||||
15
docs/snippets/kotlin/config/config_basic.md
Normal file
15
docs/snippets/kotlin/config/config_basic.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Minimal example: builds a config with `useCache` and `enableQualityProcessing`
 * set to true, extracts `document.pdf`, and prints the extracted content.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println(result.content())
}
|
||||
```
|
||||
17
docs/snippets/kotlin/config/config_discover.md
Normal file
17
docs/snippets/kotlin/config/config_discover.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Builds a config through the builder with only `useCache` overridden, extracts
 * `document.pdf`, and prints the extracted content.
 */
fun main() {
    // The Java/Kotlin bindings construct configuration explicitly via the builder.
    // This is the counterpart of ExtractionConfig::discover() in Rust: start from
    // the defaults and override only the fields you need.
    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println(result.content())
}
|
||||
```
|
||||
20
docs/snippets/kotlin/config/config_ocr.md
Normal file
20
docs/snippets/kotlin/config/config_ocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `scanned.pdf` with an OCR config ("tesseract" backend, language "eng")
 * and prints the content length and the number of tables in the result.
 */
fun main() {
    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrConfig))
        .build()

    val source = Paths.get("scanned.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Content length: ${result.content().length}")
    // A missing table list is reported as zero tables.
    println("Tables detected: ${result.tables()?.size ?: 0}")
}
|
||||
```
|
||||
32
docs/snippets/kotlin/config/config_programmatic.md
Normal file
32
docs/snippets/kotlin/config/config_programmatic.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Builds a config programmatically from nested Tesseract, OCR and chunking configs,
 * extracts `document.pdf`, and prints the length of the extracted content.
 */
fun main() {
    // Tesseract-specific settings, nested inside the OCR config below.
    val tesseractConfig = TesseractConfig.builder()
        .withPsm(6)
        .build()

    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng+deu")
        .withTesseractConfig(Optional.of(tesseractConfig))
        .build()

    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .withOcr(Optional.of(ocrConfig))
        .withChunking(Optional.of(chunkingConfig))
        .withEnableQualityProcessing(true)
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Content length: ${result.content().length}")
}
|
||||
```
|
||||
19
docs/snippets/kotlin/config/document_structure_config.md
Normal file
19
docs/snippets/kotlin/config/document_structure_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with `includeDocumentStructure` enabled and prints every
 * node of the resulting document, if the result carries one.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withIncludeDocumentStructure(true)
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // Nothing is printed when the result has no document.
    result.document()?.let { structure ->
        structure.nodes().forEach { node -> println(node) }
    }
}
|
||||
```
|
||||
32
docs/snippets/kotlin/config/element_based_output.md
Normal file
32
docs/snippets/kotlin/config/element_based_output.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` in element-based result format, prints the type, a text
 * preview and (when present) the page number of every element, then prints the
 * text of all elements whose type is [ElementType.Title].
 */
fun main() {
    // resultFormat selects between Unified and ElementBased output.
    val extractionConfig = ExtractionConfig.builder()
        .withResultFormat(ResultFormat.ElementBased)
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // A missing element list is treated as empty.
    val elements = result.elements().orEmpty()
    elements.forEach { element ->
        println("Type: ${element.elementType()}")
        println("Text: ${element.text().take(100)}")

        // The page number is printed only when the metadata provides one.
        val page = element.metadata().pageNumber()
        if (page != null) {
            println("Page: $page")
        }
        println("---")
    }

    // Second pass: keep only title elements.
    elements
        .filter { it.elementType() == ElementType.Title }
        .forEach { title -> println("Title: ${title.text()}") }
}
|
||||
```
|
||||
27
docs/snippets/kotlin/config/embedding_config.md
Normal file
27
docs/snippets/kotlin/config/embedding_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with chunking and an embedding config attached to the
 * chunker, then prints how many chunks the result contains.
 */
fun main() {
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withBatchSize(16L)
        .withNormalize(true)
        .withShowDownloadProgress(true)
        .build()

    // The embedding config is attached to the chunking config, not the top-level config.
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Chunks with embeddings: ${result.chunks()?.size ?: 0}")
}
|
||||
```
|
||||
19
docs/snippets/kotlin/config/html_output.md
Normal file
19
docs/snippets/kotlin/config/html_output.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with HTML as the output format and the GitHub HTML theme,
 * then prints the resulting content.
 */
fun main() {
    val htmlOutputConfig = HtmlOutputConfig.builder()
        .withTheme(HtmlTheme.GitHub)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withOutputFormat(OutputFormat.Html)
        .withHtmlOutput(Optional.of(htmlOutputConfig))
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // The content is HTML with kb-* classes.
    println(result.content())
}
|
||||
```
|
||||
22
docs/snippets/kotlin/config/keyword_extraction_config.md
Normal file
22
docs/snippets/kotlin/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with a keyword-extraction config (Yake algorithm) and
 * prints the extracted keywords.
 */
fun main() {
    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.1f)
        .withNgramRange(listOf(1L, 3L))
        .withLanguage(Optional.of("en"))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordConfig))
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Keywords: ${result.extractedKeywords()}")
}
|
||||
```
|
||||
20
docs/snippets/kotlin/config/language_detection_config.md
Normal file
20
docs/snippets/kotlin/config/language_detection_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with language detection enabled (minimum confidence 0.8,
 * multiple-language detection on) and prints the detected languages.
 */
fun main() {
    val languageDetectionConfig = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(languageDetectionConfig))
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Detected languages: ${result.detectedLanguages()}")
}
|
||||
```
|
||||
23
docs/snippets/kotlin/config/ocr_dpi_config.md
Normal file
23
docs/snippets/kotlin/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with an image-extraction config that sets the target,
 * minimum and maximum DPI, then prints how many images were extracted.
 */
fun main() {
    val imageConfig = ImageExtractionConfig.builder()
        .withExtractImages(true)
        .withTargetDpi(300)
        .withMaxImageDimension(4096)
        .withAutoAdjustDpi(true)
        .withMinDpi(150)
        .withMaxDpi(600)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withImages(Optional.of(imageConfig))
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // A missing image list is reported as zero images.
    println("Extracted images: ${result.images()?.size ?: 0}")
}
|
||||
```
|
||||
26
docs/snippets/kotlin/config/pdf_config.md
Normal file
26
docs/snippets/kotlin/config/pdf_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `encrypted.pdf` with PDF-specific options (image extraction, a password
 * list, metadata extraction, hierarchy enabled) and prints the title and authors
 * from the result metadata.
 */
fun main() {
    val hierarchyConfig = HierarchyConfig.builder()
        .withEnabled(true)
        .build()

    val pdfConfig = PdfConfig.builder()
        .withExtractImages(true)
        .withPasswords(Optional.of(listOf("password123")))
        .withExtractMetadata(true)
        .withHierarchy(Optional.of(hierarchyConfig))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withPdfOptions(Optional.of(pdfConfig))
        .build()

    val source = Paths.get("encrypted.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Title: ${result.metadata().title()}")
    println("Authors: ${result.metadata().authors()}")
}
|
||||
```
|
||||
26
docs/snippets/kotlin/config/pdf_hierarchy_config.md
Normal file
26
docs/snippets/kotlin/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with a detailed PDF hierarchy config and prints the
 * number of pages in the result.
 */
fun main() {
    val hierarchyConfig = HierarchyConfig.builder()
        .withEnabled(true)
        .withKClusters(5L)
        .withIncludeBbox(true)
        .withOcrCoverageThreshold(Optional.of(0.8f))
        .build()

    // The hierarchy config is nested inside the PDF options.
    val pdfConfig = PdfConfig.builder()
        .withHierarchy(Optional.of(hierarchyConfig))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withPdfOptions(Optional.of(pdfConfig))
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // A missing page list counts as zero pages.
    val pageCount = result.pages().orEmpty().size
    println("Pages: $pageCount")
}
|
||||
```
|
||||
22
docs/snippets/kotlin/config/postprocessor_config.md
Normal file
22
docs/snippets/kotlin/config/postprocessor_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with post-processing enabled for an explicit list of
 * processors and prints the processed content.
 */
fun main() {
    // Names of the processors passed to withEnabledProcessors.
    val enabledProcessors = listOf(
        "whitespace_normalizer",
        "unicode_normalizer",
    )

    val postProcessorConfig = PostProcessorConfig.builder()
        .withEnabled(true)
        .withEnabledProcessors(Optional.of(enabledProcessors))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withPostprocessor(Optional.of(postProcessorConfig))
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Processed content: ${result.content()}")
}
|
||||
```
|
||||
16
docs/snippets/kotlin/config/quality_processing_config.md
Normal file
16
docs/snippets/kotlin/config/quality_processing_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with quality processing and caching enabled, then prints
 * the quality score and the number of processing warnings.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withEnableQualityProcessing(true)
        .withUseCache(true)
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Quality score: ${result.qualityScore()}")
    // A missing warning list is reported as zero warnings.
    println("Warnings: ${result.processingWarnings()?.size ?: 0}")
}
|
||||
```
|
||||
26
docs/snippets/kotlin/config/tesseract_config.md
Normal file
26
docs/snippets/kotlin/config/tesseract_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `scanned.pdf` using the "tesseract" OCR backend with an explicit
 * Tesseract config (language "eng+deu", psm 6, oem 3) and prints the OCR text.
 */
fun main() {
    val tesseractConfig = TesseractConfig.builder()
        .withLanguage("eng+deu")
        .withPsm(6)
        .withOem(3)
        .build()

    // The language is set on the OCR config as well as on the Tesseract config.
    val ocrConfig = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng+deu")
        .withTesseractConfig(Optional.of(tesseractConfig))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrConfig))
        .build()

    val source = Paths.get("scanned.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("OCR text: ${result.content()}")
}
|
||||
```
|
||||
19
docs/snippets/kotlin/config/token_reduction_config.md
Normal file
19
docs/snippets/kotlin/config/token_reduction_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with token reduction in "moderate" mode (preserving
 * important words) and prints the reduced content.
 */
fun main() {
    val tokenReductionOptions = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(tokenReductionOptions))
        .build()

    val source = Paths.get("document.pdf")
    val result = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Reduced content: ${result.content()}")
}
|
||||
```
|
||||
Reference in New Issue
Block a user