Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,63 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Build the settings for each feature first, then combine them into a
    // single ExtractionConfig below.
    val ocrSettings = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()

    val embeddingSettings = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withBatchSize(32L)
        .withNormalize(true)
        .build()

    // The chunking settings carry the embedding settings built above.
    val chunkingSettings = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .withEmbedding(Optional.of(embeddingSettings))
        .build()

    val languageSettings = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(false)
        .build()

    val ngramRange = listOf(1L, 3L)
    val keywordSettings = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.1f)
        .withNgramRange(ngramRange)
        .withLanguage(Optional.of("en"))
        .build()

    val tokenReductionSettings = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()

    val postprocessorSettings = PostProcessorConfig.builder()
        .withEnabled(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .withOcr(Optional.of(ocrSettings))
        .withChunking(Optional.of(chunkingSettings))
        .withLanguageDetection(Optional.of(languageSettings))
        .withKeywords(Optional.of(keywordSettings))
        .withTokenReduction(Optional.of(tokenReductionSettings))
        .withPostprocessor(Optional.of(postprocessorSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Content: ${extraction.content()}")

    // Detected languages may be absent; print them only when present.
    val languages = extraction.detectedLanguages()
    if (languages != null) {
        println("Languages: $languages")
    }

    // Treat a missing chunk list as zero chunks.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
```

View File

@@ -0,0 +1,81 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Chunking with a 1000 max-characters setting and a 200 overlap setting.
    val chunkingSettings = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // A missing chunk list is handled as an empty one.
    val pieces = extraction.chunks().orEmpty()
    println("Chunks: ${pieces.size}")
    pieces.forEach { piece ->
        println("Length: ${piece.content().length}")
    }
}
```
```kotlin title="Kotlin - Markdown with Heading Context"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Tokenizer-based sizing using the "Xenova/gpt-4o" tokenizer; the second
    // argument is left empty.
    val tokenizerSizing = ChunkSizing.Tokenizer("Xenova/gpt-4o", Optional.empty())

    val chunkingSettings = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withChunkerType(ChunkerType.Markdown)
        .withSizing(tokenizerSizing)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingSettings))
        .build()

    val source = Paths.get("document.md")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    extraction.chunks().orEmpty().forEach { piece ->
        // Metadata and heading context can both be absent; print the headings
        // only when a heading context is available.
        val headingContext = piece.metadata()?.headingContext()
        if (headingContext != null) {
            for (heading in headingContext.headings()) {
                println("Heading L${heading.level()}: ${heading.text()}")
            }
        }

        // Preview: the first 100 characters of the chunk followed by "...".
        val body = piece.content()
        println("Content: ${body.take(100)}...")
    }
}
```
```kotlin title="Kotlin - Prepend Heading Context"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Markdown chunker with heading-context prepending switched on.
    val chunkingSettings = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withChunkerType(ChunkerType.Markdown)
        .withPrependHeadingContext(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingSettings))
        .build()

    val source = Paths.get("document.md")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    extraction.chunks().orEmpty().forEach { piece ->
        // Each chunk's content is prefixed with its heading breadcrumb.
        val body = piece.content()
        println("Content: ${body.take(100)}...")
    }
}
```

View File

@@ -0,0 +1,15 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Minimal configuration: caching and quality processing both enabled.
    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println(extraction.content())
}
```

View File

@@ -0,0 +1,17 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // In the Java/Kotlin bindings, configuration is assembled explicitly
    // through the builder. This mirrors ExtractionConfig::discover() in Rust:
    // start from the defaults, then override only the fields you need.
    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println(extraction.content())
}
```

View File

@@ -0,0 +1,20 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // OCR through the "tesseract" backend with language "eng".
    val ocrSettings = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrSettings))
        .build()

    val source = Paths.get("scanned.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    val contentLength = extraction.content().length
    println("Content length: $contentLength")

    // A missing table list counts as zero tables.
    val tableCount = extraction.tables()?.size ?: 0
    println("Tables detected: $tableCount")
}
```

View File

@@ -0,0 +1,32 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Tesseract-specific options, attached to the OCR settings below.
    val tesseractSettings = TesseractConfig.builder()
        .withPsm(6)
        .build()

    val ocrSettings = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng+deu")
        .withTesseractConfig(Optional.of(tesseractSettings))
        .build()

    val chunkingSettings = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .build()

    // Combine caching, OCR, chunking and quality processing in one config.
    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .withOcr(Optional.of(ocrSettings))
        .withChunking(Optional.of(chunkingSettings))
        .withEnableQualityProcessing(true)
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    val contentLength = extraction.content().length
    println("Content length: $contentLength")
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Ask the extractor to include the document structure in the result.
    val extractionConfig = ExtractionConfig.builder()
        .withIncludeDocumentStructure(true)
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // The structured document may be absent; print its nodes only when present.
    extraction.document()?.let { structure ->
        for (node in structure.nodes()) {
            println(node)
        }
    }
}
```

View File

@@ -0,0 +1,32 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Configure element-based output (resultFormat controls Unified vs ElementBased).
    val extractionConfig = ExtractionConfig.builder()
        .withResultFormat(ResultFormat.ElementBased)
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // A missing element list is handled as an empty one.
    val items = extraction.elements().orEmpty()
    items.forEach { item ->
        println("Type: ${item.elementType()}")

        val body = item.text()
        println("Text: ${body.take(100)}")

        // The page number is optional; print it only when it is set.
        val page = item.metadata().pageNumber()
        if (page != null) {
            println("Page: $page")
        }

        println("---")
    }

    // Keep only the elements whose type is Title and print their text.
    val titleItems = items.filter { item -> item.elementType() == ElementType.Title }
    titleItems.forEach { titleItem ->
        println("Title: ${titleItem.text()}")
    }
}
```

View File

@@ -0,0 +1,27 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Embedding settings using the "balanced" preset model.
    val embeddingSettings = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withBatchSize(16L)
        .withNormalize(true)
        .withShowDownloadProgress(true)
        .build()

    // The embedding settings are attached to the chunking settings.
    val chunkingSettings = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .withEmbedding(Optional.of(embeddingSettings))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // A missing chunk list counts as zero chunks.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks with embeddings: $chunkCount")
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // HTML rendering options using the GitHub theme.
    val htmlSettings = HtmlOutputConfig.builder()
        .withTheme(HtmlTheme.GitHub)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withOutputFormat(OutputFormat.Html)
        .withHtmlOutput(Optional.of(htmlSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // The content is HTML with kb-* classes.
    println(extraction.content())
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Keyword extraction with the Yake algorithm.
    val ngramRange = listOf(1L, 3L)
    val keywordSettings = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.1f)
        .withNgramRange(ngramRange)
        .withLanguage(Optional.of("en"))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Keywords: ${extraction.extractedKeywords()}")
}
```

View File

@@ -0,0 +1,20 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Language detection enabled, with multiple-language detection turned on.
    val languageSettings = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(languageSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Detected languages: ${extraction.detectedLanguages()}")
}
```

View File

@@ -0,0 +1,23 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Image extraction settings: target DPI 300, automatic DPI adjustment on,
    // with min/max DPI set to 150 and 600.
    val imageSettings = ImageExtractionConfig.builder()
        .withExtractImages(true)
        .withTargetDpi(300)
        .withMaxImageDimension(4096)
        .withAutoAdjustDpi(true)
        .withMinDpi(150)
        .withMaxDpi(600)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withImages(Optional.of(imageSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // A missing image list counts as zero images.
    val imageCount = extraction.images()?.size ?: 0
    println("Extracted images: $imageCount")
}
```

View File

@@ -0,0 +1,26 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    val hierarchySettings = HierarchyConfig.builder()
        .withEnabled(true)
        .build()

    // NOTE: the password below is a placeholder for this example; real
    // credentials should not be hard-coded in source.
    val candidatePasswords = listOf("password123")
    val pdfSettings = PdfConfig.builder()
        .withExtractImages(true)
        .withPasswords(Optional.of(candidatePasswords))
        .withExtractMetadata(true)
        .withHierarchy(Optional.of(hierarchySettings))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withPdfOptions(Optional.of(pdfSettings))
        .build()

    val source = Paths.get("encrypted.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // Print the title and authors from the document metadata.
    println("Title: ${extraction.metadata().title()}")
    println("Authors: ${extraction.metadata().authors()}")
}
```

View File

@@ -0,0 +1,26 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Hierarchy settings: 5 k-clusters, bounding boxes included, and an OCR
    // coverage threshold of 0.8.
    val hierarchySettings = HierarchyConfig.builder()
        .withEnabled(true)
        .withKClusters(5L)
        .withIncludeBbox(true)
        .withOcrCoverageThreshold(Optional.of(0.8f))
        .build()

    val pdfSettings = PdfConfig.builder()
        .withHierarchy(Optional.of(hierarchySettings))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withPdfOptions(Optional.of(pdfSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    // A missing page list is handled as an empty one.
    val pageList = extraction.pages().orEmpty()
    println("Pages: ${pageList.size}")
}
```

View File

@@ -0,0 +1,22 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Names of the processors passed to the post-processing configuration.
    val processorNames = listOf(
        "whitespace_normalizer",
        "unicode_normalizer",
    )

    val postprocessorSettings = PostProcessorConfig.builder()
        .withEnabled(true)
        .withEnabledProcessors(Optional.of(processorNames))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withPostprocessor(Optional.of(postprocessorSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Processed content: ${extraction.content()}")
}
```

View File

@@ -0,0 +1,16 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Quality processing and caching both enabled.
    val extractionConfig = ExtractionConfig.builder()
        .withEnableQualityProcessing(true)
        .withUseCache(true)
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Quality score: ${extraction.qualityScore()}")

    // A missing warning list counts as zero warnings.
    val warningCount = extraction.processingWarnings()?.size ?: 0
    println("Warnings: $warningCount")
}
```

View File

@@ -0,0 +1,26 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Tesseract-specific options (language, PSM and OEM values), attached to
    // the OCR settings below.
    val tesseractSettings = TesseractConfig.builder()
        .withLanguage("eng+deu")
        .withPsm(6)
        .withOem(3)
        .build()

    val ocrSettings = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng+deu")
        .withTesseractConfig(Optional.of(tesseractSettings))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withOcr(Optional.of(ocrSettings))
        .build()

    val source = Paths.get("scanned.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("OCR text: ${extraction.content()}")
}
```

View File

@@ -0,0 +1,19 @@
```kotlin title="Kotlin"
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
fun main() {
    // Token reduction in "moderate" mode with important-word preservation on.
    val tokenReductionSettings = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(tokenReductionSettings))
        .build()

    val source = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(source, null, extractionConfig)

    println("Reduced content: ${extraction.content()}")
}
```