64 lines
1.9 KiB
Markdown
64 lines
1.9 KiB
Markdown
|
|
```kotlin title="Kotlin"
|
||
|
|
import dev.kreuzberg.*
|
||
|
|
import java.nio.file.Paths
|
||
|
|
import java.util.Optional
|
||
|
|
|
||
|
|
/**
 * Example entry point: assembles a fully populated [ExtractionConfig] from its
 * individual sub-configurations, runs a synchronous extraction on
 * `document.pdf`, and prints the extracted content, the detected languages
 * (when present) and the number of chunks.
 */
fun main() {
    // OCR: "tesseract" backend with the "eng" language.
    val ocrSettings =
        OcrConfig
            .builder()
            .withBackend("tesseract")
            .withLanguage("eng")
            .build()

    // Embeddings: the "balanced" preset, batch size 32, normalization switched on.
    val embeddingSettings =
        EmbeddingConfig
            .builder()
            .withModel(EmbeddingModelType.Preset("balanced"))
            .withBatchSize(32L)
            .withNormalize(true)
            .build()

    // Chunking: max-characters 1000, overlap 200, with the embedding settings attached.
    val chunkingSettings =
        ChunkingConfig
            .builder()
            .withMaxCharacters(1000L)
            .withOverlap(200L)
            .withEmbedding(Optional.of(embeddingSettings))
            .build()

    // Language detection: enabled, minimum confidence 0.8, multiple detection off.
    val languageDetectionSettings =
        LanguageDetectionConfig
            .builder()
            .withEnabled(true)
            .withMinConfidence(0.8)
            .withDetectMultiple(false)
            .build()

    // Keywords: Yake algorithm, max 10 keywords, min score 0.1, n-gram range [1, 3], language "en".
    val keywordSettings =
        KeywordConfig
            .builder()
            .withAlgorithm(KeywordAlgorithm.Yake)
            .withMaxKeywords(10L)
            .withMinScore(0.1f)
            .withNgramRange(listOf(1L, 3L))
            .withLanguage(Optional.of("en"))
            .build()

    // Token reduction: "moderate" mode, preserving important words.
    val tokenReductionSettings =
        TokenReductionOptions
            .builder()
            .withMode("moderate")
            .withPreserveImportantWords(true)
            .build()

    // Post-processing: enabled.
    val postProcessorSettings =
        PostProcessorConfig
            .builder()
            .withEnabled(true)
            .build()

    // Top-level configuration: cache and quality processing on, every
    // sub-configuration above supplied wrapped in an Optional.
    val extractionSettings =
        ExtractionConfig
            .builder()
            .withUseCache(true)
            .withEnableQualityProcessing(true)
            .withOcr(Optional.of(ocrSettings))
            .withChunking(Optional.of(chunkingSettings))
            .withLanguageDetection(Optional.of(languageDetectionSettings))
            .withKeywords(Optional.of(keywordSettings))
            .withTokenReduction(Optional.of(tokenReductionSettings))
            .withPostprocessor(Optional.of(postProcessorSettings))
            .build()

    // NOTE(review): the second argument is passed as null; its meaning is not
    // visible in this snippet — confirm against the Kreuzberg API.
    val sourcePath = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(sourcePath, null, extractionSettings)

    println("Content: ${extraction.content()}")

    // Languages are only printed when the result actually carries them.
    val languages = extraction.detectedLanguages()
    if (languages != null) {
        println("Languages: $languages")
    }

    // A missing chunk list is reported as zero chunks.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
|
||
|
|
```
|