This commit is contained in:
63
docs/snippets/kotlin/config/advanced_config.md
Normal file
63
docs/snippets/kotlin/config/advanced_config.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Builds each optional configuration section explicitly, combines them into a
 * single extraction configuration, extracts `document.pdf` synchronously, and
 * prints the content, the detected languages (when present), and the chunk count.
 */
fun main() {
    // OCR section: "tesseract" backend, "eng" language.
    val ocrSettings = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()

    // Embedding section; attached to the chunking section below.
    val embeddingSettings = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withBatchSize(32L)
        .withNormalize(true)
        .build()

    // Chunking section: max 1000 characters, overlap 200, with the embedding settings above.
    val chunkingSettings = ChunkingConfig.builder()
        .withMaxCharacters(1000L)
        .withOverlap(200L)
        .withEmbedding(Optional.of(embeddingSettings))
        .build()

    // Language detection section.
    val languageDetectionSettings = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(false)
        .build()

    // Keyword section.
    val keywordSettings = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.1f)
        .withNgramRange(listOf(1L, 3L))
        .withLanguage(Optional.of("en"))
        .build()

    // Token reduction section.
    val tokenReductionSettings = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()

    // Post-processor section.
    val postProcessorSettings = PostProcessorConfig.builder()
        .withEnabled(true)
        .build()

    // Top-level configuration: every section built above is passed wrapped in an Optional.
    val extractionConfig = ExtractionConfig.builder()
        .withUseCache(true)
        .withEnableQualityProcessing(true)
        .withOcr(Optional.of(ocrSettings))
        .withChunking(Optional.of(chunkingSettings))
        .withLanguageDetection(Optional.of(languageDetectionSettings))
        .withKeywords(Optional.of(keywordSettings))
        .withTokenReduction(Optional.of(tokenReductionSettings))
        .withPostprocessor(Optional.of(postProcessorSettings))
        .build()

    // The second argument is passed as null, exactly as in the original call.
    val documentPath = Paths.get("document.pdf")
    val extraction = Kreuzberg.extractFileSync(documentPath, null, extractionConfig)

    println("Content: ${extraction.content()}")
    // The languages line is printed only when detectedLanguages() is non-null.
    extraction.detectedLanguages()?.let { println("Languages: $it") }
    // A null chunk list is reported as 0.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
|
||||
```
|
||||
Reference in New Issue
Block a user