This commit is contained in:
19
docs/snippets/kotlin/utils/chunking.md
Normal file
19
docs/snippets/kotlin/utils/chunking.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
fun main() {
    // Chunking settings: max characters 1500, overlap 200.
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1500L)
        .withOverlap(200L)
        .build()

    // Attach the chunking settings to the extraction configuration.
    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)

    // Falls back to 0 when the result carries no chunk list.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks: $chunkCount")
}
|
||||
```
|
||||
35
docs/snippets/kotlin/utils/chunking_rag.md
Normal file
35
docs/snippets/kotlin/utils/chunking_rag.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
fun main() {
    // Embedding settings: the "balanced" preset with normalization switched on.
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .build()

    // Chunking settings: max characters 500, overlap 50, embeddings attached.
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(500L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, extractionConfig)

    // A missing chunk list is treated as empty, so nothing is printed in that case.
    extraction.chunks().orEmpty().forEach { chunk ->
        val info = chunk.metadata()
        // chunkIndex() is shifted by one for display.
        println("Chunk ${info.chunkIndex() + 1}/${info.totalChunks()}")
        println("Position: ${info.byteStart()}-${info.byteEnd()}")

        // Show at most the first 100 characters of the chunk text.
        val snippet = chunk.content().take(100)
        println("Content: $snippet...")

        // Only chunks that actually carry an embedding report its size.
        val vector = chunk.embedding()
        if (vector != null) {
            println("Embedding: ${vector.size} dimensions")
        }
    }
}
|
||||
```
|
||||
27
docs/snippets/kotlin/utils/embedding_with_chunking.md
Normal file
27
docs/snippets/kotlin/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
fun main() {
    // Embedding settings: "balanced" preset, normalization on, batch size 32,
    // download progress display off.
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .withShowDownloadProgress(false)
        .build()

    // Chunking settings: max characters 1024, overlap 100, embeddings attached.
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(1024L)
        .withOverlap(100L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)

    // Falls back to 0 when the result carries no chunk list.
    val chunkCount = extraction.chunks()?.size ?: 0
    println("Chunks with embeddings: $chunkCount")
}
|
||||
```
|
||||
22
docs/snippets/kotlin/utils/keyword_extraction_example.md
Normal file
22
docs/snippets/kotlin/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
fun main() {
    // Keyword settings: Yake algorithm, max keywords 10, min score 0.3.
    val keywordConfig = KeywordConfig.builder()
        .withAlgorithm(KeywordAlgorithm.Yake)
        .withMaxKeywords(10L)
        .withMinScore(0.3f)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withKeywords(Optional.of(keywordConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("research_paper.pdf"), null, extractionConfig)

    // Print only when the result actually contains extracted keywords.
    val extracted = extraction.extractedKeywords()
    if (extracted != null) {
        println("Keywords: $extracted")
    }
}
|
||||
```
|
||||
22
docs/snippets/kotlin/utils/quality_processing_example.md
Normal file
22
docs/snippets/kotlin/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
fun main() {
    val extractionConfig = ExtractionConfig.builder()
        .withEnableQualityProcessing(true)
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned_document.pdf"), null, extractionConfig)

    // Nothing is printed when the result has no quality score.
    val quality = extraction.qualityScore() ?: return

    // Scores below 0.5 get the warning wording; everything else is reported plainly.
    val template = if (quality < 0.5) {
        "Warning: Low quality extraction (%.2f)"
    } else {
        "Quality score: %.2f"
    }
    println(template.format(quality))
}
|
||||
```
|
||||
17
docs/snippets/kotlin/utils/standalone_embed.md
Normal file
17
docs/snippets/kotlin/utils/standalone_embed.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.util.Optional
|
||||
|
||||
fun main() {
    // Embedding settings: the "balanced" preset with normalization switched on.
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .build()

    val inputs = listOf("Hello, world!", "Kreuzberg is fast")
    val vectors = Kreuzberg.embedTexts(inputs, embeddingConfig)

    println("Texts embedded: ${vectors.size}")

    // Reads the size of the first vector; this relies on at least one vector being returned.
    val dimensions = vectors[0].size
    println("Dimensions: $dimensions")
}
|
||||
```
|
||||
19
docs/snippets/kotlin/utils/token_reduction.md
Normal file
19
docs/snippets/kotlin/utils/token_reduction.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
fun main() {
    // Token reduction settings: "moderate" mode, preserving important words.
    val reductionOptions = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(reductionOptions))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)

    // Print the content returned by the extraction.
    val extractedContent = extraction.content()
    println(extractedContent)
}
|
||||
```
|
||||
19
docs/snippets/kotlin/utils/token_reduction_example.md
Normal file
19
docs/snippets/kotlin/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
fun main() {
    // Token reduction settings: "moderate" mode, preserving important words.
    val reductionOptions = TokenReductionOptions.builder()
        .withMode("moderate")
        .withPreserveImportantWords(true)
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withTokenReduction(Optional.of(reductionOptions))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("verbose_document.pdf"), null, extractionConfig)

    // Report the length of the returned content.
    val contentLength = extraction.content().length
    println("Reduced content length: $contentLength")
}
|
||||
```
|
||||
52
docs/snippets/kotlin/utils/vector_database_integration.md
Normal file
52
docs/snippets/kotlin/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Immutable value pairing a piece of text with its embedding vector and string metadata.
 *
 * @property id identifier of the record.
 * @property content text the embedding belongs to.
 * @property embedding embedding vector as a list of floats.
 * @property metadata string key/value attributes attached to the record.
 */
data class VectorRecord(
    val id: String,
    val content: String,
    val embedding: List<Float>,
    val metadata: Map<String, String>,
)
|
||||
|
||||
/**
 * Extracts the document at [documentPath] with chunking and embeddings configured, then
 * turns every chunk that carries an embedding into a [VectorRecord].
 *
 * @param documentPath path of the file handed to the extractor.
 * @param documentId identifier used in each record id and in the `document_id` metadata entry.
 * @return one record per chunk with a non-null embedding, in chunk order; chunks without an
 *   embedding are skipped, and a missing chunk list yields an empty result.
 */
fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
    // Embedding settings: "balanced" preset, normalization on, batch size 32.
    val embeddingConfig = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .build()

    // Chunking settings: max characters 512, overlap 50, embeddings attached.
    val chunkingConfig = ChunkingConfig.builder()
        .withMaxCharacters(512L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embeddingConfig))
        .build()

    val extractionConfig = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingConfig))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get(documentPath), null, extractionConfig)

    // `position` is the chunk's index in the full chunk list, so record ids keep gaps
    // wherever a chunk without an embedding was skipped.
    return extraction.chunks().orEmpty().mapIndexedNotNull { position, chunk ->
        chunk.embedding()?.let { vector ->
            val attributes = mapOf(
                "document_id" to documentId,
                "chunk_index" to position.toString(),
                "content_length" to chunk.content().length.toString(),
            )
            VectorRecord(
                id = "${documentId}_chunk_$position",
                content = chunk.content(),
                embedding = vector,
                metadata = attributes,
            )
        }
    }
}
|
||||
```
|
||||
Reference in New Issue
Block a user