This commit is contained in:
20
docs/snippets/kotlin/metadata/language_detection.md
Normal file
20
docs/snippets/kotlin/metadata/language_detection.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with single-language detection enabled and prints
 * the languages that were detected.
 */
fun main() {
    val languageDetection = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.9)
        .withDetectMultiple(false)
        .build()

    val config = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(languageDetection))
        .build()

    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)

    // Fall back to an empty list when no languages are reported, so the output
    // never reads "Detected languages: null".
    val detected = result.detectedLanguages() ?: emptyList()
    println("Detected languages: $detected")
}
|
||||
```
|
||||
@@ -0,0 +1,25 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `multilingual_document.pdf` with multi-language detection enabled,
 * then prints the detected languages as a list and one per line.
 */
fun main() {
    val detectionSettings = LanguageDetectionConfig.builder()
        .withEnabled(true)
        .withMinConfidence(0.8)
        .withDetectMultiple(true)
        .build()

    val extractionSettings = ExtractionConfig.builder()
        .withLanguageDetection(Optional.of(detectionSettings))
        .build()

    val extraction = Kreuzberg.extractFileSync(
        Paths.get("multilingual_document.pdf"),
        null,
        extractionSettings,
    )

    // A missing result is treated the same as "nothing detected".
    val languages = extraction.detectedLanguages() ?: emptyList()
    println("Detected languages: $languages")
    languages.forEach { println(" - $it") }
}
|
||||
```
|
||||
60
docs/snippets/kotlin/metadata/metadata.md
Normal file
60
docs/snippets/kotlin/metadata/metadata.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Prints general and PDF-specific metadata for `document.pdf`, then prints the
 * HTML-specific metadata (Open Graph, Twitter Card, headers, links, images,
 * structured data) for `page.html`.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder().build()

    val pdfResult = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)
    val documentMetadata = pdfResult.metadata()

    documentMetadata.title()?.let { title -> println("Title: $title") }
    documentMetadata.authors()?.let { authors -> println("Authors: ${authors.joinToString(", ")}") }

    // Format-specific metadata is reached through the format union; only the PDF variant is read here.
    val pdfMetadata = documentMetadata.format()?.pdf()
    if (pdfMetadata != null) {
        pdfMetadata.pageCount()?.let { count -> println("Pages: $count") }
        pdfMetadata.producer()?.let { producer -> println("Producer: $producer") }
        pdfMetadata.pdfVersion()?.let { version -> println("PDF Version: $version") }
    }

    // HTML metadata: nothing else follows, so stop here when the HTML variant is absent.
    val htmlResult = Kreuzberg.extractFileSync(Paths.get("page.html"), null, extractionConfig)
    val htmlMetadata = htmlResult.metadata().format()?.html() ?: return

    htmlMetadata.title()?.let { title -> println("Title: $title") }
    htmlMetadata.description()?.let { description -> println("Description: $description") }
    htmlMetadata.canonicalUrl()?.let { url -> println("Canonical URL: $url") }
    htmlMetadata.language()?.let { language -> println("Language: $language") }

    // Keywords list
    println("Keywords: ${htmlMetadata.keywords()}")

    // Open Graph fields are exposed as a Map<String, String>
    val openGraph = htmlMetadata.openGraph()
    openGraph["image"]?.let { image -> println("Open Graph Image: $image") }
    openGraph["title"]?.let { title -> println("Open Graph Title: $title") }

    // Twitter Card fields as a Map<String, String>
    htmlMetadata.twitterCard()["card"]?.let { card -> println("Twitter Card Type: $card") }

    // Headers
    htmlMetadata.headers().forEach { header ->
        println("Header (level ${header.level()}): ${header.text()}")
    }

    // Links
    htmlMetadata.links().forEach { link ->
        println("Link: ${link.href()} (${link.text()})")
    }

    // Images
    htmlMetadata.images().forEach { image ->
        println("Image: ${image.src()}")
    }

    // Structured data: only reported when at least one item is present
    val structuredData = htmlMetadata.structuredData()
    if (structuredData.isNotEmpty()) {
        println("Structured data items: ${structuredData.size}")
    }
}
|
||||
```
|
||||
24
docs/snippets/kotlin/metadata/page_boundaries.md
Normal file
24
docs/snippets/kotlin/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/**
 * Extracts `document.pdf` and prints the byte range and a short text preview
 * for the first three page boundaries.
 *
 * Returns silently when the result carries no page metadata or no boundaries.
 */
fun main() {
    val config = ExtractionConfig.builder().build()
    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)

    val pages = result.metadata().pages() ?: return
    val boundaries = pages.boundaries() ?: return

    // byteStart()/byteEnd() are byte offsets, while String.substring() indexes
    // UTF-16 code units. Slicing the String directly returns the wrong text (or
    // throws) as soon as the content contains a non-ASCII character, so slice
    // the encoded bytes and decode each page instead.
    // NOTE(review): assumes the offsets refer to the UTF-8 encoding of content() — confirm against the Kreuzberg API docs.
    val contentBytes = result.content().toByteArray(Charsets.UTF_8)

    for (boundary in boundaries.take(3)) {
        val start = boundary.byteStart().toInt()
        val end = boundary.byteEnd().toInt()
        val pageText = contentBytes.decodeToString(startIndex = start, endIndex = end)

        println("Page ${boundary.pageNumber()}:")
        println(" Byte range: $start-$end")
        // take() caps the preview at 100 characters without manual bounds math.
        println(" Preview: ${pageText.take(100)}...")
    }
}
|
||||
```
|
||||
25
docs/snippets/kotlin/metadata/page_tracking_basic.md
Normal file
25
docs/snippets/kotlin/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts `document.pdf` with per-page extraction enabled and prints, for each
 * page, its number, content length, table count and image count.
 */
fun main() {
    val pageOptions = PageConfig.builder()
        .withExtractPages(true)
        .build()

    val extractionOptions = ExtractionConfig.builder()
        .withPages(Optional.of(pageOptions))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionOptions)

    // Nothing is printed when the result carries no page list.
    extraction.pages()?.forEach { page ->
        println("Page ${page.pageNumber()}:")
        println(" Content: ${page.content().length} chars")
        println(" Tables: ${page.tables().size}")
        println(" Images: ${page.images().size}")
    }
}
|
||||
```
|
||||
19
docs/snippets/kotlin/metadata/tables.md
Normal file
19
docs/snippets/kotlin/metadata/tables.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
|
||||
/**
 * Extracts `document.pdf` and, for every table found, prints a summary line,
 * the table's Markdown rendering, and each row of cells.
 */
fun main() {
    val extractionConfig = ExtractionConfig.builder().build()
    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, extractionConfig)

    // A missing table list is handled the same as an empty one: nothing is printed.
    extraction.tables()?.forEach { table ->
        println("Table on page ${table.pageNumber()} with ${table.cells().size} rows")
        println(table.markdown())

        table.cells().forEach { row -> println(row) }
    }
}
|
||||
```
|
||||
57
docs/snippets/kotlin/metadata/vector_database_integration.md
Normal file
57
docs/snippets/kotlin/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,57 @@
|
||||
```kotlin title="Kotlin"
|
||||
import dev.kreuzberg.*
|
||||
import java.nio.file.Paths
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * One entry ready to be written to a vector store.
 *
 * @property id identifier of the record
 * @property content text the embedding belongs to
 * @property embedding embedding vector for [content]
 * @property metadata string key/value pairs stored alongside the vector
 */
data class VectorRecord(
    val id: String,
    val content: String,
    val embedding: List<Float>,
    val metadata: Map<String, String>,
)
|
||||
|
||||
/**
 * Extracts the document at [documentPath] with chunking and embeddings enabled
 * and converts every chunk that has an embedding into a [VectorRecord].
 *
 * @param documentPath path of the file to extract
 * @param documentId identifier used in each record's id and metadata
 * @return one record per embedded chunk, or an empty list when the result has no chunks
 */
fun extractAndVectorize(documentPath: String, documentId: String): List<VectorRecord> {
    val embeddingSettings = EmbeddingConfig.builder()
        .withModel(EmbeddingModelType.Preset("balanced"))
        .withNormalize(true)
        .withBatchSize(32L)
        .build()

    val chunkingSettings = ChunkingConfig.builder()
        .withMaxCharacters(512L)
        .withOverlap(50L)
        .withEmbedding(Optional.of(embeddingSettings))
        .build()

    val extractionSettings = ExtractionConfig.builder()
        .withChunking(Optional.of(chunkingSettings))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get(documentPath), null, extractionSettings)

    // Chunks without an embedding are dropped; the index is the chunk's position
    // in the full chunk list, so skipped chunks leave gaps in the numbering.
    return extraction.chunks()?.mapIndexedNotNull { position, chunk ->
        chunk.embedding()?.let { vector ->
            VectorRecord(
                id = "${documentId}_chunk_$position",
                content = chunk.content(),
                embedding = vector,
                metadata = mapOf(
                    "document_id" to documentId,
                    "chunk_index" to position.toString(),
                    "content_length" to chunk.content().length.toString(),
                ),
            )
        }
    } ?: emptyList()
}
|
||||
|
||||
/**
 * Vectorizes `document.pdf` under the id `doc-001` and reports how many vector
 * records were produced.
 */
fun main() {
    val vectorRecords = extractAndVectorize(documentPath = "document.pdf", documentId = "doc-001")
    val recordCount = vectorRecords.size
    println("Generated $recordCount vector records")
}
|
||||
```
|
||||
Reference in New Issue
Block a user