This commit is contained in:
27
docs/snippets/java/advanced/ChunkPageMapping.md
Normal file
27
docs/snippets/java/advanced/ChunkPageMapping.md
Normal file
@@ -0,0 +1,27 @@
|
||||
import dev.kreuzberg.*;
|
||||
|
||||
var config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.chunkSize(500)
|
||||
.overlap(50)
|
||||
.build())
|
||||
.pages(PageConfig.builder()
|
||||
.extractPages(true)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
var result = Kreuzberg.extractFileSync("document.pdf", config);
|
||||
|
||||
If (result.chunks() != null) {
|
||||
for (var chunk : result.chunks()) {
|
||||
if (chunk.metadata().firstPage() != null) {
|
||||
var pageRange = chunk.metadata().firstPage().equals(chunk.metadata().lastPage())
|
||||
? "Page " + chunk.metadata().firstPage()
|
||||
: "Pages " + chunk.metadata().firstPage() + "-" + chunk.metadata().lastPage();
|
||||
|
||||
System.out.println("Chunk: " + chunk.text().substring(0, 50) +
|
||||
"... (" + pageRange + ")");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
36
docs/snippets/java/advanced/chunk_page_mapping.md
Normal file
36
docs/snippets/java/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.PageConfig;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Optional;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.withChunking(Optional.of(ChunkingConfig.builder()
|
||||
.withMaxCharacters(500L)
|
||||
.withOverlap(50L)
|
||||
.build()))
|
||||
.withPages(Optional.of(PageConfig.builder()
|
||||
.withExtractPages(true)
|
||||
.build()))
|
||||
.build();
|
||||
|
||||
var result = Kreuzberg.extractFileSync(Path.of("document.pdf"), config);
|
||||
|
||||
if (result.chunks() != null) {
|
||||
for (var chunk : result.chunks()) {
|
||||
Long firstPage = chunk.metadata().firstPage();
|
||||
Long lastPage = chunk.metadata().lastPage();
|
||||
if (firstPage != null && lastPage != null) {
|
||||
String pageRange = firstPage.equals(lastPage)
|
||||
? "Page " + firstPage
|
||||
: "Pages " + firstPage + "-" + lastPage;
|
||||
|
||||
String content = chunk.content();
|
||||
String preview = content.substring(0, Math.min(50, content.length()));
|
||||
System.out.println("Chunk: " + preview + "... (" + pageRange + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
18
docs/snippets/java/advanced/chunking_config.md
Normal file
18
docs/snippets/java/advanced/chunking_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1000)
|
||||
.maxOverlap(200)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("all-minilm-l6-v2"))
|
||||
.normalize(true)
|
||||
.batchSize(32)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
35
docs/snippets/java/advanced/chunking_rag.md
Normal file
35
docs/snippets/java/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
import java.util.List;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(500)
|
||||
.maxOverlap(50)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("all-mpnet-base-v2"))
|
||||
.normalize(true)
|
||||
.batchSize(16)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
|
||||
try {
|
||||
ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);
|
||||
|
||||
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();
|
||||
System.out.println("Found " + chunks.size() + " chunks for RAG pipeline");
|
||||
|
||||
for (int i = 0; i < Math.min(3, chunks.size()); i++) {
|
||||
Object chunk = chunks.get(i);
|
||||
System.out.println("Chunk " + i + ": " + chunk.toString().substring(0, Math.min(80, chunk.toString().length())) + "...");
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
System.err.println("RAG extraction failed: " + ex.getMessage());
|
||||
}
|
||||
```
|
||||
38
docs/snippets/java/advanced/embedding_with_chunking.md
Normal file
38
docs/snippets/java/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
import java.util.List;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(512)
|
||||
.maxOverlap(50)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("balanced"))
|
||||
.normalize(true)
|
||||
.batchSize(32)
|
||||
.showDownloadProgress(false)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
|
||||
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();
|
||||
for (int index = 0; index < chunks.size(); index++) {
|
||||
Object chunk = chunks.get(index);
|
||||
String chunkId = "doc_chunk_" + index;
|
||||
System.out.println("Chunk " + chunkId + ": " + chunk.toString().substring(0, Math.min(50, chunk.toString().length())));
|
||||
|
||||
if (chunk instanceof java.util.Map) {
|
||||
Object embedding = ((java.util.Map<String, Object>) chunk).get("embedding");
|
||||
if (embedding != null) {
|
||||
System.out.println(" Embedding dimensions: " + ((float[]) embedding).length);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
15
docs/snippets/java/advanced/keyword_extraction_config.md
Normal file
15
docs/snippets/java/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.KeywordConfig;
|
||||
import dev.kreuzberg.KeywordAlgorithm;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.keywords(KeywordConfig.builder()
|
||||
.algorithm(KeywordAlgorithm.YAKE)
|
||||
.maxKeywords(10)
|
||||
.minScore(0.3)
|
||||
.ngramRange(1, 3)
|
||||
.language("en")
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
30
docs/snippets/java/advanced/keyword_extraction_example.md
Normal file
30
docs/snippets/java/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.KeywordConfig;
|
||||
import dev.kreuzberg.KeywordAlgorithm;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.keywords(KeywordConfig.builder()
|
||||
.algorithm(KeywordAlgorithm.YAKE)
|
||||
.maxKeywords(10)
|
||||
.minScore(0.3)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);
|
||||
|
||||
Map<String, Object> metadata = result.getMetadata() != null ? result.getMetadata() : Map.of();
|
||||
|
||||
if (metadata.containsKey("keywords")) {
|
||||
List<Map<String, Object>> keywords = (List<Map<String, Object>>) metadata.get("keywords");
|
||||
for (Map<String, Object> kw : keywords) {
|
||||
String text = (String) kw.get("text");
|
||||
Double score = ((Number) kw.get("score")).doubleValue();
|
||||
System.out.println(text + ": " + String.format("%.3f", score));
|
||||
}
|
||||
}
|
||||
```
|
||||
13
docs/snippets/java/advanced/language_detection_config.md
Normal file
13
docs/snippets/java/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.LanguageDetectionConfig;
|
||||
import java.math.BigDecimal;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.languageDetection(LanguageDetectionConfig.builder()
|
||||
.enabled(true)
|
||||
.minConfidence(new BigDecimal("0.8"))
|
||||
.detectMultiple(false)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
@@ -0,0 +1,35 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.LanguageDetectionConfig;
|
||||
import java.math.BigDecimal;
|
||||
import java.util.List;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.languageDetection(LanguageDetectionConfig.builder()
|
||||
.enabled(true)
|
||||
.minConfidence(new BigDecimal("0.8"))
|
||||
.detectMultiple(true)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
try {
|
||||
ExtractionResult result = Kreuzberg.extractFile("multilingual_document.pdf", config);
|
||||
|
||||
List<String> languages = result.getDetectedLanguages() != null
|
||||
? result.getDetectedLanguages()
|
||||
: List.of();
|
||||
|
||||
if (!languages.isEmpty()) {
|
||||
System.out.println("Detected " + languages.size() + " language(s): " + String.join(", ", languages));
|
||||
} else {
|
||||
System.out.println("No languages detected");
|
||||
}
|
||||
|
||||
System.out.println("Total content: " + result.getContent().length() + " characters");
|
||||
System.out.println("MIME type: " + result.getMimeType());
|
||||
} catch (Exception ex) {
|
||||
System.err.println("Processing failed: " + ex.getMessage());
|
||||
}
|
||||
```
|
||||
7
docs/snippets/java/advanced/quality_processing_config.md
Normal file
7
docs/snippets/java/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.enableQualityProcessing(true)
|
||||
.build();
|
||||
```
|
||||
21
docs/snippets/java/advanced/quality_processing_example.md
Normal file
21
docs/snippets/java/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.util.Map;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.enableQualityProcessing(true)
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned_document.pdf", config);
|
||||
|
||||
double qualityScore = result.getQualityScore() != null ? result.getQualityScore() : 0.0;
|
||||
|
||||
if (qualityScore < 0.5) {
|
||||
System.out.println(String.format("Warning: Low quality extraction (%.2f)", qualityScore));
|
||||
System.out.println("Consider re-scanning with higher DPI or adjusting OCR settings");
|
||||
} else {
|
||||
System.out.println(String.format("Quality score: %.2f", qualityScore));
|
||||
}
|
||||
```
|
||||
13
docs/snippets/java/advanced/token_reduction_config.md
Normal file
13
docs/snippets/java/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.TokenReductionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.tokenReduction(TokenReductionConfig.builder()
|
||||
.mode("moderate")
|
||||
.preserveMarkdown(true)
|
||||
.preserveCode(true)
|
||||
.languageHint("eng")
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
33
docs/snippets/java/advanced/token_reduction_example.md
Normal file
33
docs/snippets/java/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.TokenReductionConfig;
|
||||
import java.util.Map;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.tokenReduction(TokenReductionConfig.builder()
|
||||
.mode("moderate")
|
||||
.preserveMarkdown(true)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("verbose_document.pdf", config);
|
||||
|
||||
Map<String, Object> metadata = result.getMetadata() != null ? result.getMetadata() : Map.of();
|
||||
|
||||
int original = metadata.containsKey("original_token_count")
|
||||
? ((Number) metadata.get("original_token_count")).intValue()
|
||||
: 0;
|
||||
|
||||
int reduced = metadata.containsKey("token_count")
|
||||
? ((Number) metadata.get("token_count")).intValue()
|
||||
: 0;
|
||||
|
||||
double ratio = metadata.containsKey("token_reduction_ratio")
|
||||
? ((Number) metadata.get("token_reduction_ratio")).doubleValue()
|
||||
: 0.0;
|
||||
|
||||
System.out.println("Reduced from " + original + " to " + reduced + " tokens");
|
||||
System.out.println(String.format("Reduction: %.1f%%", ratio * 100));
|
||||
```
|
||||
67
docs/snippets/java/advanced/vector_database_integration.md
Normal file
67
docs/snippets/java/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,67 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class VectorDatabaseIntegration {
|
||||
public static class VectorRecord {
|
||||
public String id;
|
||||
public float[] embedding;
|
||||
public String content;
|
||||
public Map<String, String> metadata;
|
||||
}
|
||||
|
||||
public static List<VectorRecord> extractAndVectorize(String documentPath, String documentId) throws Exception {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(512)
|
||||
.maxOverlap(50)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("balanced"))
|
||||
.normalize(true)
|
||||
.batchSize(32)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile(documentPath, config);
|
||||
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();
|
||||
|
||||
List<VectorRecord> vectorRecords = new java.util.ArrayList<>();
|
||||
for (int index = 0; index < chunks.size(); index++) {
|
||||
VectorRecord record = new VectorRecord();
|
||||
record.id = documentId + "_chunk_" + index;
|
||||
record.metadata = new HashMap<>();
|
||||
record.metadata.put("document_id", documentId);
|
||||
record.metadata.put("chunk_index", String.valueOf(index));
|
||||
|
||||
if (chunk instanceof java.util.Map) {
|
||||
Map<String, Object> chunkMap = (Map<String, Object>) chunks.get(index);
|
||||
record.content = (String) chunkMap.get("content");
|
||||
record.embedding = (float[]) chunkMap.get("embedding");
|
||||
record.metadata.put("content_length", String.valueOf(record.content.length()));
|
||||
}
|
||||
|
||||
vectorRecords.add(record);
|
||||
}
|
||||
|
||||
storeInVectorDatabase(vectorRecords);
|
||||
return vectorRecords;
|
||||
}
|
||||
|
||||
private static void storeInVectorDatabase(List<VectorRecord> records) {
|
||||
for (VectorRecord record : records) {
|
||||
if (record.embedding != null && record.embedding.length > 0) {
|
||||
System.out.println("Storing " + record.id + ": " + record.content.length()
|
||||
+ " chars, " + record.embedding.length + " dims");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user