This commit is contained in:
27
docs/snippets/java/advanced/ChunkPageMapping.md
Normal file
27
docs/snippets/java/advanced/ChunkPageMapping.md
Normal file
@@ -0,0 +1,27 @@
|
||||
import dev.kreuzberg.*;
|
||||
|
||||
// Configure chunking (size 500, overlap 50) together with page extraction.
var config = ExtractionConfig.builder()
    .chunking(ChunkingConfig.builder()
        .chunkSize(500)
        .overlap(50)
        .build())
    .pages(PageConfig.builder()
        .extractPages(true)
        .build())
    .build();

var result = Kreuzberg.extractFileSync("document.pdf", config);

if (result.chunks() != null) {
    for (var chunk : result.chunks()) {
        // Only chunks whose metadata carries a first page are reported.
        if (chunk.metadata().firstPage() != null) {
            var pageRange = chunk.metadata().firstPage().equals(chunk.metadata().lastPage())
                ? "Page " + chunk.metadata().firstPage()
                : "Pages " + chunk.metadata().firstPage() + "-" + chunk.metadata().lastPage();

            // Clamp the preview length: substring(0, 50) throws for chunks shorter than 50 characters.
            var text = chunk.text();
            var preview = text.substring(0, Math.min(50, text.length()));
            System.out.println("Chunk: " + preview +
                "... (" + pageRange + ")");
        }
    }
}
|
||||
36
docs/snippets/java/advanced/chunk_page_mapping.md
Normal file
36
docs/snippets/java/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.PageConfig;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Optional;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.withChunking(Optional.of(ChunkingConfig.builder()
|
||||
.withMaxCharacters(500L)
|
||||
.withOverlap(50L)
|
||||
.build()))
|
||||
.withPages(Optional.of(PageConfig.builder()
|
||||
.withExtractPages(true)
|
||||
.build()))
|
||||
.build();
|
||||
|
||||
var result = Kreuzberg.extractFileSync(Path.of("document.pdf"), config);
|
||||
|
||||
if (result.chunks() != null) {
|
||||
for (var chunk : result.chunks()) {
|
||||
Long firstPage = chunk.metadata().firstPage();
|
||||
Long lastPage = chunk.metadata().lastPage();
|
||||
if (firstPage != null && lastPage != null) {
|
||||
String pageRange = firstPage.equals(lastPage)
|
||||
? "Page " + firstPage
|
||||
: "Pages " + firstPage + "-" + lastPage;
|
||||
|
||||
String content = chunk.content();
|
||||
String preview = content.substring(0, Math.min(50, content.length()));
|
||||
System.out.println("Chunk: " + preview + "... (" + pageRange + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
18
docs/snippets/java/advanced/chunking_config.md
Normal file
18
docs/snippets/java/advanced/chunking_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1000)
|
||||
.maxOverlap(200)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("all-minilm-l6-v2"))
|
||||
.normalize(true)
|
||||
.batchSize(32)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
35
docs/snippets/java/advanced/chunking_rag.md
Normal file
35
docs/snippets/java/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
import java.util.List;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(500)
|
||||
.maxOverlap(50)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("all-mpnet-base-v2"))
|
||||
.normalize(true)
|
||||
.batchSize(16)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
|
||||
try {
|
||||
ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);
|
||||
|
||||
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();
|
||||
System.out.println("Found " + chunks.size() + " chunks for RAG pipeline");
|
||||
|
||||
for (int i = 0; i < Math.min(3, chunks.size()); i++) {
|
||||
Object chunk = chunks.get(i);
|
||||
System.out.println("Chunk " + i + ": " + chunk.toString().substring(0, Math.min(80, chunk.toString().length())) + "...");
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
System.err.println("RAG extraction failed: " + ex.getMessage());
|
||||
}
|
||||
```
|
||||
38
docs/snippets/java/advanced/embedding_with_chunking.md
Normal file
38
docs/snippets/java/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
import java.util.List;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(512)
|
||||
.maxOverlap(50)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("balanced"))
|
||||
.normalize(true)
|
||||
.batchSize(32)
|
||||
.showDownloadProgress(false)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
|
||||
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();
|
||||
for (int index = 0; index < chunks.size(); index++) {
|
||||
Object chunk = chunks.get(index);
|
||||
String chunkId = "doc_chunk_" + index;
|
||||
System.out.println("Chunk " + chunkId + ": " + chunk.toString().substring(0, Math.min(50, chunk.toString().length())));
|
||||
|
||||
if (chunk instanceof java.util.Map) {
|
||||
Object embedding = ((java.util.Map<String, Object>) chunk).get("embedding");
|
||||
if (embedding != null) {
|
||||
System.out.println(" Embedding dimensions: " + ((float[]) embedding).length);
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
15
docs/snippets/java/advanced/keyword_extraction_config.md
Normal file
15
docs/snippets/java/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.KeywordConfig;
|
||||
import dev.kreuzberg.KeywordAlgorithm;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.keywords(KeywordConfig.builder()
|
||||
.algorithm(KeywordAlgorithm.YAKE)
|
||||
.maxKeywords(10)
|
||||
.minScore(0.3)
|
||||
.ngramRange(1, 3)
|
||||
.language("en")
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
30
docs/snippets/java/advanced/keyword_extraction_example.md
Normal file
30
docs/snippets/java/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.KeywordConfig;
|
||||
import dev.kreuzberg.KeywordAlgorithm;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.keywords(KeywordConfig.builder()
|
||||
.algorithm(KeywordAlgorithm.YAKE)
|
||||
.maxKeywords(10)
|
||||
.minScore(0.3)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);
|
||||
|
||||
Map<String, Object> metadata = result.getMetadata() != null ? result.getMetadata() : Map.of();
|
||||
|
||||
if (metadata.containsKey("keywords")) {
|
||||
List<Map<String, Object>> keywords = (List<Map<String, Object>>) metadata.get("keywords");
|
||||
for (Map<String, Object> kw : keywords) {
|
||||
String text = (String) kw.get("text");
|
||||
Double score = ((Number) kw.get("score")).doubleValue();
|
||||
System.out.println(text + ": " + String.format("%.3f", score));
|
||||
}
|
||||
}
|
||||
```
|
||||
13
docs/snippets/java/advanced/language_detection_config.md
Normal file
13
docs/snippets/java/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.LanguageDetectionConfig;
|
||||
import java.math.BigDecimal;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.languageDetection(LanguageDetectionConfig.builder()
|
||||
.enabled(true)
|
||||
.minConfidence(new BigDecimal("0.8"))
|
||||
.detectMultiple(false)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
@@ -0,0 +1,35 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.LanguageDetectionConfig;
|
||||
import java.math.BigDecimal;
|
||||
import java.util.List;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.languageDetection(LanguageDetectionConfig.builder()
|
||||
.enabled(true)
|
||||
.minConfidence(new BigDecimal("0.8"))
|
||||
.detectMultiple(true)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
try {
|
||||
ExtractionResult result = Kreuzberg.extractFile("multilingual_document.pdf", config);
|
||||
|
||||
List<String> languages = result.getDetectedLanguages() != null
|
||||
? result.getDetectedLanguages()
|
||||
: List.of();
|
||||
|
||||
if (!languages.isEmpty()) {
|
||||
System.out.println("Detected " + languages.size() + " language(s): " + String.join(", ", languages));
|
||||
} else {
|
||||
System.out.println("No languages detected");
|
||||
}
|
||||
|
||||
System.out.println("Total content: " + result.getContent().length() + " characters");
|
||||
System.out.println("MIME type: " + result.getMimeType());
|
||||
} catch (Exception ex) {
|
||||
System.err.println("Processing failed: " + ex.getMessage());
|
||||
}
|
||||
```
|
||||
7
docs/snippets/java/advanced/quality_processing_config.md
Normal file
7
docs/snippets/java/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.enableQualityProcessing(true)
|
||||
.build();
|
||||
```
|
||||
21
docs/snippets/java/advanced/quality_processing_example.md
Normal file
21
docs/snippets/java/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.util.Map;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.enableQualityProcessing(true)
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned_document.pdf", config);
|
||||
|
||||
double qualityScore = result.getQualityScore() != null ? result.getQualityScore() : 0.0;
|
||||
|
||||
if (qualityScore < 0.5) {
|
||||
System.out.println(String.format("Warning: Low quality extraction (%.2f)", qualityScore));
|
||||
System.out.println("Consider re-scanning with higher DPI or adjusting OCR settings");
|
||||
} else {
|
||||
System.out.println(String.format("Quality score: %.2f", qualityScore));
|
||||
}
|
||||
```
|
||||
13
docs/snippets/java/advanced/token_reduction_config.md
Normal file
13
docs/snippets/java/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.TokenReductionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.tokenReduction(TokenReductionConfig.builder()
|
||||
.mode("moderate")
|
||||
.preserveMarkdown(true)
|
||||
.preserveCode(true)
|
||||
.languageHint("eng")
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
33
docs/snippets/java/advanced/token_reduction_example.md
Normal file
33
docs/snippets/java/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.TokenReductionConfig;
|
||||
import java.util.Map;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.tokenReduction(TokenReductionConfig.builder()
|
||||
.mode("moderate")
|
||||
.preserveMarkdown(true)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("verbose_document.pdf", config);
|
||||
|
||||
Map<String, Object> metadata = result.getMetadata() != null ? result.getMetadata() : Map.of();
|
||||
|
||||
int original = metadata.containsKey("original_token_count")
|
||||
? ((Number) metadata.get("original_token_count")).intValue()
|
||||
: 0;
|
||||
|
||||
int reduced = metadata.containsKey("token_count")
|
||||
? ((Number) metadata.get("token_count")).intValue()
|
||||
: 0;
|
||||
|
||||
double ratio = metadata.containsKey("token_reduction_ratio")
|
||||
? ((Number) metadata.get("token_reduction_ratio")).doubleValue()
|
||||
: 0.0;
|
||||
|
||||
System.out.println("Reduced from " + original + " to " + reduced + " tokens");
|
||||
System.out.println(String.format("Reduction: %.1f%%", ratio * 100));
|
||||
```
|
||||
67
docs/snippets/java/advanced/vector_database_integration.md
Normal file
67
docs/snippets/java/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,67 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class VectorDatabaseIntegration {
|
||||
public static class VectorRecord {
|
||||
public String id;
|
||||
public float[] embedding;
|
||||
public String content;
|
||||
public Map<String, String> metadata;
|
||||
}
|
||||
|
||||
public static List<VectorRecord> extractAndVectorize(String documentPath, String documentId) throws Exception {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(512)
|
||||
.maxOverlap(50)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("balanced"))
|
||||
.normalize(true)
|
||||
.batchSize(32)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile(documentPath, config);
|
||||
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();
|
||||
|
||||
List<VectorRecord> vectorRecords = new java.util.ArrayList<>();
|
||||
for (int index = 0; index < chunks.size(); index++) {
|
||||
VectorRecord record = new VectorRecord();
|
||||
record.id = documentId + "_chunk_" + index;
|
||||
record.metadata = new HashMap<>();
|
||||
record.metadata.put("document_id", documentId);
|
||||
record.metadata.put("chunk_index", String.valueOf(index));
|
||||
|
||||
if (chunk instanceof java.util.Map) {
|
||||
Map<String, Object> chunkMap = (Map<String, Object>) chunks.get(index);
|
||||
record.content = (String) chunkMap.get("content");
|
||||
record.embedding = (float[]) chunkMap.get("embedding");
|
||||
record.metadata.put("content_length", String.valueOf(record.content.length()));
|
||||
}
|
||||
|
||||
vectorRecords.add(record);
|
||||
}
|
||||
|
||||
storeInVectorDatabase(vectorRecords);
|
||||
return vectorRecords;
|
||||
}
|
||||
|
||||
private static void storeInVectorDatabase(List<VectorRecord> records) {
|
||||
for (VectorRecord record : records) {
|
||||
if (record.embedding != null && record.embedding.length > 0) {
|
||||
System.out.println("Storing " + record.id + ": " + record.content.length()
|
||||
+ " chars, " + record.embedding.length + " dims");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
22
docs/snippets/java/api/batch_extract_bytes_sync.md
Normal file
22
docs/snippets/java/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.BatchBytesItem;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
byte[] doc1 = Files.readAllBytes(Paths.get("doc1.pdf"));
|
||||
byte[] doc2 = Files.readAllBytes(Paths.get("doc2.docx"));
|
||||
|
||||
List<BatchBytesItem> items = Arrays.asList(
|
||||
new BatchBytesItem(doc1, "application/pdf", null),
|
||||
new BatchBytesItem(doc2, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", null)
|
||||
);
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder().build();
|
||||
List<ExtractionResult> results = Kreuzberg.batchExtractBytesSync(items, config);
|
||||
System.out.println("Processed " + results.size() + " documents");
|
||||
```
|
||||
22
docs/snippets/java/api/batch_extract_files_sync.md
Normal file
22
docs/snippets/java/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.BatchFileItem;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
List<BatchFileItem> items = Arrays.asList(
|
||||
new BatchFileItem(Paths.get("doc1.pdf"), null),
|
||||
new BatchFileItem(Paths.get("doc2.docx"), null),
|
||||
new BatchFileItem(Paths.get("doc3.pptx"), null)
|
||||
);
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder().build();
|
||||
List<ExtractionResult> results = Kreuzberg.batchExtractFilesSync(items, config);
|
||||
|
||||
for (ExtractionResult result : results) {
|
||||
System.out.println("Content length: " + result.content().length());
|
||||
}
|
||||
```
|
||||
30
docs/snippets/java/api/client_chunk_text.md
Normal file
30
docs/snippets/java/api/client_chunk_text.md
Normal file
@@ -0,0 +1,30 @@
|
||||
<!-- snippet:skip -->
|
||||
```java title="Java"
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.net.URI;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import java.util.List;
|
||||
|
||||
record ChunkRequest(String text, @JsonProperty("chunker_type") String chunkerType, ChunkConfig config) {}
|
||||
record ChunkConfig(@JsonProperty("max_characters") int maxCharacters, int overlap, boolean trim) {}
|
||||
record ChunkItem(String content, @JsonProperty("byte_start") int byteStart, @JsonProperty("chunk_index") int chunkIndex) {}
|
||||
|
||||
HttpClient client = HttpClient.newHttpClient();
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
ChunkRequest req = new ChunkRequest("Your long text here...", "text", new ChunkConfig(1000, 50, true));
|
||||
String json = mapper.writeValueAsString(req);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(URI.create("http://localhost:8000/chunk"))
|
||||
.header("Content-Type", "application/json")
|
||||
.POST(HttpRequest.BodyPublishers.ofString(json))
|
||||
.build();
|
||||
|
||||
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
|
||||
var result = mapper.readTree(response.body());
|
||||
System.out.println("Created " + result.get("chunk_count").asInt() + " chunks");
|
||||
```
|
||||
22
docs/snippets/java/api/client_extract_single_file.md
Normal file
22
docs/snippets/java/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```java title="Java"
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.net.URI;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
HttpClient client = HttpClient.newHttpClient();
|
||||
|
||||
try (var fileStream = Files.newInputStream(Paths.get("document.pdf"))) {
|
||||
byte[] content = fileStream.readAllBytes();
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(URI.create("http://localhost:8000/extract"))
|
||||
.header("Content-Type", "application/octet-stream")
|
||||
.POST(HttpRequest.BodyPublishers.ofByteArray(content))
|
||||
.build();
|
||||
|
||||
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
|
||||
System.out.println(response.body());
|
||||
}
|
||||
```
|
||||
28
docs/snippets/java/api/combining_all_features.md
Normal file
28
docs/snippets/java/api/combining_all_features.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.*;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Optional;
|
||||
|
||||
// Combine OCR, chunking and quality processing in a single extraction configuration.
ExtractionConfig config = ExtractionConfig.builder()
    .withOcr(Optional.of(OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguages(Optional.of(java.util.List.of("eng", "deu")))
        .build()))
    .withChunking(Optional.of(ChunkingConfig.builder()
        .withMaxChars(Optional.of(512L))
        .withMaxOverlap(Optional.of(50L))
        .build()))
    .withEnableQualityProcessing(true)
    .build();

ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), config);

// Clamp the preview length: substring(0, 100) throws for content shorter than 100 characters.
String content = result.content();
System.out.println("Content: " + content.substring(0, Math.min(100, content.length())) + "...");
if (result.tables() != null) {
    System.out.println("Tables: " + result.tables().size());
}
if (result.qualityScore() != null) {
    System.out.println("Quality: " + result.qualityScore());
}
|
||||
```
|
||||
16
docs/snippets/java/api/error_handling.md
Normal file
16
docs/snippets/java/api/error_handling.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.KreuzbergRsException;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
try {
|
||||
ExtractionConfig config = ExtractionConfig.builder().build();
|
||||
ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("missing.pdf"), config);
|
||||
System.out.println(result.content());
|
||||
} catch (KreuzbergRsException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
System.err.println("Error code: " + e.getCode());
|
||||
}
|
||||
```
|
||||
28
docs/snippets/java/api/error_handling_extract.md
Normal file
28
docs/snippets/java/api/error_handling_extract.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```java title="Java"
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.net.URI;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
HttpClient client = HttpClient.newHttpClient();
|
||||
byte[] fileBytes = Files.readAllBytes(Paths.get("document.pdf"));
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(URI.create("http://localhost:8000/extract"))
|
||||
.header("Content-Type", "application/octet-stream")
|
||||
.POST(HttpRequest.BodyPublishers.ofByteArray(fileBytes))
|
||||
.build();
|
||||
|
||||
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
|
||||
|
||||
if (response.statusCode() != 200) {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
var error = mapper.readTree(response.body());
|
||||
System.err.println("Error: " + error.get("error_type").asText() + " - " + error.get("message").asText());
|
||||
} else {
|
||||
System.out.println("Success: " + response.body());
|
||||
}
|
||||
```
|
||||
14
docs/snippets/java/api/extract_bytes_async.md
Normal file
14
docs/snippets/java/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
byte[] data = Files.readAllBytes(Paths.get("document.pdf"));
|
||||
ExtractionConfig config = ExtractionConfig.builder().build();
|
||||
ExtractionResult result = Kreuzberg.extractBytes(data, "application/pdf", config);
|
||||
|
||||
System.out.println(result.content());
|
||||
System.out.println(result.mimeType());
|
||||
```
|
||||
14
docs/snippets/java/api/extract_bytes_sync.md
Normal file
14
docs/snippets/java/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
byte[] data = Files.readAllBytes(Paths.get("document.pdf"));
|
||||
ExtractionConfig config = ExtractionConfig.builder().build();
|
||||
ExtractionResult result = Kreuzberg.extractBytesSync(data, "application/pdf", config);
|
||||
|
||||
System.out.println(result.content());
|
||||
System.out.println(result.mimeType());
|
||||
```
|
||||
12
docs/snippets/java/api/extract_file_async.md
Normal file
12
docs/snippets/java/api/extract_file_async.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder().build();
|
||||
ExtractionResult result = Kreuzberg.extractFile(Paths.get("document.pdf"), config);
|
||||
|
||||
System.out.println(result.content());
|
||||
System.out.println(result.mimeType());
|
||||
```
|
||||
13
docs/snippets/java/api/extract_file_sync.md
Normal file
13
docs/snippets/java/api/extract_file_sync.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder().build();
|
||||
ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), config);
|
||||
|
||||
System.out.println(result.content());
|
||||
System.out.println("Tables: " + (result.tables() != null ? result.tables().size() : 0));
|
||||
System.out.println("Metadata: " + result.metadata());
|
||||
```
|
||||
62
docs/snippets/java/benchmarking/SimpleBenchmark.java
Normal file
62
docs/snippets/java/benchmarking/SimpleBenchmark.java
Normal file
@@ -0,0 +1,62 @@
|
||||
```java title="SimpleBenchmark.java"
|
||||
import com.kreuzberg.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
|
||||
public final class SimpleBenchmark {
|
||||
private SimpleBenchmark() {}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
ExtractionConfig config = new ExtractionConfig.Builder()
|
||||
.useCache(false)
|
||||
.build();
|
||||
|
||||
Kreuzberg kreuzberg = new Kreuzberg(config);
|
||||
String filePath = "document.pdf";
|
||||
int numRuns = 10;
|
||||
|
||||
System.out.println("Sync extraction (" + numRuns + " runs):");
|
||||
long start = System.nanoTime();
|
||||
for (int i = 0; i < numRuns; i++) {
|
||||
kreuzberg.extractFile(filePath);
|
||||
}
|
||||
double syncDuration = (System.nanoTime() - start) / 1_000_000_000.0;
|
||||
double avgSync = syncDuration / numRuns;
|
||||
System.out.println(" - Total time: " + String.format("%.3f", syncDuration) + "s");
|
||||
System.out.println(" - Average: " + String.format("%.3f", avgSync) + "s per extraction");
|
||||
|
||||
System.out.println("\nAsync extraction (" + numRuns + " parallel runs):");
|
||||
List<Callable<ExtractionResult>> tasks = new ArrayList<>();
|
||||
for (int i = 0; i < numRuns; i++) {
|
||||
tasks.add(() -> kreuzberg.extractFile(filePath));
|
||||
}
|
||||
|
||||
start = System.nanoTime();
|
||||
ForkJoinPool.commonPool().invokeAll(tasks);
|
||||
double asyncDuration = (System.nanoTime() - start) / 1_000_000_000.0;
|
||||
System.out.println(" - Total time: " + String.format("%.3f", asyncDuration) + "s");
|
||||
System.out.println(" - Average: " + String.format("%.3f", asyncDuration / numRuns) + "s per extraction");
|
||||
System.out.println(" - Speedup: " + String.format("%.1f", syncDuration / asyncDuration) + "x");
|
||||
|
||||
ExtractionConfig cacheConfig = new ExtractionConfig.Builder()
|
||||
.useCache(true)
|
||||
.build();
|
||||
Kreuzberg kreuzbergCached = new Kreuzberg(cacheConfig);
|
||||
|
||||
System.out.println("\nFirst extraction (populates cache)...");
|
||||
start = System.nanoTime();
|
||||
kreuzbergCached.extractFile(filePath);
|
||||
double firstDuration = (System.nanoTime() - start) / 1_000_000_000.0;
|
||||
System.out.println(" - Time: " + String.format("%.3f", firstDuration) + "s");
|
||||
|
||||
System.out.println("Second extraction (from cache)...");
|
||||
start = System.nanoTime();
|
||||
kreuzbergCached.extractFile(filePath);
|
||||
double cachedDuration = (System.nanoTime() - start) / 1_000_000_000.0;
|
||||
System.out.println(" - Time: " + String.format("%.3f", cachedDuration) + "s");
|
||||
System.out.println(" - Cache speedup: " + String.format("%.1f", firstDuration / cachedDuration) + "x");
|
||||
}
|
||||
}
|
||||
```
|
||||
46
docs/snippets/java/cache/DiskCache.java
vendored
Normal file
46
docs/snippets/java/cache/DiskCache.java
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
```java title="DiskCache.java"
|
||||
import com.kreuzberg.*;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
public final class DiskCache {
|
||||
private DiskCache() {}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String cacheDir = System.getProperty("user.home") + "/.cache/kreuzberg";
|
||||
Files.createDirectories(Paths.get(cacheDir));
|
||||
|
||||
CacheConfig cacheConfig = new CacheConfig(
|
||||
cacheDir,
|
||||
500L * 1024 * 1024,
|
||||
7L * 86400,
|
||||
true
|
||||
);
|
||||
|
||||
ExtractionConfig config = new ExtractionConfig.Builder()
|
||||
.useCache(true)
|
||||
.cacheConfig(cacheConfig)
|
||||
.build();
|
||||
|
||||
Kreuzberg kreuzberg = new Kreuzberg(config);
|
||||
|
||||
System.out.println("First extraction (will be cached)...");
|
||||
ExtractionResult result1 = kreuzberg.extractFile("document.pdf");
|
||||
System.out.println(" - Content length: " + result1.content().length());
|
||||
System.out.println(" - Cached: " + result1.metadata().wasCached());
|
||||
|
||||
System.out.println("\nSecond extraction (from cache)...");
|
||||
ExtractionResult result2 = kreuzberg.extractFile("document.pdf");
|
||||
System.out.println(" - Content length: " + result2.content().length());
|
||||
System.out.println(" - Cached: " + result2.metadata().wasCached());
|
||||
|
||||
System.out.println("\nResults are identical: " + result1.content().equals(result2.content()));
|
||||
|
||||
CacheStats cacheStats = kreuzberg.getCacheStats();
|
||||
System.out.println("\nCache Statistics:");
|
||||
System.out.println(" - Total entries: " + cacheStats.totalEntries());
|
||||
System.out.println(" - Cache size: " + String.format("%.1f", cacheStats.cacheSizeBytes() / 1024.0 / 1024.0) + " MB");
|
||||
System.out.println(" - Hit rate: " + String.format("%.1f", cacheStats.hitRate() * 100) + "%");
|
||||
}
|
||||
}
|
||||
```
|
||||
41
docs/snippets/java/cli/BasicCli.java
Normal file
41
docs/snippets/java/cli/BasicCli.java
Normal file
@@ -0,0 +1,41 @@
|
||||
```java title="BasicCli.java"
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
/**
 * Runs the {@code kreuzberg} command-line tool as a child process and captures
 * everything it prints.
 */
public final class BasicCli {
    private BasicCli() {}

    /**
     * Runs {@code kreuzberg extract <filePath> --format <outputFormat>} and returns its output.
     *
     * @param filePath     path of the document handed to the CLI
     * @param outputFormat value passed to {@code --format} (this example uses "text" and "json")
     * @return the combined stdout/stderr of the process, trimmed
     * @throws IOException          if the process cannot be started or its output cannot be read
     * @throws InterruptedException if the thread is interrupted while waiting for the process
     * @throws RuntimeException     if the process exits with a non-zero code
     */
    public static String extractWithCli(String filePath, String outputFormat) throws IOException, InterruptedException {
        ProcessBuilder pb = new ProcessBuilder("kreuzberg", "extract", filePath, "--format", outputFormat);
        // Merge stderr into stdout so one reader drains both and diagnostics end up in the error message.
        pb.redirectErrorStream(true);

        Process process = pb.start();

        StringBuilder output = new StringBuilder();
        // Decode explicitly as UTF-8: before Java 18 the charset-less InputStreamReader
        // constructor used the platform default, which garbles non-ASCII text on some systems.
        // NOTE(review): assumes the CLI writes UTF-8 - confirm.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(process.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append("\n");
            }
        }

        int exitCode = process.waitFor();
        if (exitCode != 0) {
            throw new RuntimeException("CLI exited with code " + exitCode + ": " + output);
        }

        return output.toString().trim();
    }

    public static void main(String[] args) throws IOException, InterruptedException {
        String document = "document.pdf";

        String textOutput = extractWithCli(document, "text");
        System.out.println("Extracted: " + textOutput.length() + " characters");

        String jsonOutput = extractWithCli(document, "json");
        System.out.println("JSON output received: " + jsonOutput.length() + " bytes");
    }
}
|
||||
```
|
||||
56
docs/snippets/java/cli/CliWithConfig.java
Normal file
56
docs/snippets/java/cli/CliWithConfig.java
Normal file
@@ -0,0 +1,56 @@
|
||||
```java title="CliWithConfig.java"
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
public final class CliWithConfig {
|
||||
private static final ObjectMapper MAPPER = new ObjectMapper();
|
||||
|
||||
private CliWithConfig() {}
|
||||
|
||||
public static JsonNode extractWithConfig(String filePath, String configPath)
|
||||
throws IOException, InterruptedException {
|
||||
ProcessBuilder pb = new ProcessBuilder(
|
||||
"kreuzberg",
|
||||
"extract",
|
||||
filePath,
|
||||
"--config",
|
||||
configPath,
|
||||
"--format",
|
||||
"json");
|
||||
pb.redirectErrorStream(true);
|
||||
|
||||
Process process = pb.start();
|
||||
|
||||
StringBuilder output = new StringBuilder();
|
||||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
output.append(line);
|
||||
}
|
||||
}
|
||||
|
||||
int exitCode = process.waitFor();
|
||||
if (exitCode != 0) {
|
||||
throw new RuntimeException("CLI exited with code " + exitCode + ": " + output);
|
||||
}
|
||||
|
||||
return MAPPER.readTree(output.toString());
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException, InterruptedException {
|
||||
String configFile = "kreuzberg.toml";
|
||||
String document = "document.pdf";
|
||||
|
||||
System.out.println("Extracting " + document + " with config " + configFile);
|
||||
JsonNode result = extractWithConfig(document, configFile);
|
||||
|
||||
String content = result.get("content").asText();
|
||||
System.out.println("Content length: " + content.length());
|
||||
System.out.println("Format: " + result.get("format").asText());
|
||||
System.out.println("Languages: " + result.get("languages").toString());
|
||||
}
|
||||
}
|
||||
```
|
||||
52
docs/snippets/java/config/ElementBasedOutput.md
Normal file
52
docs/snippets/java/config/ElementBasedOutput.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```java title="Element-Based Output (Java)"
|
||||
import io.kreuzberg.Kreuzberg;
|
||||
import io.kreuzberg.ExtractionConfig;
|
||||
import io.kreuzberg.ExtractionResult;
|
||||
import io.kreuzberg.Element;
|
||||
import io.kreuzberg.OutputFormat;
|
||||
|
||||
public class ElementBasedOutput {
|
||||
public static void main(String[] args) {
|
||||
// Configure element-based output
|
||||
ExtractionConfig config = new ExtractionConfig();
|
||||
config.setOutputFormat(OutputFormat.ELEMENT_BASED);
|
||||
|
||||
// Extract document
|
||||
ExtractionResult result = Kreuzberg.extractFileSync("document.pdf", config);
|
||||
|
||||
// Access elements
|
||||
for (Element element : result.getElements()) {
|
||||
System.out.println("Type: " + element.getElementType());
|
||||
|
||||
String text = element.getText();
|
||||
if (text.length() > 100) {
|
||||
text = text.substring(0, 100);
|
||||
}
|
||||
System.out.println("Text: " + text);
|
||||
|
||||
if (element.getMetadata().getPageNumber() != null) {
|
||||
System.out.println("Page: " + element.getMetadata().getPageNumber());
|
||||
}
|
||||
|
||||
if (element.getMetadata().getCoordinates() != null) {
|
||||
var coords = element.getMetadata().getCoordinates();
|
||||
System.out.printf("Coords: (%f, %f) - (%f, %f)%n",
|
||||
coords.getLeft(), coords.getTop(),
|
||||
coords.getRight(), coords.getBottom());
|
||||
}
|
||||
|
||||
System.out.println("---");
|
||||
}
|
||||
|
||||
// Filter by element type
|
||||
result.getElements().stream()
|
||||
.filter(e -> "title".equals(e.getElementType()))
|
||||
.forEach(title -> {
|
||||
String level = (String) title.getMetadata()
|
||||
.getAdditional()
|
||||
.getOrDefault("level", "unknown");
|
||||
System.out.printf("[%s] %s%n", level, title.getText());
|
||||
});
|
||||
}
|
||||
}
|
||||
```
|
||||
41
docs/snippets/java/config/advanced_config.md
Normal file
41
docs/snippets/java/config/advanced_config.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import dev.kreuzberg.*;
|
||||
import java.io.IOException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.language("eng+deu")
|
||||
.build())
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1000)
|
||||
.maxOverlap(100)
|
||||
.build())
|
||||
.tokenReduction(TokenReductionConfig.builder()
|
||||
.mode("moderate")
|
||||
.preserveImportantWords(true)
|
||||
.build())
|
||||
.languageDetection(LanguageDetectionConfig.builder()
|
||||
.enabled(true)
|
||||
.build())
|
||||
.useCache(true)
|
||||
.enableQualityProcessing(true)
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
|
||||
if (!result.getDetectedLanguages().isEmpty()) {
|
||||
System.out.println("Languages: " + result.getDetectedLanguages());
|
||||
}
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
60
docs/snippets/java/config/chunking_config.md
Normal file
60
docs/snippets/java/config/chunking_config.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1000)
|
||||
.maxOverlap(200)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
|
||||
```java title="Java - Markdown with Heading Context"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.HeadingContext;
|
||||
import dev.kreuzberg.HeadingLevel;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.chunkerType("markdown")
|
||||
.maxChars(500)
|
||||
.maxOverlap(50)
|
||||
.sizingTokenizer("Xenova/gpt-4o")
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = KreuzbergClient.extractFile("document.md", config);
|
||||
|
||||
result.getChunks().forEach(chunk -> {
|
||||
var headingContext = chunk.getMetadata().getHeadingContext();
|
||||
if (headingContext.isPresent()) {
|
||||
System.out.println("Headings:");
|
||||
headingContext.get().getHeadings().forEach(heading ->
|
||||
System.out.println(" Level " + heading.getLevel() + ": " + heading.getText())
|
||||
);
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
```java title="Java - Prepend Heading Context"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.chunkerType("markdown")
|
||||
.maxChars(500)
|
||||
.maxOverlap(50)
|
||||
.prependHeadingContext(true)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = KreuzbergClient.extractFile("document.md", config);
|
||||
|
||||
result.getChunks().forEach(chunk -> {
|
||||
// Each chunk's content is prefixed with its heading breadcrumb
|
||||
System.out.println(chunk.getContent().substring(0, Math.min(100, chunk.getContent().length())));
|
||||
});
|
||||
```
|
||||
11
docs/snippets/java/config/config_basic.md
Normal file
11
docs/snippets/java/config/config_basic.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
// Start from a builder, switch on caching and quality processing, then extract.
var builder = ExtractionConfig.builder();
builder = builder.useCache(true);
builder = builder.enableQualityProcessing(true);
ExtractionConfig config = builder.build();

ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
```
|
||||
8
docs/snippets/java/config/config_discover.md
Normal file
8
docs/snippets/java/config/config_discover.md
Normal file
@@ -0,0 +1,8 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
ExtractionConfig config = Kreuzberg.discoverExtractionConfig();
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
```
|
||||
14
docs/snippets/java/config/config_file.md
Normal file
14
docs/snippets/java/config/config_file.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public final class ConfigFileExample {
|
||||
public static void main(String[] args) throws Exception {
|
||||
ExtractionConfig config = Kreuzberg.loadExtractionConfigFromFile(Path.of("kreuzberg.toml"));
|
||||
ExtractionResult result = Kreuzberg.extractFile(Path.of("document.pdf"), config);
|
||||
System.out.printf("Detected MIME: %s%n", result.getMimeType());
|
||||
}
|
||||
}
|
||||
```
|
||||
15
docs/snippets/java/config/config_ocr.md
Normal file
15
docs/snippets/java/config/config_ocr.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.TesseractConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.language("eng+fra")
|
||||
.tesseractConfig(TesseractConfig.builder()
|
||||
.psm(3)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
31
docs/snippets/java/config/config_programmatic.md
Normal file
31
docs/snippets/java/config/config_programmatic.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.TesseractConfig;
|
||||
|
||||
public final class ProgrammaticConfigExample {
|
||||
public static void main(String[] args) throws Exception {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.language("eng+deu")
|
||||
.tesseractConfig(TesseractConfig.builder()
|
||||
.psm(6)
|
||||
.build())
|
||||
.build())
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1000)
|
||||
.maxOverlap(200)
|
||||
.build())
|
||||
.useCache(true)
|
||||
.enableQualityProcessing(true)
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
System.out.printf("Content length: %d%n", result.getContent().length());
|
||||
}
|
||||
}
|
||||
```
|
||||
18
docs/snippets/java/config/document_structure_config.md
Normal file
18
docs/snippets/java/config/document_structure_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```java title="Document Structure Config (Java)"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
|
||||
// Request the document structure alongside the extracted content.
ExtractionConfig config = ExtractionConfig.builder()
        .includeDocumentStructure(true)
        .build();

ExtractionResult result = Kreuzberg.extractFileSync("document.pdf", config);

// When a structure is present, print the type of each of its nodes.
result.getDocumentStructure().ifPresent(document ->
        document.nodes().forEach(node ->
                System.out.println("[" + node.content().nodeType() + "]")));
|
||||
```
|
||||
53
docs/snippets/java/config/element_based_output.md
Normal file
53
docs/snippets/java/config/element_based_output.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```java title="Element-Based Output (Java)"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.Element;
|
||||
import dev.kreuzberg.ResultFormat;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
public class ElementBasedOutput {
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Configure element-based output
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.withResultFormat(ResultFormat.ElementBased)
|
||||
.build();
|
||||
|
||||
// Extract document
|
||||
ExtractionResult result = Kreuzberg.extractFileSync(Path.of("document.pdf"), config);
|
||||
|
||||
// Access elements
|
||||
List<Element> elements = result.elements();
|
||||
if (elements != null) {
|
||||
for (Element element : elements) {
|
||||
System.out.println("Type: " + element.elementType());
|
||||
|
||||
String text = element.text();
|
||||
if (text.length() > 100) {
|
||||
text = text.substring(0, 100);
|
||||
}
|
||||
System.out.println("Text: " + text);
|
||||
|
||||
if (element.metadata().pageNumber() != null) {
|
||||
System.out.println("Page: " + element.metadata().pageNumber());
|
||||
}
|
||||
|
||||
if (element.metadata().coordinates() != null) {
|
||||
System.out.println("Coords: " + element.metadata().coordinates());
|
||||
}
|
||||
|
||||
System.out.println("---");
|
||||
}
|
||||
|
||||
// Filter by element type
|
||||
elements.stream()
|
||||
.filter(e -> "Title".equalsIgnoreCase(String.valueOf(e.elementType())))
|
||||
.forEach(title -> {
|
||||
String level = title.metadata().additional().getOrDefault("level", "unknown");
|
||||
System.out.printf("[%s] %s%n", level, title.text());
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
87
docs/snippets/java/config/embedding_config.java
Normal file
87
docs/snippets/java/config/embedding_config.java
Normal file
@@ -0,0 +1,87 @@
|
||||
import kreuzberg.config.EmbeddingConfig;
|
||||
import kreuzberg.config.EmbeddingModelType;
|
||||
import kreuzberg.config.ChunkingConfig;
|
||||
import kreuzberg.config.ExtractionConfig;
|
||||
|
||||
public class EmbeddingConfigExample {
|
||||
public static void main(String[] args) {
|
||||
// Example 1: Preset model (recommended)
|
||||
// Fast, balanced, or quality preset configurations optimized for common use cases.
|
||||
EmbeddingConfig embeddingConfig = EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("balanced"))
|
||||
.batchSize(32)
|
||||
.normalize(true)
|
||||
.showDownloadProgress(true)
|
||||
.cacheDir("~/.cache/kreuzberg/embeddings")
|
||||
.build();
|
||||
|
||||
// Available presets:
|
||||
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
// - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
// - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
|
||||
// Example 2: Custom ONNX model (requires embeddings feature)
|
||||
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||||
embeddingConfig = EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.custom("BAAI/bge-small-en-v1.5", 384))
|
||||
.batchSize(32)
|
||||
.normalize(true)
|
||||
.showDownloadProgress(true)
|
||||
.cacheDir(null) // Uses default: .kreuzberg/embeddings/
|
||||
.build();
|
||||
|
||||
// Popular ONNX-compatible models:
|
||||
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
|
||||
// Example 3: Alternative Custom ONNX Model
|
||||
// For advanced users wanting different ONNX embedding models.
|
||||
embeddingConfig = EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.custom("sentence-transformers/all-mpnet-base-v2", 768))
|
||||
.batchSize(16) // Larger model requires smaller batch size
|
||||
.normalize(true)
|
||||
.showDownloadProgress(true)
|
||||
.cacheDir("/var/cache/embeddings")
|
||||
.build();
|
||||
|
||||
|
||||
// Integration with ChunkingConfig
|
||||
// Add embeddings to your chunking configuration:
|
||||
ChunkingConfig chunkingConfig = ChunkingConfig.builder()
|
||||
.maxChars(1024)
|
||||
.maxOverlap(100)
|
||||
.preset("balanced")
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("balanced"))
|
||||
.batchSize(32)
|
||||
.normalize(true)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionConfig extractionConfig = ExtractionConfig.builder()
|
||||
.chunking(chunkingConfig)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
// Key parameter explanations:
|
||||
//
|
||||
// batchSize: Number of texts to embed at once (32-128 typical)
|
||||
// - Larger batches are faster but use more memory
|
||||
// - Smaller batches for resource-constrained environments
|
||||
//
|
||||
// normalize: Whether to normalize vectors (L2 norm)
|
||||
// - true (recommended): Enables cosine similarity in vector DBs
|
||||
// - false: Raw embedding values
|
||||
//
|
||||
// cacheDir: Where to store downloaded models
|
||||
// - null: Uses .kreuzberg/embeddings/ in current directory
|
||||
// - String path: Custom directory for model storage
|
||||
//
|
||||
// showDownloadProgress: Display download progress bar
|
||||
// - Useful for monitoring large model downloads
|
||||
21
docs/snippets/java/config/embedding_config.md
Normal file
21
docs/snippets/java/config/embedding_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1000)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.builder()
|
||||
.type("preset")
|
||||
.name("all-mpnet-base-v2")
|
||||
.build())
|
||||
.batchSize(16)
|
||||
.normalize(true)
|
||||
.showDownloadProgress(true)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
90
docs/snippets/java/config/hierarchy_config.java
Normal file
90
docs/snippets/java/config/hierarchy_config.java
Normal file
@@ -0,0 +1,90 @@
|
||||
import kreuzberg.config.HierarchyConfig;
|
||||
import kreuzberg.config.PdfConfig;
|
||||
import kreuzberg.config.ExtractionConfig;
|
||||
import kreuzberg.Kreuzberg;
|
||||
|
||||
public class HierarchyConfigExample {
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Example 1: Basic hierarchy extraction
|
||||
// Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
|
||||
// Extract bounding box information for spatial layout awareness.
|
||||
HierarchyConfig hierarchyConfigBasic = HierarchyConfig.builder()
|
||||
.enabled(true)
|
||||
.kClusters(6) // Default: creates 6 font size clusters (H1-H6 structure)
|
||||
.includeBbox(true) // Include bounding box coordinates
|
||||
.ocrCoverageThreshold(null) // No OCR coverage threshold
|
||||
.build();
|
||||
|
||||
PdfConfig pdfConfigBasic = PdfConfig.builder()
|
||||
.hierarchy(hierarchyConfigBasic)
|
||||
.build();
|
||||
|
||||
ExtractionConfig extractionConfigBasic = ExtractionConfig.builder()
|
||||
.pdfOptions(pdfConfigBasic)
|
||||
.build();
|
||||
|
||||
Kreuzberg kreuzberg = new Kreuzberg(extractionConfigBasic);
|
||||
// var result = kreuzberg.extractFileSync("document.pdf");
|
||||
|
||||
|
||||
// Example 2: Custom kClusters for minimal structure
|
||||
// Use 3 clusters for simpler hierarchy with minimal structure.
|
||||
// Useful when you only need major section divisions (Main, Subsection, Detail).
|
||||
HierarchyConfig hierarchyConfigMinimal = HierarchyConfig.builder()
|
||||
.enabled(true)
|
||||
.kClusters(3) // Minimal clustering: just 3 levels
|
||||
.includeBbox(true)
|
||||
.ocrCoverageThreshold(null)
|
||||
.build();
|
||||
|
||||
PdfConfig pdfConfigMinimal = PdfConfig.builder()
|
||||
.hierarchy(hierarchyConfigMinimal)
|
||||
.build();
|
||||
|
||||
ExtractionConfig extractionConfigMinimal = ExtractionConfig.builder()
|
||||
.pdfOptions(pdfConfigMinimal)
|
||||
.build();
|
||||
|
||||
|
||||
// Example 3: With OCR coverage threshold
|
||||
// Trigger OCR if less than 50% of text has font data.
|
||||
// Useful for documents with mixed digital and scanned content.
|
||||
HierarchyConfig hierarchyConfigOcr = HierarchyConfig.builder()
|
||||
.enabled(true)
|
||||
.kClusters(6)
|
||||
.includeBbox(true)
|
||||
.ocrCoverageThreshold(0.5f) // Trigger OCR if text coverage < 50%
|
||||
.build();
|
||||
|
||||
PdfConfig pdfConfigOcr = PdfConfig.builder()
|
||||
.hierarchy(hierarchyConfigOcr)
|
||||
.build();
|
||||
|
||||
ExtractionConfig extractionConfigOcr = ExtractionConfig.builder()
|
||||
.pdfOptions(pdfConfigOcr)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// enabled: boolean (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// kClusters: int (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// includeBbox: boolean (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// ocrCoverageThreshold: Float (default: null)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: 0.5f means "run OCR if less than 50% of page has text data"
|
||||
// - null means no OCR coverage-based triggering
|
||||
27
docs/snippets/java/config/html_output.md
Normal file
27
docs/snippets/java/config/html_output.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.HtmlOutputConfig;
|
||||
import dev.kreuzberg.HtmlTheme;
|
||||
import dev.kreuzberg.OutputFormat;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Optional;
|
||||
|
||||
public class HtmlOutput {
|
||||
public static void main(String[] args) throws Exception {
|
||||
HtmlOutputConfig htmlOutput = HtmlOutputConfig.builder()
|
||||
.withTheme(HtmlTheme.GitHub)
|
||||
.withEmbedCss(true)
|
||||
.build();
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.withOutputFormat(OutputFormat.Html)
|
||||
.withHtmlOutput(Optional.of(htmlOutput))
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFileSync(Path.of("document.pdf"), config);
|
||||
System.out.println(result.content()); // HTML with kb-* classes
|
||||
}
|
||||
}
|
||||
```
|
||||
75
docs/snippets/java/config/keyword_config.java
Normal file
75
docs/snippets/java/config/keyword_config.java
Normal file
@@ -0,0 +1,75 @@
|
||||
import com.kreuzberg.Kreuzberg;
|
||||
import com.kreuzberg.config.ExtractionConfig;
|
||||
import com.kreuzberg.config.KeywordConfig;
|
||||
import com.kreuzberg.keywords.YakeParams;
|
||||
import com.kreuzberg.keywords.RakeParams;
|
||||
import com.kreuzberg.result.ExtractionResult;
|
||||
|
||||
// Example 1: Basic YAKE configuration
|
||||
// Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
public class KeywordConfigExample {
|
||||
|
||||
public static void basicYake() throws Exception {
|
||||
ExtractionConfig config = new ExtractionConfig.Builder()
|
||||
.keywords(new KeywordConfig.Builder()
|
||||
.algorithm("yake")
|
||||
.maxKeywords(10)
|
||||
.minScore(0.0f)
|
||||
.ngramRange(1, 3)
|
||||
.language("en")
|
||||
.yakeParams(null)
|
||||
.rakeParams(null)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
System.out.println("Keywords: " + result.getKeywords());
|
||||
}
|
||||
|
||||
// Example 2: Advanced YAKE with custom parameters
|
||||
// Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
public static void advancedYake() throws Exception {
|
||||
ExtractionConfig config = new ExtractionConfig.Builder()
|
||||
.keywords(new KeywordConfig.Builder()
|
||||
.algorithm("yake")
|
||||
.maxKeywords(15)
|
||||
.minScore(0.1f)
|
||||
.ngramRange(1, 2)
|
||||
.language("en")
|
||||
.yakeParams(new YakeParams.Builder()
|
||||
.windowSize(1)
|
||||
.build())
|
||||
.rakeParams(null)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
System.out.println("Keywords: " + result.getKeywords());
|
||||
}
|
||||
|
||||
// Example 3: RAKE configuration
|
||||
// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
public static void rakeConfig() throws Exception {
|
||||
ExtractionConfig config = new ExtractionConfig.Builder()
|
||||
.keywords(new KeywordConfig.Builder()
|
||||
.algorithm("rake")
|
||||
.maxKeywords(10)
|
||||
.minScore(5.0f)
|
||||
.ngramRange(1, 3)
|
||||
.language("en")
|
||||
.yakeParams(null)
|
||||
.rakeParams(new RakeParams.Builder()
|
||||
.minWordLength(1)
|
||||
.maxWordsPerPhrase(3)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
System.out.println("Keywords: " + result.getKeywords());
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
basicYake();
|
||||
}
|
||||
}
|
||||
4
docs/snippets/java/config/keyword_extraction_config.md
Normal file
4
docs/snippets/java/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,4 @@
|
||||
```java title="Java"
|
||||
// Note: Keyword extraction is not yet available in Java bindings
|
||||
// This feature requires the 'keywords' feature flag and is planned for a future release
|
||||
```
|
||||
11
docs/snippets/java/config/language_detection_config.md
Normal file
11
docs/snippets/java/config/language_detection_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.LanguageDetectionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.languageDetection(LanguageDetectionConfig.builder()
|
||||
.enabled(true)
|
||||
.minConfidence(0.8)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
18
docs/snippets/java/config/ocr_dpi_config.md
Normal file
18
docs/snippets/java/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.ImagePreprocessingConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.build())
|
||||
.imagePreprocessing(ImagePreprocessingConfig.builder()
|
||||
.targetDpi(300)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
|
||||
```
|
||||
15
docs/snippets/java/config/pdf_config.md
Normal file
15
docs/snippets/java/config/pdf_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.PdfConfig;
|
||||
import dev.kreuzberg.HierarchyConfig;
|
||||
import java.util.Arrays;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.pdfOptions(PdfConfig.builder()
|
||||
.extractImages(true)
|
||||
.extractMetadata(true)
|
||||
.passwords(Arrays.asList("password1", "password2"))
|
||||
.hierarchyConfig(HierarchyConfig.builder().build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
17
docs/snippets/java/config/pdf_hierarchy_config.md
Normal file
17
docs/snippets/java/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.PdfConfig;
|
||||
import dev.kreuzberg.HierarchyConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.pdfOptions(PdfConfig.builder()
|
||||
.hierarchyConfig(HierarchyConfig.builder()
|
||||
.enabled(true)
|
||||
.detectionThreshold(0.75)
|
||||
.ocrCoverageThreshold(0.8)
|
||||
.minLevel(1)
|
||||
.maxLevel(5)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
13
docs/snippets/java/config/postprocessor_config.md
Normal file
13
docs/snippets/java/config/postprocessor_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.PostProcessorConfig;
|
||||
import java.util.Arrays;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.postprocessor(PostProcessorConfig.builder()
|
||||
.enabled(true)
|
||||
.enabledProcessors(Arrays.asList("deduplication", "whitespace_normalization"))
|
||||
.disabledProcessors(Arrays.asList("mojibake_fix"))
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
7
docs/snippets/java/config/quality_processing_config.md
Normal file
7
docs/snippets/java/config/quality_processing_config.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.enableQualityProcessing(true) // Default
|
||||
.build();
|
||||
```
|
||||
18
docs/snippets/java/config/tesseract_config.md
Normal file
18
docs/snippets/java/config/tesseract_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.TesseractConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.language("eng+fra+deu")
|
||||
.tesseractConfig(TesseractConfig.builder()
|
||||
.psm(6)
|
||||
.oem(1)
|
||||
.minConfidence(0.8)
|
||||
.tesseditCharWhitelist("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?")
|
||||
.enableTableDetection(true)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
11
docs/snippets/java/config/token_reduction_config.md
Normal file
11
docs/snippets/java/config/token_reduction_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.TokenReductionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.tokenReduction(TokenReductionConfig.builder()
|
||||
.mode("moderate")
|
||||
.preserveImportantWords(true)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
86
docs/snippets/java/docker/Usage.java
Normal file
86
docs/snippets/java/docker/Usage.java
Normal file
@@ -0,0 +1,86 @@
|
||||
```java title="Usage.java"
|
||||
import java.io.*;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.UUID;
|
||||
import com.google.gson.JsonParser;
|
||||
|
||||
public final class Usage {
|
||||
private static final String BOUNDARY = "----WebKitFormBoundary" + UUID.randomUUID();
|
||||
private final String containerName;
|
||||
private final int apiPort;
|
||||
|
||||
public Usage(String containerName, int apiPort) {
|
||||
this.containerName = containerName;
|
||||
this.apiPort = apiPort;
|
||||
}
|
||||
|
||||
public void startContainer(String image) throws IOException, InterruptedException {
|
||||
System.out.println("Starting Kreuzberg Docker container...");
|
||||
ProcessBuilder pb = new ProcessBuilder("docker", "run", "-d",
|
||||
"--name", containerName,
|
||||
"-p", apiPort + ":8000",
|
||||
image);
|
||||
Process process = pb.start();
|
||||
if (process.waitFor() != 0) {
|
||||
throw new RuntimeException("Failed to start container");
|
||||
}
|
||||
System.out.println("Container started on http://localhost:" + apiPort);
|
||||
}
|
||||
|
||||
public String extractFile(String filePath) throws IOException {
|
||||
byte[] fileBytes = Files.readAllBytes(Paths.get(filePath));
|
||||
String fileName = Paths.get(filePath).getFileName().toString();
|
||||
|
||||
URL url = new URL("http://localhost:" + apiPort + "/api/extract");
|
||||
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||
conn.setRequestMethod("POST");
|
||||
conn.setRequestProperty("Content-Type", "multipart/form-data; boundary=" + BOUNDARY);
|
||||
conn.setDoOutput(true);
|
||||
|
||||
try (OutputStream os = conn.getOutputStream()) {
|
||||
os.write(("--" + BOUNDARY + "\r\n").getBytes());
|
||||
os.write(("Content-Disposition: form-data; name=\"file\"; filename=\"" + fileName + "\"\r\n").getBytes());
|
||||
os.write("Content-Type: application/octet-stream\r\n\r\n".getBytes());
|
||||
os.write(fileBytes);
|
||||
os.write(("\r\n--" + BOUNDARY + "--\r\n").getBytes());
|
||||
}
|
||||
|
||||
StringBuilder response = new StringBuilder();
|
||||
try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
response.append(line);
|
||||
}
|
||||
}
|
||||
|
||||
return JsonParser.parseString(response.toString())
|
||||
.getAsJsonObject()
|
||||
.get("content")
|
||||
.getAsString();
|
||||
}
|
||||
|
||||
public void stopContainer() throws IOException, InterruptedException {
|
||||
System.out.println("Stopping Kreuzberg Docker container...");
|
||||
new ProcessBuilder("docker", "stop", containerName).start().waitFor();
|
||||
new ProcessBuilder("docker", "rm", containerName).start().waitFor();
|
||||
System.out.println("Container stopped and removed");
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
Usage docker = new Usage("kreuzberg-api", 8000);
|
||||
|
||||
try {
|
||||
docker.startContainer("kreuzberg:latest");
|
||||
Thread.sleep(2000);
|
||||
|
||||
String content = docker.extractFile("document.pdf");
|
||||
System.out.println("Extracted content:\n" + content);
|
||||
} finally {
|
||||
docker.stopContainer();
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
25
docs/snippets/java/getting-started/basic_usage.md
Normal file
25
docs/snippets/java/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
public class BasicUsage {
|
||||
public static void main(String[] args) throws IOException {
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
|
||||
|
||||
System.out.println("Content:");
|
||||
System.out.println(result.getContent());
|
||||
|
||||
System.out.println("\nMetadata:");
|
||||
Map<String, Object> metadata = result.getMetadata();
|
||||
if (metadata != null) {
|
||||
System.out.println("Title: " + metadata.get("title"));
|
||||
System.out.println("Author: " + metadata.get("author"));
|
||||
}
|
||||
|
||||
System.out.println("\nTables found: " + result.getTables().size());
|
||||
System.out.println("Images found: " + result.getImages().size());
|
||||
}
|
||||
}
|
||||
```
|
||||
21
docs/snippets/java/getting-started/extract_file.md
Normal file
21
docs/snippets/java/getting-started/extract_file.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.io.IOException;
|
||||
|
||||
public class ExtractFile {
|
||||
public static void main(String[] args) throws IOException {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.useCache(true)
|
||||
.enableQualityProcessing(true)
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("contract.pdf", config);
|
||||
|
||||
System.out.println("Extracted " + result.getContent().length() + " characters");
|
||||
System.out.println("Quality score: " + result.getQualityScore());
|
||||
System.out.println("Processing time: " + result.getMetadata().get("processing_time") + "ms");
|
||||
}
|
||||
}
|
||||
```
|
||||
26
docs/snippets/java/getting-started/extract_with_ocr.md
Normal file
26
docs/snippets/java/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import java.io.IOException;
|
||||
|
||||
public class ExtractWithOCR {
|
||||
public static void main(String[] args) throws IOException {
|
||||
OcrConfig ocrConfig = OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.language("eng")
|
||||
.build();
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(ocrConfig)
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
|
||||
|
||||
System.out.println("Extracted text from scanned document:");
|
||||
System.out.println(result.getContent());
|
||||
System.out.println("Used OCR backend: tesseract");
|
||||
}
|
||||
}
|
||||
```
|
||||
13
docs/snippets/java/getting-started/hello_world.md
Normal file
13
docs/snippets/java/getting-started/hello_world.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import java.io.IOException;
|
||||
|
||||
public class HelloWorld {
|
||||
public static void main(String[] args) throws IOException {
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
|
||||
System.out.println("Extracted content:");
|
||||
System.out.println(result.getContent().substring(0, Math.min(200, result.getContent().length())));
|
||||
}
|
||||
}
|
||||
```
|
||||
15
docs/snippets/java/getting-started/install_verify.md
Normal file
15
docs/snippets/java/getting-started/install_verify.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import java.io.IOException;
|
||||
|
||||
public class InstallVerify {
|
||||
public static void main(String[] args) throws IOException {
|
||||
System.out.println("Kreuzberg FFI bindings loaded successfully");
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("sample.pdf");
|
||||
System.out.println("Installation verified!");
|
||||
System.out.println("Extracted " + result.getContent().length() + " characters");
|
||||
}
|
||||
}
|
||||
```
|
||||
24
docs/snippets/java/getting-started/read_content.md
Normal file
24
docs/snippets/java/getting-started/read_content.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
public class ReadContent {
|
||||
public static void main(String[] args) throws IOException {
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
|
||||
|
||||
String content = result.getContent();
|
||||
var tables = result.getTables();
|
||||
var images = result.getImages();
|
||||
Map<String, Object> metadata = result.getMetadata();
|
||||
|
||||
System.out.println("Content: " + content.length() + " characters");
|
||||
System.out.println("Tables: " + tables.size());
|
||||
System.out.println("Images: " + images.size());
|
||||
if (metadata != null) {
|
||||
System.out.println("Metadata keys: " + metadata.keySet());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
46
docs/snippets/java/llm/structured_extraction.md
Normal file
46
docs/snippets/java/llm/structured_extraction.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.LlmConfig;
|
||||
import dev.kreuzberg.StructuredExtractionConfig;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class StructuredExtractionExample {
|
||||
public static void main(String[] args) throws Exception {
|
||||
Map<String, Object> schema = Map.of(
|
||||
"type", "object",
|
||||
"properties", Map.of(
|
||||
"title", Map.of("type", "string"),
|
||||
"authors", Map.of("type", "array", "items", Map.of("type", "string")),
|
||||
"date", Map.of("type", "string")
|
||||
),
|
||||
"required", List.of("title", "authors", "date"),
|
||||
"additionalProperties", false
|
||||
);
|
||||
|
||||
LlmConfig llm = LlmConfig.builder()
|
||||
.withModel("openai/gpt-4o-mini")
|
||||
.build();
|
||||
|
||||
StructuredExtractionConfig structured = new StructuredExtractionConfig(
|
||||
schema,
|
||||
"PaperMetadata",
|
||||
null,
|
||||
true,
|
||||
null,
|
||||
llm
|
||||
);
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.withStructuredExtraction(java.util.Optional.of(structured))
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile(Path.of("paper.pdf"), config);
|
||||
System.out.println(result.structuredOutput());
|
||||
}
|
||||
}
|
||||
```
|
||||
59
docs/snippets/java/mcp/mcp_client.md
Normal file
59
docs/snippets/java/mcp/mcp_client.md
Normal file
@@ -0,0 +1,59 @@
|
||||
```java title="Java"
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.util.Map;
|
||||
|
||||
public class McpClient {
|
||||
private final Process mcpProcess;
|
||||
private final BufferedWriter stdin;
|
||||
private final BufferedReader stdout;
|
||||
private final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
public McpClient() throws IOException {
|
||||
ProcessBuilder pb = new ProcessBuilder("kreuzberg", "mcp");
|
||||
mcpProcess = pb.start();
|
||||
stdin = new BufferedWriter(new OutputStreamWriter(mcpProcess.getOutputStream()));
|
||||
stdout = new BufferedReader(new InputStreamReader(mcpProcess.getInputStream()));
|
||||
}
|
||||
|
||||
public String extractFile(String path) throws IOException {
|
||||
Map<String, Object> request = Map.of(
|
||||
"method", "tools/call",
|
||||
"params", Map.of(
|
||||
"name", "extract_file",
|
||||
"arguments", Map.of("path", path, "async", true)
|
||||
)
|
||||
);
|
||||
|
||||
stdin.write(mapper.writeValueAsString(request));
|
||||
stdin.newLine();
|
||||
stdin.flush();
|
||||
|
||||
String response = stdout.readLine();
|
||||
@SuppressWarnings("unchecked")
|
||||
Map<String, Object> result = mapper.readValue(response, Map.class);
|
||||
@SuppressWarnings("unchecked")
|
||||
Map<String, Object> resultData = (Map<String, Object>) result.get("result");
|
||||
return (String) resultData.get("content");
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
stdin.close();
|
||||
stdout.close();
|
||||
mcpProcess.destroy();
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
try (McpClient client = new McpClient()) {
|
||||
String content = client.extractFile("contract.pdf");
|
||||
System.out.println("Extracted content: " + content);
|
||||
} catch (IOException e) {
|
||||
System.err.println("Error: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
40
docs/snippets/java/mcp/mcp_custom_client.md
Normal file
40
docs/snippets/java/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```java title="Java"
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.util.Map;
|
||||
|
||||
public class McpCustomClient {
|
||||
public static void main(String[] args) throws IOException, InterruptedException {
|
||||
ProcessBuilder pb = new ProcessBuilder("kreuzberg", "mcp");
|
||||
Process mcp = pb.start();
|
||||
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
try (BufferedWriter stdin = new BufferedWriter(new OutputStreamWriter(mcp.getOutputStream()));
|
||||
BufferedReader stdout = new BufferedReader(new InputStreamReader(mcp.getInputStream()))) {
|
||||
|
||||
Map<String, Object> request = Map.of(
|
||||
"method", "tools/call",
|
||||
"params", Map.of(
|
||||
"name", "extract_file",
|
||||
"arguments", Map.of("path", "document.pdf", "async", true)
|
||||
)
|
||||
);
|
||||
|
||||
stdin.write(mapper.writeValueAsString(request));
|
||||
stdin.newLine();
|
||||
stdin.flush();
|
||||
|
||||
String line = stdout.readLine();
|
||||
if (line != null) {
|
||||
System.out.println(line);
|
||||
}
|
||||
}
|
||||
|
||||
mcp.waitFor();
|
||||
}
|
||||
}
|
||||
```
|
||||
17
docs/snippets/java/mcp/mcp_server_start.md
Normal file
17
docs/snippets/java/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```java title="Java"
|
||||
import java.io.IOException;
|
||||
|
||||
/** Launches the MCP server through the {@code kreuzberg mcp} CLI command and waits for it to exit. */
public class McpServer {
    public static void main(String[] args) {
        try {
            // Start MCP server using CLI; inheritIO connects the child's stdio to this process.
            ProcessBuilder pb = new ProcessBuilder("kreuzberg", "mcp");
            pb.inheritIO();
            Process process = pb.start();
            process.waitFor();
        } catch (IOException e) {
            System.err.println("Failed to start MCP server: " + e.getMessage());
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can still observe the interruption.
            Thread.currentThread().interrupt();
            System.err.println("Interrupted while waiting for MCP server: " + e.getMessage());
        }
    }
}
|
||||
```
|
||||
25
docs/snippets/java/metadata/PageBoundaries.md
Normal file
25
docs/snippets/java/metadata/PageBoundaries.md
Normal file
@@ -0,0 +1,25 @@
|
||||
Import dev.kreuzberg.\*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
var result = Kreuzberg.extractFileSync("document.pdf");
|
||||
|
||||
If (result.metadata().pages() != null &&
|
||||
result.metadata().pages().boundaries() != null) {
|
||||
|
||||
var contentBytes = result.content().getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
for (var boundary : result.metadata().pages().boundaries().subList(0, 3)) {
|
||||
var pageBytes = Arrays.copyOfRange(
|
||||
contentBytes,
|
||||
boundary.byteStart(),
|
||||
boundary.byteEnd()
|
||||
);
|
||||
var pageText = new String(pageBytes, StandardCharsets.UTF_8);
|
||||
|
||||
System.out.println("Page " + boundary.pageNumber() + ":");
|
||||
System.out.println(" Byte range: " + boundary.byteStart() +
|
||||
"-" + boundary.byteEnd());
|
||||
System.out.println(" Preview: " + pageText.substring(0, 100) + "...");
|
||||
}
|
||||
|
||||
}
|
||||
18
docs/snippets/java/metadata/PageTrackingBasic.md
Normal file
18
docs/snippets/java/metadata/PageTrackingBasic.md
Normal file
@@ -0,0 +1,18 @@
|
||||
Import dev.kreuzberg.\*;
|
||||
|
||||
var config = ExtractionConfig.builder()
|
||||
.pages(PageConfig.builder()
|
||||
.extractPages(true)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
var result = Kreuzberg.extractFileSync("document.pdf", config);
|
||||
|
||||
If (result.pages() != null) {
|
||||
for (var page : result.pages()) {
|
||||
System.out.println("Page " + page.pageNumber() + ":");
|
||||
System.out.println(" Content: " + page.content().length() + " chars");
|
||||
System.out.println(" Tables: " + page.tables().size());
|
||||
System.out.println(" Images: " + page.images().size());
|
||||
}
|
||||
}
|
||||
12
docs/snippets/java/metadata/language_detection.md
Normal file
12
docs/snippets/java/metadata/language_detection.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.LanguageDetectionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.languageDetection(LanguageDetectionConfig.builder()
|
||||
.enabled(true)
|
||||
.minConfidence(0.9)
|
||||
.detectMultiple(true)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
@@ -0,0 +1,17 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.LanguageDetectionConfig;
|
||||
|
||||
// Enable language detection with a 0.8 minimum confidence.
var detection = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(0.8)
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(detection)
    .build();

ExtractionResult result = Kreuzberg.extractFile("multilingual_document.pdf", config);

var detectedLanguages = result.getDetectedLanguages();
System.out.println("Detected languages: " + detectedLanguages);
|
||||
```
|
||||
111
docs/snippets/java/metadata/metadata.md
Normal file
111
docs/snippets/java/metadata/metadata.md
Normal file
@@ -0,0 +1,111 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.Metadata;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionResult result = Kreuzberg.extractFileSync("document.pdf");
|
||||
|
||||
// Metadata is flat — format-specific fields are at the top level
|
||||
Metadata metadata = result.getMetadata();
|
||||
metadata.getTitle().ifPresent(t -> System.out.println("Title: " + t));
|
||||
metadata.getAuthors().ifPresent(a -> System.out.println("Authors: " + String.join(", ", a)));
|
||||
|
||||
// Format-specific fields are in the additional map
|
||||
Map<String, Object> extra = metadata.getAdditional();
|
||||
if (extra.get("page_count") != null) {
|
||||
System.out.println("Pages: " + extra.get("page_count"));
|
||||
}
|
||||
|
||||
// Access HTML metadata
|
||||
ExtractionResult htmlResult = Kreuzberg.extractFileSync("page.html");
|
||||
Metadata htmlMeta = htmlResult.getMetadata();
|
||||
htmlMeta.getTitle().ifPresent(t -> System.out.println("Title: " + t));
|
||||
|
||||
Map<String, Object> htmlExtra = htmlMeta.getAdditional();
|
||||
String description = (String) htmlExtra.get("description");
|
||||
if (description != null) {
|
||||
System.out.println("Description: " + description);
|
||||
}
|
||||
|
||||
// Access keywords as array
|
||||
htmlMeta.getKeywords().ifPresent(keywords ->
|
||||
System.out.println("Keywords: " + keywords));
|
||||
|
||||
// Access canonical URL (renamed from canonical)
|
||||
String canonicalUrl = (String) htmlExtra.get("canonical_url");
|
||||
if (canonicalUrl != null) {
|
||||
System.out.println("Canonical URL: " + canonicalUrl);
|
||||
}
|
||||
|
||||
// Access Open Graph fields from map
|
||||
@SuppressWarnings("unchecked")
|
||||
Map<String, String> openGraph = (Map<String, String>) htmlExtra.get("open_graph");
|
||||
if (openGraph != null) {
|
||||
System.out.println("Open Graph Image: " + openGraph.get("image"));
|
||||
System.out.println("Open Graph Title: " + openGraph.get("title"));
|
||||
System.out.println("Open Graph Type: " + openGraph.get("type"));
|
||||
}
|
||||
|
||||
// Access Twitter Card fields from map
|
||||
@SuppressWarnings("unchecked")
|
||||
Map<String, String> twitterCard = (Map<String, String>) htmlExtra.get("twitter_card");
|
||||
if (twitterCard != null) {
|
||||
System.out.println("Twitter Card Type: " + twitterCard.get("card"));
|
||||
System.out.println("Twitter Creator: " + twitterCard.get("creator"));
|
||||
}
|
||||
|
||||
// Access new fields
|
||||
htmlMeta.getLanguage().ifPresent(l -> System.out.println("Language: " + l));
|
||||
|
||||
String textDirection = (String) htmlExtra.get("text_direction");
|
||||
if (textDirection != null) {
|
||||
System.out.println("Text Direction: " + textDirection);
|
||||
}
|
||||
|
||||
// Access headers
|
||||
@SuppressWarnings("unchecked")
|
||||
List<Map<String, Object>> headers = (List<Map<String, Object>>) htmlExtra.get("headers");
|
||||
if (headers != null) {
|
||||
headers.stream()
|
||||
.map(h -> h.get("text"))
|
||||
.forEach(text -> System.out.print(text + ", "));
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
// Access links
|
||||
@SuppressWarnings("unchecked")
|
||||
List<Map<String, Object>> links = (List<Map<String, Object>>) htmlExtra.get("links");
|
||||
if (links != null) {
|
||||
for (Map<String, Object> link : links) {
|
||||
System.out.println("Link: " + link.get("href") + " (" + link.get("text") + ")");
|
||||
}
|
||||
}
|
||||
|
||||
// Access images
|
||||
@SuppressWarnings("unchecked")
|
||||
List<Map<String, Object>> images = (List<Map<String, Object>>) htmlExtra.get("images");
|
||||
if (images != null) {
|
||||
for (Map<String, Object> image : images) {
|
||||
System.out.println("Image: " + image.get("src"));
|
||||
}
|
||||
}
|
||||
|
||||
// Access structured data
|
||||
@SuppressWarnings("unchecked")
|
||||
List<Map<String, Object>> structuredData = (List<Map<String, Object>>) htmlExtra.get("structured_data");
|
||||
if (structuredData != null) {
|
||||
System.out.println("Structured data items: " + structuredData.size());
|
||||
}
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
27
docs/snippets/java/metadata/tables.md
Normal file
27
docs/snippets/java/metadata/tables.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import dev.kreuzberg.Table;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
|
||||
|
||||
for (Table table : result.getTables()) {
|
||||
System.out.println("Table with " + table.cells().size() + " rows");
|
||||
System.out.println(table.markdown());
|
||||
|
||||
for (List<String> row : table.cells()) {
|
||||
System.out.println(row);
|
||||
}
|
||||
}
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
18
docs/snippets/java/metadata/vector_database_integration.md
Normal file
18
docs/snippets/java/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
|
||||
// Chunking settings: size, overlap, and the "balanced" embedding option.
var chunkingSettings = ChunkingConfig.builder()
    .maxChars(512)
    .maxOverlap(50)
    .embedding("balanced")
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunkingSettings)
    .build();

ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);

int characterCount = result.getContent().length();
System.out.println("Extracted content: " + characterCount + " characters");
|
||||
```
|
||||
60
docs/snippets/java/ocr/cloud_ocr_backend.md
Normal file
60
docs/snippets/java/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.*;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.lang.foreign.ValueLayout;
|
||||
import java.net.http.*;
|
||||
import java.net.URI;
|
||||
|
||||
public class CloudOcrExample {
|
||||
public static void main(String[] args) {
|
||||
Arena callbackArena = Arena.ofAuto();
|
||||
String apiKey = "your-api-key";
|
||||
|
||||
OcrBackend cloudOcr = (imageBytes, imageLength, configJson) -> {
|
||||
try {
|
||||
// Read image bytes from native memory
|
||||
byte[] image = imageBytes.reinterpret(imageLength)
|
||||
.toArray(ValueLayout.JAVA_BYTE);
|
||||
|
||||
// Read config JSON
|
||||
String config = configJson.reinterpret(Long.MAX_VALUE)
|
||||
.getString(0);
|
||||
|
||||
// Call cloud OCR API
|
||||
HttpClient client = HttpClient.newHttpClient();
|
||||
HttpRequest request = HttpRequest.newBuilder()
|
||||
.uri(URI.create("https://api.example.com/ocr"))
|
||||
.header("Authorization", "Bearer " + apiKey)
|
||||
.POST(HttpRequest.BodyPublishers.ofByteArray(image))
|
||||
.build();
|
||||
|
||||
HttpResponse<String> response = client.send(request,
|
||||
HttpResponse.BodyHandlers.ofString());
|
||||
|
||||
String text = parseTextFromResponse(response.body());
|
||||
|
||||
// Return result as C string
|
||||
return callbackArena.allocateFrom(text);
|
||||
} catch (Exception e) {
|
||||
return MemorySegment.NULL;
|
||||
}
|
||||
};
|
||||
|
||||
try (Arena arena = Arena.ofConfined()) {
|
||||
Kreuzberg.registerOcrBackend("cloud-ocr", cloudOcr, arena);
|
||||
|
||||
// Use custom OCR backend in extraction
|
||||
// Note: Requires ExtractionConfig with OCR enabled
|
||||
ExtractionResult result = Kreuzberg.extractFileSync("scanned.pdf");
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private static String parseTextFromResponse(String json) {
|
||||
// Parse JSON response and extract text field
|
||||
return json; // Simplified
|
||||
}
|
||||
}
|
||||
```
|
||||
14
docs/snippets/java/ocr/image_extraction.md
Normal file
14
docs/snippets/java/ocr/image_extraction.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ImageExtractionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.imageExtraction(ImageExtractionConfig.builder()
|
||||
.extractImages(true)
|
||||
.targetDpi(200)
|
||||
.maxImageDimension(2048)
|
||||
.injectPlaceholders(true) // set to false to extract images without markdown references
|
||||
.autoAdjustDpi(true)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
20
docs/snippets/java/ocr/image_preprocessing.md
Normal file
20
docs/snippets/java/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ImagePreprocessingConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.TesseractConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.tesseractConfig(TesseractConfig.builder()
|
||||
.preprocessing(ImagePreprocessingConfig.builder()
|
||||
.targetDpi(300)
|
||||
.denoise(true)
|
||||
.deskew(true)
|
||||
.contrastEnhance(true)
|
||||
.binarizationMethod("otsu")
|
||||
.build())
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
1
docs/snippets/java/ocr/ocr_easyocr.md
Normal file
1
docs/snippets/java/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1 @@
|
||||
EasyOCR is only available in Python.
|
||||
38
docs/snippets/java/ocr/ocr_elements.md
Normal file
38
docs/snippets/java/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.types.OcrElement;
|
||||
import java.io.IOException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("paddle-ocr")
|
||||
.language("en")
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
|
||||
|
||||
if (result.getOcrElements() != null) {
|
||||
for (OcrElement element : result.getOcrElements()) {
|
||||
System.out.printf("Text: %s%n", element.getText());
|
||||
System.out.printf("Confidence: %.2f%n", element.getConfidence().getRecognition());
|
||||
System.out.printf("Geometry: %s%n", element.getGeometry());
|
||||
if (element.getRotation() != null) {
|
||||
System.out.printf("Rotation: %.1f°%n", element.getRotation().getAngle());
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
26
docs/snippets/java/ocr/ocr_extraction.md
Normal file
26
docs/snippets/java/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import java.io.IOException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.language("eng")
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
|
||||
System.out.println(result.getContent());
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
16
docs/snippets/java/ocr/ocr_force_all_pages.md
Normal file
16
docs/snippets/java/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
|
||||
// Tesseract backend with forceOcr enabled, then extract and print the content.
// NOTE(review): presumably forceOcr(true) runs OCR on every page even when a text
// layer exists (per the snippet name) — confirm against the API reference.
var ocr = OcrConfig.builder().backend("tesseract").build();
ExtractionConfig config = ExtractionConfig.builder()
        .ocr(ocr)
        .forceOcr(true)
        .build();

ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
System.out.println(result.getContent());
|
||||
```
|
||||
16
docs/snippets/java/ocr/ocr_multi_language.md
Normal file
16
docs/snippets/java/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
|
||||
// Several OCR languages are passed as one '+'-joined string of language codes.
var ocr = OcrConfig.builder()
        .backend("tesseract")
        .language("eng+deu+fra")
        .build();
ExtractionConfig config = ExtractionConfig.builder().ocr(ocr).build();

ExtractionResult result = Kreuzberg.extractFile("multilingual.pdf", config);
System.out.println(result.getContent());
|
||||
```
|
||||
27
docs/snippets/java/ocr/ocr_paddleocr.md
Normal file
27
docs/snippets/java/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import java.io.IOException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("paddle-ocr")
|
||||
.language("en")
|
||||
// .paddleOcrConfig(PaddleOcrConfig.builder().modelTier("server").build()) // for max accuracy
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
|
||||
System.out.println(result.getContent());
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
4
docs/snippets/java/plugins/clear_plugins.md
Normal file
4
docs/snippets/java/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,4 @@
|
||||
```java title="Java"
|
||||
// Java does not provide bulk clearing functionality in v4.0.0
|
||||
// Unregister plugins individually using unregisterPostProcessor() and unregisterValidator()
|
||||
```
|
||||
79
docs/snippets/java/plugins/embedding_backend.md
Normal file
79
docs/snippets/java/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,79 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.EmbeddingBackendBridge;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
import dev.kreuzberg.IEmbeddingBackend;
|
||||
import dev.kreuzberg.KreuzbergRsException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class EmbeddingBackendExample {
|
||||
|
||||
/**
|
||||
* Wrap an already-loaded embedder so kreuzberg can call back into it during
|
||||
* chunking and standalone embed requests.
|
||||
*/
|
||||
static final class MyEmbedder implements IEmbeddingBackend {
|
||||
@Override
|
||||
public String name() {
|
||||
return "my-embedder";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String version() {
|
||||
return "1.0.0";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
// Optional warm-up; runs once at registration before dimensions() is cached.
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutdown() {
|
||||
// Optional cleanup.
|
||||
}
|
||||
|
||||
@Override
|
||||
public long dimensions() {
|
||||
// Captured once at registration; the dispatcher uses this for shape validation.
|
||||
return 768L;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<List<Float>> embed(List<String> texts) {
|
||||
// Delegate to the already-loaded host model.
|
||||
List<List<Float>> out = new ArrayList<>(texts.size());
|
||||
for (int i = 0; i < texts.size(); i++) {
|
||||
List<Float> row = new ArrayList<>(768);
|
||||
for (int j = 0; j < 768; j++) {
|
||||
row.add(0.0f);
|
||||
}
|
||||
out.add(row);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Register once at startup.
|
||||
EmbeddingBackendBridge.registerEmbeddingBackend(new MyEmbedder());
|
||||
try {
|
||||
EmbeddingConfig config = EmbeddingConfig.builder()
|
||||
.model(new EmbeddingModelType.Plugin("my-embedder"))
|
||||
// Optional: bound the wait on a hung backend (default 60s; null disables).
|
||||
.maxEmbedDurationSecs(30L)
|
||||
.build();
|
||||
|
||||
List<String> texts = List.of("Hello, world!", "Second text");
|
||||
List<List<Float>> vectors = Kreuzberg.embedTexts(texts, config);
|
||||
System.out.println("Generated " + vectors.size() + " vectors");
|
||||
} catch (KreuzbergRsException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
EmbeddingBackendBridge.unregisterEmbeddingBackend("my-embedder");
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
17
docs/snippets/java/plugins/extractor_registration.md
Normal file
17
docs/snippets/java/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import java.io.IOException;
|
||||
|
||||
public class CustomExtractorExample {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.json");
|
||||
System.out.println("Extracted content length: " + result.getContent().length());
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
4
docs/snippets/java/plugins/list_plugins.md
Normal file
4
docs/snippets/java/plugins/list_plugins.md
Normal file
@@ -0,0 +1,4 @@
|
||||
```java title="Java"
|
||||
// Java does not provide plugin listing functionality in v4.0.0
|
||||
// Plugins are registered and managed through the FFI layer
|
||||
```
|
||||
34
docs/snippets/java/plugins/min_length_validator.md
Normal file
34
docs/snippets/java/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.Validator;
|
||||
import dev.kreuzberg.ValidationException;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import java.io.IOException;
|
||||
|
||||
public class MinLengthValidatorExample {
|
||||
public static void main(String[] args) {
|
||||
int minLength = 100;
|
||||
|
||||
Validator minLengthValidator = result -> {
|
||||
if (result.getContent().length() < minLength) {
|
||||
throw new ValidationException(
|
||||
"Content too short: " + result.getContent().length() +
|
||||
" < " + minLength
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
Kreuzberg.registerValidator("min-length", minLengthValidator, 100);
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
|
||||
System.out.println("Validation passed!");
|
||||
} catch (ValidationException e) {
|
||||
System.err.println("Validation failed: " + e.getMessage());
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
50
docs/snippets/java/plugins/pdf_metadata_extractor.md
Normal file
50
docs/snippets/java/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,50 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.PostProcessor;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
public class PdfMetadataExtractorExample {
|
||||
private static final Logger logger = Logger.getLogger(
|
||||
PdfMetadataExtractorExample.class.getName()
|
||||
);
|
||||
|
||||
public static void main(String[] args) {
|
||||
AtomicInteger processedCount = new AtomicInteger(0);
|
||||
|
||||
PostProcessor pdfMetadata = result -> {
|
||||
if (!result.getMimeType().equals("application/pdf")) {
|
||||
return result;
|
||||
}
|
||||
|
||||
processedCount.incrementAndGet();
|
||||
|
||||
Map<String, Object> metadata = new HashMap<>(result.getMetadata());
|
||||
metadata.put("pdf_processed", true);
|
||||
metadata.put("processing_timestamp", System.currentTimeMillis());
|
||||
|
||||
logger.info("Processed PDF: " + processedCount.get());
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
try {
|
||||
Kreuzberg.registerPostProcessor("pdf-metadata-extractor", pdfMetadata, 50);
|
||||
|
||||
logger.info("PDF metadata extractor initialized");
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
|
||||
System.out.println("PDF processed: " + result.getMetadata().get("pdf_processed"));
|
||||
|
||||
logger.info("Processed " + processedCount.get() + " PDFs");
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
16
docs/snippets/java/plugins/pdf_only_processor.md
Normal file
16
docs/snippets/java/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.PostProcessor;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
PostProcessor pdfOnly = result -> {
|
||||
if (!result.getMimeType().equals("application/pdf")) {
|
||||
return result;
|
||||
}
|
||||
|
||||
Map<String, Object> metadata = new HashMap<>(result.getMetadata());
|
||||
metadata.put("pdf_processed", true);
|
||||
|
||||
return result;
|
||||
};
|
||||
```
|
||||
17
docs/snippets/java/plugins/plugin_extractor.md
Normal file
17
docs/snippets/java/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,17 @@
|
||||
<!-- snippet:skip reason="The Java binding generates IDocumentExtractor + DocumentExtractorBridge but the InternalDocument Java class referenced by the interface is not generated by the alef Java backend. Custom DocumentExtractor implementations cannot construct return values until the alef-generated Panama type for InternalDocument lands." -->
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.IDocumentExtractor;
|
||||
import dev.kreuzberg.DocumentExtractorBridge;
|
||||
|
||||
// Java's Panama FFM binding exposes the IDocumentExtractor interface and the
|
||||
// DocumentExtractorBridge.registerDocumentExtractor / unregisterDocumentExtractor
|
||||
// helpers, but the InternalDocument return type is referenced from the
|
||||
// interface signature without a corresponding generated Java class. Until the
|
||||
// alef Java backend emits dev.kreuzberg.InternalDocument, custom Java
|
||||
// DocumentExtractor implementations cannot return a value from extract_bytes /
|
||||
// extract_file.
|
||||
//
|
||||
// Implement the extractor in Rust as `Plugin + DocumentExtractor` and register
|
||||
// it via `register_document_extractor` in a Rust shim crate that links
|
||||
// kreuzberg before the JVM loads the native library.
|
||||
```
|
||||
22
docs/snippets/java/plugins/plugin_logging.md
Normal file
22
docs/snippets/java/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```java title="Java"
|
||||
import java.util.logging.Logger;
|
||||
import java.util.logging.Level;
|
||||
|
||||
class MyPlugin implements PostProcessor {
|
||||
private static final Logger logger = Logger.getLogger(MyPlugin.class.getName());
|
||||
|
||||
@Override
|
||||
public ExtractionResult process(ExtractionResult result) {
|
||||
logger.info("Processing " + result.mimeType() +
|
||||
" (" + result.content().length() + " bytes)");
|
||||
|
||||
// Processing...
|
||||
|
||||
if (result.content().isEmpty()) {
|
||||
logger.warning("Processing resulted in empty content");
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
```
|
||||
37
docs/snippets/java/plugins/plugin_testing.md
Normal file
37
docs/snippets/java/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.PostProcessor;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class PostProcessorTest {
|
||||
@Test
|
||||
void testWordCountProcessor() {
|
||||
PostProcessor processor = result -> {
|
||||
long count = result.getContent().split("\\s+").length;
|
||||
|
||||
Map<String, Object> metadata = new HashMap<>(result.getMetadata());
|
||||
metadata.put("word_count", count);
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
ExtractionResult input = new ExtractionResult(
|
||||
"Hello world test",
|
||||
"text/plain",
|
||||
new HashMap<>(),
|
||||
java.util.List.of(),
|
||||
java.util.List.of(),
|
||||
java.util.List.of(),
|
||||
java.util.List.of(),
|
||||
true
|
||||
);
|
||||
|
||||
ExtractionResult output = processor.process(input);
|
||||
|
||||
assertEquals(3, output.getMetadata().get("word_count"));
|
||||
}
|
||||
}
|
||||
```
|
||||
61
docs/snippets/java/plugins/plugin_validator.md
Normal file
61
docs/snippets/java/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,61 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.IValidator;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ValidatorBridge;
|
||||
|
||||
// Generic validator pattern: every IValidator has the same shape.
|
||||
// name() keys the registry, priority() orders execution (higher = earlier),
|
||||
// should_validate() is a fast skip-check, and validate() throws on failure.
|
||||
public class GenericValidator implements IValidator {
|
||||
private final String pluginName;
|
||||
private final int pluginPriority;
|
||||
|
||||
public GenericValidator(String pluginName, int pluginPriority) {
|
||||
this.pluginName = pluginName;
|
||||
this.pluginPriority = pluginPriority;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String name() {
|
||||
return pluginName;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String version() {
|
||||
return "1.0.0";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
// Optional: open resources, load config files, etc.
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutdown() {
|
||||
// Optional: release resources held in initialize().
|
||||
}
|
||||
|
||||
@Override
|
||||
public void validate(ExtractionResult result, ExtractionConfig config) throws Exception {
|
||||
if (result.content() == null || result.content().isBlank()) {
|
||||
throw new IllegalArgumentException("Extracted content is blank");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean should_validate(ExtractionResult _result, ExtractionConfig _config) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int priority() {
|
||||
return pluginPriority;
|
||||
}
|
||||
|
||||
public static void registerGenericValidator() {
|
||||
GenericValidator validator = new GenericValidator("non-empty-content", 200);
|
||||
ValidatorBridge.registerValidator(validator);
|
||||
}
|
||||
}
|
||||
```
|
||||
11
docs/snippets/java/plugins/quality_score_validator.md
Normal file
11
docs/snippets/java/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```java title="Java"
|
||||
Validator qualityValidator = result -> {
|
||||
double score = result.getQualityScore() != null ? result.getQualityScore() : 0.0;
|
||||
|
||||
if (score < 0.5) {
|
||||
throw new ValidationException(
|
||||
String.format("Quality score too low: %.2f < 0.50", score)
|
||||
);
|
||||
}
|
||||
};
|
||||
```
|
||||
27
docs/snippets/java/plugins/stateful_plugin.md
Normal file
27
docs/snippets/java/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```java title="Java"
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
class StatefulPlugin implements PostProcessor {
|
||||
// Use atomic types for simple counters
|
||||
private final AtomicInteger callCount = new AtomicInteger(0);
|
||||
|
||||
// Use concurrent collections for complex state
|
||||
private final ConcurrentHashMap<String, String> cache = new ConcurrentHashMap<>();
|
||||
|
||||
@Override
|
||||
public ExtractionResult process(ExtractionResult result) {
|
||||
// Increment counter atomically
|
||||
callCount.incrementAndGet();
|
||||
|
||||
// Update cache (thread-safe)
|
||||
cache.put("last_mime", result.mimeType());
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public int getCallCount() {
|
||||
return callCount.get();
|
||||
}
|
||||
}
|
||||
```
|
||||
11
docs/snippets/java/plugins/unregister_plugins.md
Normal file
11
docs/snippets/java/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.KreuzbergException;

try {
    // Unregister specific plugins by the names they were registered under
    Kreuzberg.unregisterPostProcessor("word-count");
    Kreuzberg.unregisterValidator("min-length");
} catch (KreuzbergException e) {
    System.err.println("Failed to unregister: " + e.getMessage());
}
|
||||
```
|
||||
31
docs/snippets/java/plugins/word_count_processor.md
Normal file
31
docs/snippets/java/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.PostProcessor;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class WordCountExample {
|
||||
public static void main(String[] args) {
|
||||
PostProcessor wordCount = result -> {
|
||||
long count = result.getContent().split("\\s+").length;
|
||||
|
||||
Map<String, Object> metadata = new HashMap<>(result.getMetadata());
|
||||
metadata.put("word_count", count);
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
try {
|
||||
Kreuzberg.registerPostProcessor("word-count", wordCount, 50);
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
|
||||
System.out.println("Word count: " + result.getMetadata().get("word_count"));
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
19
docs/snippets/java/utils/chunking.md
Normal file
19
docs/snippets/java/utils/chunking.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1500)
|
||||
.maxOverlap(200)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.builder()
|
||||
.type("preset")
|
||||
.name("text-embedding-all-minilm-l6-v2")
|
||||
.build())
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
19
docs/snippets/java/utils/chunking_rag.md
Normal file
19
docs/snippets/java/utils/chunking_rag.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
|
||||
// Chunking settings for RAG: maxChars 500, maxOverlap 50, "balanced" embedding.
var chunking = ChunkingConfig.builder()
        .maxChars(500)
        .maxOverlap(50)
        .embedding("balanced")
        .build();
ExtractionConfig config = ExtractionConfig.builder().chunking(chunking).build();

ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);

// Preview at most the first 100 characters; Math.min guards against shorter content.
int previewEnd = Math.min(100, result.getContent().length());
System.out.println("Content: " + result.getContent().substring(0, previewEnd) + "...");
|
||||
```
|
||||
12
docs/snippets/java/utils/embedding_with_chunking.md
Normal file
12
docs/snippets/java/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1024)
|
||||
.maxOverlap(100)
|
||||
.embedding("balanced")
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
4
docs/snippets/java/utils/keyword_extraction_example.md
Normal file
4
docs/snippets/java/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,4 @@
|
||||
```java title="Java"
|
||||
// Note: Keyword extraction is not yet available in Java bindings
|
||||
// This feature requires the 'keywords' feature flag and is planned for a future release
|
||||
```
|
||||
20
docs/snippets/java/utils/quality_processing_example.md
Normal file
20
docs/snippets/java/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
// Enable quality processing, extract, then report the score.
ExtractionConfig config = ExtractionConfig.builder().enableQualityProcessing(true).build();
ExtractionResult result = Kreuzberg.extractFile("scanned_document.pdf", config);

// A missing (null) score is treated as 0.0, which falls into the low-quality branch.
var reportedScore = result.getQualityScore();
double qualityScore = reportedScore != null ? reportedScore : 0.0;

boolean lowQuality = qualityScore < 0.5;
if (lowQuality) {
    System.out.printf("Warning: Low quality extraction (%.2f)%n", qualityScore);
    System.out.println("Consider re-scanning or adjusting OCR settings");
} else {
    System.out.printf("Quality score: %.2f%n", qualityScore);
}
|
||||
```
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user