Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
import dev.kreuzberg.*;

// Enable chunking (chunk size 500, overlap 50) together with page extraction
// so that each chunk's metadata can report the pages it came from.
var config = ExtractionConfig.builder()
    .chunking(ChunkingConfig.builder()
        .chunkSize(500)
        .overlap(50)
        .build())
    .pages(PageConfig.builder()
        .extractPages(true)
        .build())
    .build();

var result = Kreuzberg.extractFileSync("document.pdf", config);

if (result.chunks() != null) {
    for (var chunk : result.chunks()) {
        // Chunks without a first page carry no page information; skip them.
        if (chunk.metadata().firstPage() != null) {
            var pageRange = chunk.metadata().firstPage().equals(chunk.metadata().lastPage())
                ? "Page " + chunk.metadata().firstPage()
                : "Pages " + chunk.metadata().firstPage() + "-" + chunk.metadata().lastPage();
            // Clamp the preview: substring(0, 50) would throw
            // StringIndexOutOfBoundsException for chunks shorter than 50 characters.
            var text = chunk.text();
            var preview = text.substring(0, Math.min(50, text.length()));
            System.out.println("Chunk: " + preview + "... (" + pageRange + ")");
        }
    }
}

View File

@@ -0,0 +1,36 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.PageConfig;
import java.nio.file.Path;
import java.util.Optional;
// Build the chunking and page options separately, then assemble the config.
var chunking = ChunkingConfig.builder()
    .withMaxCharacters(500L)
    .withOverlap(50L)
    .build();
var pages = PageConfig.builder()
    .withExtractPages(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .withChunking(Optional.of(chunking))
    .withPages(Optional.of(pages))
    .build();

var result = Kreuzberg.extractFileSync(Path.of("document.pdf"), config);
var chunks = result.chunks();
if (chunks != null) {
    for (var chunk : chunks) {
        Long first = chunk.metadata().firstPage();
        Long last = chunk.metadata().lastPage();
        // Only chunks with both page bounds are reported.
        if (first == null || last == null) {
            continue;
        }

        String pageRange;
        if (first.equals(last)) {
            pageRange = "Page " + first;
        } else {
            pageRange = "Pages " + first + "-" + last;
        }

        // Print at most the first 50 characters of the chunk.
        String body = chunk.content();
        int previewLength = Math.min(50, body.length());
        System.out.println("Chunk: " + body.substring(0, previewLength) + "... (" + pageRange + ")");
    }
}
```

View File

@@ -0,0 +1,18 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
// Embedding options: preset model "all-minilm-l6-v2", normalize on, batch size 32.
var embedding = EmbeddingConfig.builder()
    .model(EmbeddingModelType.preset("all-minilm-l6-v2"))
    .normalize(true)
    .batchSize(32)
    .build();

// Chunking options (maxChars 1000, maxOverlap 200) carrying the embedding options.
var chunking = ChunkingConfig.builder()
    .maxChars(1000)
    .maxOverlap(200)
    .embedding(embedding)
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunking)
    .build();
```

View File

@@ -0,0 +1,35 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import java.util.List;
// Chunking (maxChars 500, maxOverlap 50) with embeddings from the
// "all-mpnet-base-v2" preset, processed in batches of 16.
var embedding = EmbeddingConfig.builder()
    .model(EmbeddingModelType.preset("all-mpnet-base-v2"))
    .normalize(true)
    .batchSize(16)
    .build();
var chunking = ChunkingConfig.builder()
    .maxChars(500)
    .maxOverlap(50)
    .embedding(embedding)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunking)
    .build();

try {
    ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);

    // Treat a missing chunk list as empty.
    List<Object> chunks = result.getChunks();
    if (chunks == null) {
        chunks = List.of();
    }
    System.out.println("Found " + chunks.size() + " chunks for RAG pipeline");

    // Preview up to three chunks, 80 characters each.
    int previewCount = Math.min(3, chunks.size());
    for (int i = 0; i < previewCount; i++) {
        String text = chunks.get(i).toString();
        String preview = text.substring(0, Math.min(80, text.length()));
        System.out.println("Chunk " + i + ": " + preview + "...");
    }
} catch (Exception ex) {
    System.err.println("RAG extraction failed: " + ex.getMessage());
}
```

View File

@@ -0,0 +1,38 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import java.util.List;
// Chunking (maxChars 512, maxOverlap 50) with the "balanced" embedding preset.
var embeddingOptions = EmbeddingConfig.builder()
    .model(EmbeddingModelType.preset("balanced"))
    .normalize(true)
    .batchSize(32)
    .showDownloadProgress(false)
    .build();
var chunkingOptions = ChunkingConfig.builder()
    .maxChars(512)
    .maxOverlap(50)
    .embedding(embeddingOptions)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunkingOptions)
    .build();

ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
List<Object> chunks = result.getChunks();
if (chunks == null) {
    chunks = List.of();
}

for (int i = 0; i < chunks.size(); i++) {
    Object chunk = chunks.get(i);
    String chunkId = "doc_chunk_" + i;
    String text = chunk.toString();
    System.out.println("Chunk " + chunkId + ": " + text.substring(0, Math.min(50, text.length())));

    // Only map-shaped chunks are inspected for an "embedding" entry.
    if (!(chunk instanceof java.util.Map)) {
        continue;
    }
    Object embedding = ((java.util.Map<?, ?>) chunk).get("embedding");
    if (embedding != null) {
        System.out.println(" Embedding dimensions: " + ((float[]) embedding).length);
    }
}
```

View File

@@ -0,0 +1,15 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.KeywordConfig;
import dev.kreuzberg.KeywordAlgorithm;
// Keyword extraction options: YAKE algorithm, at most 10 keywords,
// minimum score 0.3, n-gram range 1..3, language "en".
var keywordOptions = KeywordConfig.builder()
    .algorithm(KeywordAlgorithm.YAKE)
    .maxKeywords(10)
    .minScore(0.3)
    .ngramRange(1, 3)
    .language("en")
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .keywords(keywordOptions)
    .build();
```

View File

@@ -0,0 +1,30 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.KeywordConfig;
import dev.kreuzberg.KeywordAlgorithm;
import java.util.List;
import java.util.Map;
var keywordOptions = KeywordConfig.builder()
    .algorithm(KeywordAlgorithm.YAKE)
    .maxKeywords(10)
    .minScore(0.3)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .keywords(keywordOptions)
    .build();

ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);

// Fall back to an empty map when the result has no metadata.
Map<String, Object> metadata = result.getMetadata();
if (metadata == null) {
    metadata = Map.of();
}

// Keywords are read from the "keywords" metadata entry as a list of maps,
// each with a "text" and a numeric "score".
if (metadata.containsKey("keywords")) {
    List<Map<String, Object>> keywords = (List<Map<String, Object>>) metadata.get("keywords");
    keywords.forEach(entry -> {
        String text = (String) entry.get("text");
        double score = ((Number) entry.get("score")).doubleValue();
        System.out.println(text + ": " + String.format("%.3f", score));
    });
}
```

View File

@@ -0,0 +1,13 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.LanguageDetectionConfig;
import java.math.BigDecimal;
// Language detection: enabled, minimum confidence 0.8, detectMultiple off.
var detection = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(new BigDecimal("0.8"))
    .detectMultiple(false)
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(detection)
    .build();
```

View File

@@ -0,0 +1,35 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.LanguageDetectionConfig;
import java.math.BigDecimal;
import java.util.List;
// Language detection with detectMultiple enabled and minimum confidence 0.8.
var detection = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(new BigDecimal("0.8"))
    .detectMultiple(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(detection)
    .build();

try {
    ExtractionResult result = Kreuzberg.extractFile("multilingual_document.pdf", config);

    // A null language list is handled the same way as an empty one.
    List<String> languages = result.getDetectedLanguages();
    if (languages == null) {
        languages = List.of();
    }

    if (languages.isEmpty()) {
        System.out.println("No languages detected");
    } else {
        String joined = String.join(", ", languages);
        System.out.println("Detected " + languages.size() + " language(s): " + joined);
    }

    System.out.println("Total content: " + result.getContent().length() + " characters");
    System.out.println("MIME type: " + result.getMimeType());
} catch (Exception ex) {
    System.err.println("Processing failed: " + ex.getMessage());
}
```

View File

@@ -0,0 +1,7 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
// Build an extraction config with quality processing switched on;
// every other option is left at whatever the builder provides.
ExtractionConfig config = ExtractionConfig.builder()
.enableQualityProcessing(true)
.build();
```

View File

@@ -0,0 +1,21 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.util.Map;
ExtractionConfig config = ExtractionConfig.builder()
    .enableQualityProcessing(true)
    .build();

ExtractionResult result = Kreuzberg.extractFile("scanned_document.pdf", config);

// A missing quality score is treated as 0.0.
var reportedScore = result.getQualityScore();
double qualityScore = reportedScore != null ? reportedScore : 0.0;

// Scores below 0.5 produce a warning plus a hint; anything else is just reported.
boolean lowQuality = qualityScore < 0.5;
if (lowQuality) {
    String warning = String.format("Warning: Low quality extraction (%.2f)", qualityScore);
    System.out.println(warning);
    System.out.println("Consider re-scanning with higher DPI or adjusting OCR settings");
} else {
    String summary = String.format("Quality score: %.2f", qualityScore);
    System.out.println(summary);
}
```

View File

@@ -0,0 +1,13 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.TokenReductionConfig;
// Token reduction in "moderate" mode, preserving markdown and code,
// with language hint "eng".
var reduction = TokenReductionConfig.builder()
    .mode("moderate")
    .preserveMarkdown(true)
    .preserveCode(true)
    .languageHint("eng")
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .tokenReduction(reduction)
    .build();
```

View File

@@ -0,0 +1,33 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.TokenReductionConfig;
import java.util.Map;
var reduction = TokenReductionConfig.builder()
    .mode("moderate")
    .preserveMarkdown(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .tokenReduction(reduction)
    .build();

ExtractionResult result = Kreuzberg.extractFile("verbose_document.pdf", config);

// Fall back to an empty map when the result has no metadata.
Map<String, Object> metadata = result.getMetadata();
if (metadata == null) {
    metadata = Map.of();
}

// Each statistic defaults to zero when its metadata key is absent.
int original = ((Number) metadata.getOrDefault("original_token_count", 0)).intValue();
int reduced = ((Number) metadata.getOrDefault("token_count", 0)).intValue();
double ratio = ((Number) metadata.getOrDefault("token_reduction_ratio", 0.0)).doubleValue();

System.out.println("Reduced from " + original + " to " + reduced + " tokens");
System.out.println(String.format("Reduction: %.1f%%", ratio * 100));
```

View File

@@ -0,0 +1,67 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Example: extract a document into chunks with embeddings and convert each
 * chunk into a {@link VectorRecord} that is handed to a (placeholder) store.
 */
public class VectorDatabaseIntegration {
    /** Mutable holder for one chunk: id, embedding vector, text and string metadata. */
    public static class VectorRecord {
        public String id;
        public float[] embedding;
        public String content;
        public Map<String, String> metadata;
    }

    /**
     * Extracts {@code documentPath} with chunking and embeddings enabled and builds
     * one record per chunk.
     *
     * @param documentPath path of the document to extract
     * @param documentId   identifier used as the prefix of each record id and stored
     *                     under the {@code document_id} metadata key
     * @return the records, in chunk order; also passed to {@code storeInVectorDatabase}
     * @throws Exception if extraction fails
     */
    public static List<VectorRecord> extractAndVectorize(String documentPath, String documentId) throws Exception {
        ExtractionConfig config = ExtractionConfig.builder()
            .chunking(ChunkingConfig.builder()
                .maxChars(512)
                .maxOverlap(50)
                .embedding(EmbeddingConfig.builder()
                    .model(EmbeddingModelType.preset("balanced"))
                    .normalize(true)
                    .batchSize(32)
                    .build())
                .build())
            .build();

        ExtractionResult result = Kreuzberg.extractFile(documentPath, config);
        List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();
        List<VectorRecord> vectorRecords = new java.util.ArrayList<>(chunks.size());

        for (int index = 0; index < chunks.size(); index++) {
            // The original snippet tested an undeclared `chunk`; bind it here.
            Object chunk = chunks.get(index);

            VectorRecord entry = new VectorRecord();
            entry.id = documentId + "_chunk_" + index;
            entry.metadata = new HashMap<>();
            entry.metadata.put("document_id", documentId);
            entry.metadata.put("chunk_index", String.valueOf(index));

            // Only map-shaped chunks expose "content" and "embedding" entries.
            if (chunk instanceof Map) {
                Map<?, ?> chunkMap = (Map<?, ?>) chunk;
                entry.content = (String) chunkMap.get("content");
                entry.embedding = (float[]) chunkMap.get("embedding");
                // "content" may be absent from the map; avoid a NullPointerException.
                if (entry.content != null) {
                    entry.metadata.put("content_length", String.valueOf(entry.content.length()));
                }
            }
            vectorRecords.add(entry);
        }

        storeInVectorDatabase(vectorRecords);
        return vectorRecords;
    }

    /** Placeholder sink: prints a line for every record that has a non-empty embedding. */
    private static void storeInVectorDatabase(List<VectorRecord> records) {
        for (VectorRecord entry : records) {
            if (entry.embedding != null && entry.embedding.length > 0) {
                // An embedding can exist without text; report zero characters then.
                int contentLength = entry.content != null ? entry.content.length() : 0;
                System.out.println("Storing " + entry.id + ": " + contentLength
                    + " chars, " + entry.embedding.length + " dims");
            }
        }
    }
}
```

View File

@@ -0,0 +1,22 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.BatchBytesItem;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Arrays;
// Load both documents fully into memory.
byte[] pdfBytes = Files.readAllBytes(Paths.get("doc1.pdf"));
byte[] docxBytes = Files.readAllBytes(Paths.get("doc2.docx"));

// One batch item per document: raw bytes, MIME type, and a null third argument.
BatchBytesItem pdfItem = new BatchBytesItem(pdfBytes, "application/pdf", null);
BatchBytesItem docxItem = new BatchBytesItem(
    docxBytes,
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    null);
List<BatchBytesItem> items = Arrays.asList(pdfItem, docxItem);

ExtractionConfig config = ExtractionConfig.builder().build();
List<ExtractionResult> results = Kreuzberg.batchExtractBytesSync(items, config);
System.out.println("Processed " + results.size() + " documents");
```

View File

@@ -0,0 +1,22 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.BatchFileItem;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;
import java.util.List;
import java.util.Arrays;
// One batch item per input path; the second constructor argument is left null.
BatchFileItem pdfItem = new BatchFileItem(Paths.get("doc1.pdf"), null);
BatchFileItem docxItem = new BatchFileItem(Paths.get("doc2.docx"), null);
BatchFileItem pptxItem = new BatchFileItem(Paths.get("doc3.pptx"), null);
List<BatchFileItem> items = Arrays.asList(pdfItem, docxItem, pptxItem);

ExtractionConfig config = ExtractionConfig.builder().build();
List<ExtractionResult> results = Kreuzberg.batchExtractFilesSync(items, config);

// Report the content length of every extracted document.
results.forEach(extracted ->
    System.out.println("Content length: " + extracted.content().length()));
```

View File

@@ -0,0 +1,30 @@
<!-- snippet:skip -->
```java title="Java"
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.URI;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.List;
// JSON payload shapes; @JsonProperty maps Java names to the snake_case field names.
record ChunkRequest(
    String text,
    @JsonProperty("chunker_type") String chunkerType,
    ChunkConfig config) {}

record ChunkConfig(
    @JsonProperty("max_characters") int maxCharacters,
    int overlap,
    boolean trim) {}

record ChunkItem(
    String content,
    @JsonProperty("byte_start") int byteStart,
    @JsonProperty("chunk_index") int chunkIndex) {}

HttpClient client = HttpClient.newHttpClient();
ObjectMapper mapper = new ObjectMapper();

// Serialize the chunking request.
ChunkConfig chunkConfig = new ChunkConfig(1000, 50, true);
ChunkRequest payload = new ChunkRequest("Your long text here...", "text", chunkConfig);
String body = mapper.writeValueAsString(payload);

// POST it as JSON to the /chunk endpoint.
HttpRequest request = HttpRequest.newBuilder()
    .uri(URI.create("http://localhost:8000/chunk"))
    .header("Content-Type", "application/json")
    .POST(HttpRequest.BodyPublishers.ofString(body))
    .build();
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());

// Read "chunk_count" from the JSON response.
var result = mapper.readTree(response.body());
int chunkCount = result.get("chunk_count").asInt();
System.out.println("Created " + chunkCount + " chunks");
```

View File

@@ -0,0 +1,22 @@
```java title="Java"
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Paths;
HttpClient client = HttpClient.newHttpClient();

// Read the whole document into memory before uploading it.
byte[] content = Files.readAllBytes(Paths.get("document.pdf"));

// POST the raw bytes to the /extract endpoint.
HttpRequest request = HttpRequest.newBuilder()
    .uri(URI.create("http://localhost:8000/extract"))
    .header("Content-Type", "application/octet-stream")
    .POST(HttpRequest.BodyPublishers.ofByteArray(content))
    .build();

HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println(response.body());
```

View File

@@ -0,0 +1,28 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.*;
import java.nio.file.Paths;
import java.util.Optional;
// Combine OCR (backend "tesseract", languages "eng" and "deu"), chunking
// (max chars 512, max overlap 50) and quality processing in one config.
ExtractionConfig config = ExtractionConfig.builder()
    .withOcr(Optional.of(OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguages(Optional.of(java.util.List.of("eng", "deu")))
        .build()))
    .withChunking(Optional.of(ChunkingConfig.builder()
        .withMaxChars(Optional.of(512L))
        .withMaxOverlap(Optional.of(50L))
        .build()))
    .withEnableQualityProcessing(true)
    .build();

ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), config);

// Clamp the preview length: substring(0, 100) throws
// StringIndexOutOfBoundsException when the content is shorter than 100 characters.
String content = result.content();
String preview = content.substring(0, Math.min(100, content.length()));
System.out.println("Content: " + preview + "...");

// Tables and the quality score are only reported when present.
if (result.tables() != null) {
    System.out.println("Tables: " + result.tables().size());
}
if (result.qualityScore() != null) {
    System.out.println("Quality: " + result.qualityScore());
}
```

View File

@@ -0,0 +1,16 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.KreuzbergRsException;
import java.nio.file.Paths;
// Attempt to extract "missing.pdf" with a default-built config and print its content.
try {
ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("missing.pdf"), config);
System.out.println(result.content());
} catch (KreuzbergRsException e) {
// On failure, report both the exception message and its error code on stderr.
System.err.println("Extraction failed: " + e.getMessage());
System.err.println("Error code: " + e.getCode());
}
```

View File

@@ -0,0 +1,28 @@
```java title="Java"
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Paths;
import com.fasterxml.jackson.databind.ObjectMapper;
HttpClient client = HttpClient.newHttpClient();
byte[] fileBytes = Files.readAllBytes(Paths.get("document.pdf"));
var request = HttpRequest.newBuilder()
.uri(URI.create("http://localhost:8000/extract"))
.header("Content-Type", "application/octet-stream")
.POST(HttpRequest.BodyPublishers.ofByteArray(fileBytes))
.build();
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
if (response.statusCode() != 200) {
ObjectMapper mapper = new ObjectMapper();
var error = mapper.readTree(response.body());
System.err.println("Error: " + error.get("error_type").asText() + " - " + error.get("message").asText());
} else {
System.out.println("Success: " + response.body());
}
```

View File

@@ -0,0 +1,14 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Files;
import java.nio.file.Paths;
// Read the whole file into memory, then extract from the bytes with an explicit MIME type.
byte[] data = Files.readAllBytes(Paths.get("document.pdf"));
ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractBytes(data, "application/pdf", config);
// Print the extracted content followed by the MIME type reported on the result.
System.out.println(result.content());
System.out.println(result.mimeType());
```

View File

@@ -0,0 +1,14 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Files;
import java.nio.file.Paths;
// Read the whole file into memory, then extract from the bytes via extractBytesSync
// with an explicit MIME type.
byte[] data = Files.readAllBytes(Paths.get("document.pdf"));
ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractBytesSync(data, "application/pdf", config);
// Print the extracted content followed by the MIME type reported on the result.
System.out.println(result.content());
System.out.println(result.mimeType());
```

View File

@@ -0,0 +1,12 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;
// Extract a file by path using a default-built config.
ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractFile(Paths.get("document.pdf"), config);
// Print the extracted content followed by the MIME type reported on the result.
System.out.println(result.content());
System.out.println(result.mimeType());
```

View File

@@ -0,0 +1,13 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;
// Extract a file by path via extractFileSync using a default-built config.
ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), config);
System.out.println(result.content());
// tables() may be null, in which case a count of 0 is printed.
System.out.println("Tables: " + (result.tables() != null ? result.tables().size() : 0));
System.out.println("Metadata: " + result.metadata());
```

View File

@@ -0,0 +1,62 @@
```java title="SimpleBenchmark.java"
import com.kreuzberg.*;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ForkJoinPool;
/**
 * Small benchmark that times sequential extraction, parallel extraction on the
 * common fork-join pool, and a first/second extraction with caching enabled.
 */
public final class SimpleBenchmark {
    private SimpleBenchmark() {}

    /**
     * Runs the three timing scenarios against {@code document.pdf} and prints the results.
     *
     * @param args unused
     * @throws Exception if any extraction fails, including one of the parallel runs
     */
    public static void main(String[] args) throws Exception {
        ExtractionConfig config = new ExtractionConfig.Builder()
            .useCache(false)
            .build();
        Kreuzberg kreuzberg = new Kreuzberg(config);
        String filePath = "document.pdf";
        int numRuns = 10;

        // Scenario 1: sequential runs with caching disabled.
        System.out.println("Sync extraction (" + numRuns + " runs):");
        long start = System.nanoTime();
        for (int i = 0; i < numRuns; i++) {
            kreuzberg.extractFile(filePath);
        }
        double syncDuration = secondsSince(start);
        double avgSync = syncDuration / numRuns;
        System.out.println(" - Total time: " + String.format("%.3f", syncDuration) + "s");
        System.out.println(" - Average: " + String.format("%.3f", avgSync) + "s per extraction");

        // Scenario 2: the same number of runs submitted together to the common pool.
        System.out.println("\nAsync extraction (" + numRuns + " parallel runs):");
        List<Callable<ExtractionResult>> tasks = new ArrayList<>();
        for (int i = 0; i < numRuns; i++) {
            tasks.add(() -> kreuzberg.extractFile(filePath));
        }
        start = System.nanoTime();
        List<java.util.concurrent.Future<ExtractionResult>> futures =
            ForkJoinPool.commonPool().invokeAll(tasks);
        double asyncDuration = secondsSince(start);
        // invokeAll has already waited for completion, so get() returns immediately;
        // it is called so that a failed task surfaces instead of being timed as a success.
        for (java.util.concurrent.Future<ExtractionResult> future : futures) {
            future.get();
        }
        System.out.println(" - Total time: " + String.format("%.3f", asyncDuration) + "s");
        System.out.println(" - Average: " + String.format("%.3f", asyncDuration / numRuns) + "s per extraction");
        System.out.println(" - Speedup: " + String.format("%.1f", syncDuration / asyncDuration) + "x");

        // Scenario 3: two consecutive runs with caching enabled.
        ExtractionConfig cacheConfig = new ExtractionConfig.Builder()
            .useCache(true)
            .build();
        Kreuzberg kreuzbergCached = new Kreuzberg(cacheConfig);

        System.out.println("\nFirst extraction (populates cache)...");
        start = System.nanoTime();
        kreuzbergCached.extractFile(filePath);
        double firstDuration = secondsSince(start);
        System.out.println(" - Time: " + String.format("%.3f", firstDuration) + "s");

        System.out.println("Second extraction (from cache)...");
        start = System.nanoTime();
        kreuzbergCached.extractFile(filePath);
        double cachedDuration = secondsSince(start);
        System.out.println(" - Time: " + String.format("%.3f", cachedDuration) + "s");
        System.out.println(" - Cache speedup: " + String.format("%.1f", firstDuration / cachedDuration) + "x");
    }

    /** Returns the seconds elapsed since the given {@link System#nanoTime()} reading. */
    private static double secondsSince(long startNanos) {
        return (System.nanoTime() - startNanos) / 1_000_000_000.0;
    }
}
```

46
docs/snippets/java/cache/DiskCache.java vendored Normal file
View File

@@ -0,0 +1,46 @@
```java title="DiskCache.java"
import com.kreuzberg.*;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Example that extracts the same document twice with a disk cache directory
 * configured, then prints the cache statistics.
 */
public final class DiskCache {
    private DiskCache() {}

    /**
     * Creates the cache directory under the user's home, runs two extractions
     * and prints per-run details and cache statistics.
     *
     * @param args unused
     * @throws Exception if the directory cannot be created or extraction fails
     */
    public static void main(String[] args) throws Exception {
        String cacheDir = System.getProperty("user.home") + "/.cache/kreuzberg";
        Files.createDirectories(Paths.get(cacheDir));

        // NOTE(review): the numeric arguments look like a size limit and a
        // time-to-live; confirm against the CacheConfig constructor.
        CacheConfig cacheConfig = new CacheConfig(cacheDir, 500L * 1024 * 1024, 7L * 86400, true);
        ExtractionConfig config = new ExtractionConfig.Builder()
            .useCache(true)
            .cacheConfig(cacheConfig)
            .build();
        Kreuzberg kreuzberg = new Kreuzberg(config);

        System.out.println("First extraction (will be cached)...");
        ExtractionResult first = kreuzberg.extractFile("document.pdf");
        printRun(first);

        System.out.println("\nSecond extraction (from cache)...");
        ExtractionResult second = kreuzberg.extractFile("document.pdf");
        printRun(second);

        boolean sameContent = first.content().equals(second.content());
        System.out.println("\nResults are identical: " + sameContent);

        CacheStats stats = kreuzberg.getCacheStats();
        double sizeMb = stats.cacheSizeBytes() / 1024.0 / 1024.0;
        double hitPercent = stats.hitRate() * 100;
        System.out.println("\nCache Statistics:");
        System.out.println(" - Total entries: " + stats.totalEntries());
        System.out.println(" - Cache size: " + String.format("%.1f", sizeMb) + " MB");
        System.out.println(" - Hit rate: " + String.format("%.1f", hitPercent) + "%");
    }

    /** Prints the content length and the cached flag of one extraction result. */
    private static void printRun(ExtractionResult run) {
        System.out.println(" - Content length: " + run.content().length());
        System.out.println(" - Cached: " + run.metadata().wasCached());
    }
}
```

View File

@@ -0,0 +1,41 @@
```java title="BasicCli.java"
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
/** Example that shells out to the {@code kreuzberg} command-line tool. */
public final class BasicCli {
    private BasicCli() {}

    /**
     * Runs {@code kreuzberg extract <filePath> --format <outputFormat>} and returns
     * what the process printed.
     *
     * @param filePath     path of the document passed to the CLI
     * @param outputFormat value passed to {@code --format}
     * @return the combined stdout/stderr of the process, trimmed
     * @throws IOException          if the process cannot be started or its output cannot be read
     * @throws InterruptedException if the thread is interrupted while waiting for the process
     * @throws RuntimeException     if the process exits with a non-zero code
     */
    public static String extractWithCli(String filePath, String outputFormat) throws IOException, InterruptedException {
        ProcessBuilder pb = new ProcessBuilder("kreuzberg", "extract", filePath, "--format", outputFormat);
        // stderr is merged into stdout so that failures show up in the captured output.
        pb.redirectErrorStream(true);
        Process process = pb.start();

        StringBuilder output = new StringBuilder();
        // Decode explicitly as UTF-8 rather than relying on the platform default
        // charset, which differs between systems on Java versions before 18.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(process.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append("\n");
            }
        }

        // The stream is drained before waiting, so the child cannot block on a full pipe.
        int exitCode = process.waitFor();
        if (exitCode != 0) {
            throw new RuntimeException("CLI exited with code " + exitCode + ": " + output);
        }
        return output.toString().trim();
    }

    /**
     * Extracts {@code document.pdf} once as text and once as JSON and prints the output sizes.
     *
     * @param args unused
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        String document = "document.pdf";
        String textOutput = extractWithCli(document, "text");
        System.out.println("Extracted: " + textOutput.length() + " characters");
        String jsonOutput = extractWithCli(document, "json");
        System.out.println("JSON output received: " + jsonOutput.length() + " bytes");
    }
}
```

View File

@@ -0,0 +1,56 @@
```java title="CliWithConfig.java"
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
/** Example that runs the {@code kreuzberg} CLI with a config file and parses its JSON output. */
public final class CliWithConfig {
    private static final ObjectMapper MAPPER = new ObjectMapper();

    private CliWithConfig() {}

    /**
     * Runs {@code kreuzberg extract <filePath> --config <configPath> --format json}
     * and parses the process output as JSON.
     *
     * @param filePath   path of the document passed to the CLI
     * @param configPath path passed to {@code --config}
     * @return the parsed JSON tree
     * @throws IOException          if the process cannot be started, its output cannot
     *                              be read, or the output is not valid JSON
     * @throws InterruptedException if the thread is interrupted while waiting for the process
     * @throws RuntimeException     if the process exits with a non-zero code
     */
    public static JsonNode extractWithConfig(String filePath, String configPath)
            throws IOException, InterruptedException {
        ProcessBuilder pb = new ProcessBuilder(
            "kreuzberg",
            "extract",
            filePath,
            "--config",
            configPath,
            "--format",
            "json");
        // NOTE(review): stderr is merged into stdout so failures appear in the exception
        // message; anything the CLI writes to stderr on success would also reach the parser.
        pb.redirectErrorStream(true);
        Process process = pb.start();

        StringBuilder output = new StringBuilder();
        // Decode explicitly as UTF-8 rather than relying on the platform default charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(process.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep line breaks so multi-line error output stays readable;
                // newlines between JSON tokens are insignificant whitespace.
                output.append(line).append('\n');
            }
        }

        int exitCode = process.waitFor();
        if (exitCode != 0) {
            throw new RuntimeException("CLI exited with code " + exitCode + ": " + output);
        }
        return MAPPER.readTree(output.toString());
    }

    /**
     * Extracts {@code document.pdf} using {@code kreuzberg.toml} and prints the
     * "content" length plus the "format" and "languages" fields.
     *
     * @param args unused
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        String configFile = "kreuzberg.toml";
        String document = "document.pdf";
        System.out.println("Extracting " + document + " with config " + configFile);
        JsonNode result = extractWithConfig(document, configFile);
        String content = result.get("content").asText();
        System.out.println("Content length: " + content.length());
        System.out.println("Format: " + result.get("format").asText());
        System.out.println("Languages: " + result.get("languages").toString());
    }
}
```

View File

@@ -0,0 +1,52 @@
```java title="Element-Based Output (Java)"
import io.kreuzberg.Kreuzberg;
import io.kreuzberg.ExtractionConfig;
import io.kreuzberg.ExtractionResult;
import io.kreuzberg.Element;
import io.kreuzberg.OutputFormat;
/** Example that requests element-based output and walks the resulting elements. */
public class ElementBasedOutput {
    /**
     * Extracts {@code document.pdf} in element-based format, prints each element,
     * then prints every element whose type is "title" with its "level".
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Request element-based output.
        ExtractionConfig config = new ExtractionConfig();
        config.setOutputFormat(OutputFormat.ELEMENT_BASED);

        ExtractionResult result = Kreuzberg.extractFileSync("document.pdf", config);

        // Print type, up to 100 characters of text, and any page/coordinates.
        for (Element element : result.getElements()) {
            System.out.println("Type: " + element.getElementType());

            String fullText = element.getText();
            String shown = fullText.substring(0, Math.min(100, fullText.length()));
            System.out.println("Text: " + shown);

            var metadata = element.getMetadata();
            var page = metadata.getPageNumber();
            if (page != null) {
                System.out.println("Page: " + page);
            }

            var coords = metadata.getCoordinates();
            if (coords != null) {
                System.out.printf("Coords: (%f, %f) - (%f, %f)%n",
                    coords.getLeft(), coords.getTop(),
                    coords.getRight(), coords.getBottom());
            }
            System.out.println("---");
        }

        // Second pass: only elements of type "title".
        for (Element candidate : result.getElements()) {
            if (!"title".equals(candidate.getElementType())) {
                continue;
            }
            // "level" comes from the additional metadata, defaulting to "unknown".
            String level = (String) candidate.getMetadata()
                .getAdditional()
                .getOrDefault("level", "unknown");
            System.out.printf("[%s] %s%n", level, candidate.getText());
        }
    }
}
```

View File

@@ -0,0 +1,41 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.*;
import java.io.IOException;
/** Example that enables OCR, chunking, token reduction and language detection together. */
public class Main {
    /**
     * Extracts {@code document.pdf} with the combined config and prints the detected
     * languages, if any. Failures are reported on stderr.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            ExtractionConfig config = ExtractionConfig.builder()
                .ocr(OcrConfig.builder()
                    .backend("tesseract")
                    .language("eng+deu")
                    .build())
                .chunking(ChunkingConfig.builder()
                    .maxChars(1000)
                    .maxOverlap(100)
                    .build())
                .tokenReduction(TokenReductionConfig.builder()
                    .mode("moderate")
                    .preserveImportantWords(true)
                    .build())
                .languageDetection(LanguageDetectionConfig.builder()
                    .enabled(true)
                    .build())
                .useCache(true)
                .enableQualityProcessing(true)
                .build();

            ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);

            // Guard against a null list as well as an empty one, matching how the
            // language-detection example handles getDetectedLanguages().
            var languages = result.getDetectedLanguages();
            if (languages != null && !languages.isEmpty()) {
                System.out.println("Languages: " + languages);
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
```

View File

@@ -0,0 +1,60 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
// Chunking options: maxChars 1000, maxOverlap 200.
var chunking = ChunkingConfig.builder()
    .maxChars(1000)
    .maxOverlap(200)
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunking)
    .build();
```
```java title="Java - Markdown with Heading Context"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.HeadingContext;
import dev.kreuzberg.HeadingLevel;
// Markdown chunker (maxChars 500, maxOverlap 50) sized with the
// "Xenova/gpt-4o" tokenizer.
var chunking = ChunkingConfig.builder()
    .chunkerType("markdown")
    .maxChars(500)
    .maxOverlap(50)
    .sizingTokenizer("Xenova/gpt-4o")
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunking)
    .build();

ExtractionResult result = KreuzbergClient.extractFile("document.md", config);

// For every chunk that has a heading context, list its headings with their levels.
result.getChunks().forEach(chunk -> {
    var headingContext = chunk.getMetadata().getHeadingContext();
    if (!headingContext.isPresent()) {
        return;
    }
    System.out.println("Headings:");
    headingContext.get().getHeadings().forEach(heading -> {
        String line = " Level " + heading.getLevel() + ": " + heading.getText();
        System.out.println(line);
    });
});
```
```java title="Java - Prepend Heading Context"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
// Markdown chunker (maxChars 500, maxOverlap 50) with prependHeadingContext enabled.
var chunking = ChunkingConfig.builder()
    .chunkerType("markdown")
    .maxChars(500)
    .maxOverlap(50)
    .prependHeadingContext(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunking)
    .build();

ExtractionResult result = KreuzbergClient.extractFile("document.md", config);

// Print up to the first 100 characters of each chunk; with this option the
// content is expected to start with the chunk's heading breadcrumb.
result.getChunks().forEach(chunk -> {
    String content = chunk.getContent();
    int previewLength = Math.min(100, content.length());
    System.out.println(content.substring(0, previewLength));
});
```

View File

@@ -0,0 +1,11 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
// Build a config with caching and quality processing enabled, then extract a file with it.
ExtractionConfig config = ExtractionConfig.builder()
.useCache(true)
.enableQualityProcessing(true)
.build();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
```

View File

@@ -0,0 +1,8 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
// Obtain the config from Kreuzberg.discoverExtractionConfig() instead of building it
// in code, then extract a file with it.
ExtractionConfig config = Kreuzberg.discoverExtractionConfig();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
```

View File

@@ -0,0 +1,14 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Path;
/** Loads an extraction configuration from {@code kreuzberg.toml} and prints the detected MIME type. */
public final class ConfigFileExample {
    public static void main(String[] args) throws Exception {
        Path configFile = Path.of("kreuzberg.toml");
        Path document = Path.of("document.pdf");

        ExtractionConfig loadedConfig = Kreuzberg.loadExtractionConfigFromFile(configFile);
        ExtractionResult extraction = Kreuzberg.extractFile(document, loadedConfig);

        System.out.printf("Detected MIME: %s%n", extraction.getMimeType());
    }
}
```

View File

@@ -0,0 +1,15 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.TesseractConfig;
// Tesseract-specific options are built first and then attached to the OCR settings.
var tesseractOptions = TesseractConfig.builder()
    .psm(3)
    .build();
var ocrSettings = OcrConfig.builder()
    .backend("tesseract")
    .language("eng+fra")
    .tesseractConfig(tesseractOptions)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .ocr(ocrSettings)
    .build();
```

View File

@@ -0,0 +1,31 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.TesseractConfig;
/** Builds an extraction configuration entirely in code and prints the extracted content length. */
public final class ProgrammaticConfigExample {
    public static void main(String[] args) throws Exception {
        // OCR section: Tesseract backend with its own nested options.
        var ocrSettings = OcrConfig.builder()
            .backend("tesseract")
            .language("eng+deu")
            .tesseractConfig(TesseractConfig.builder().psm(6).build())
            .build();

        // Chunking section.
        var chunkSettings = ChunkingConfig.builder()
            .maxChars(1000)
            .maxOverlap(200)
            .build();

        ExtractionConfig config = ExtractionConfig.builder()
            .ocr(ocrSettings)
            .chunking(chunkSettings)
            .useCache(true)
            .enableQualityProcessing(true)
            .build();

        ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
        var contentLength = result.getContent().length();
        System.out.printf("Content length: %d%n", contentLength);
    }
}
```

View File

@@ -0,0 +1,18 @@
```java title="Document Structure Config (Java)"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ExtractionResult;
ExtractionConfig config = ExtractionConfig.builder()
    .includeDocumentStructure(true)
    .build();

ExtractionResult result = Kreuzberg.extractFileSync("document.pdf", config);

// Print the node type of every node, but only when a document structure was returned.
result.getDocumentStructure().ifPresent(structure -> {
    for (var node : structure.nodes()) {
        System.out.println("[" + node.content().nodeType() + "]");
    }
});
```

View File

@@ -0,0 +1,53 @@
```java title="Element-Based Output (Java)"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.Element;
import dev.kreuzberg.ResultFormat;
import java.nio.file.Path;
import java.util.List;
/** Extracts a PDF in element-based format, dumps every element, then lists the title elements. */
public class ElementBasedOutput {
    public static void main(String[] args) throws Exception {
        // Request element-based output.
        ExtractionConfig elementConfig = ExtractionConfig.builder()
            .withResultFormat(ResultFormat.ElementBased)
            .build();

        ExtractionResult extraction =
            Kreuzberg.extractFileSync(Path.of("document.pdf"), elementConfig);

        List<Element> elements = extraction.elements();
        if (elements == null) {
            // Nothing to show when no element list was returned.
            return;
        }

        // First pass: print type, text (capped at 100 characters) and optional location details.
        for (Element current : elements) {
            System.out.println("Type: " + current.elementType());

            String fullText = current.text();
            String shownText = fullText.length() > 100 ? fullText.substring(0, 100) : fullText;
            System.out.println("Text: " + shownText);

            if (current.metadata().pageNumber() != null) {
                System.out.println("Page: " + current.metadata().pageNumber());
            }
            if (current.metadata().coordinates() != null) {
                System.out.println("Coords: " + current.metadata().coordinates());
            }
            System.out.println("---");
        }

        // Second pass: keep only elements whose type reads "Title" (case-insensitive).
        for (Element candidate : elements) {
            if (!"Title".equalsIgnoreCase(String.valueOf(candidate.elementType()))) {
                continue;
            }
            String level = candidate.metadata().additional().getOrDefault("level", "unknown");
            System.out.printf("[%s] %s%n", level, candidate.text());
        }
    }
}
```

View File

@@ -0,0 +1,87 @@
import kreuzberg.config.EmbeddingConfig;
import kreuzberg.config.EmbeddingModelType;
import kreuzberg.config.ChunkingConfig;
import kreuzberg.config.ExtractionConfig;
/** Shows several ways to build an EmbeddingConfig and how to attach one to a ChunkingConfig. */
public class EmbeddingConfigExample {
    public static void main(String[] args) {
        // Example 1: Preset model (recommended)
        // Fast, balanced, or quality preset configurations optimized for common use cases.
        // Available presets:
        // - "fast" (384 dims): Quick prototyping, development, resource-constrained
        // - "balanced" (768 dims): Production, general-purpose RAG, English documents
        // - "quality" (1024 dims): Complex documents, maximum accuracy
        // - "multilingual" (768 dims): International documents, 100+ languages
        EmbeddingConfig presetEmbedding = EmbeddingConfig.builder()
            .model(EmbeddingModelType.preset("balanced"))
            .batchSize(32)
            .normalize(true)
            .showDownloadProgress(true)
            .cacheDir("~/.cache/kreuzberg/embeddings")
            .build();

        // Example 2: Custom ONNX model (requires embeddings feature)
        // Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
        // Popular ONNX-compatible models:
        // - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
        // - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
        // - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
        // - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
        EmbeddingConfig customEmbedding = EmbeddingConfig.builder()
            .model(EmbeddingModelType.custom("BAAI/bge-small-en-v1.5", 384))
            .batchSize(32)
            .normalize(true)
            .showDownloadProgress(true)
            .cacheDir(null) // Uses default: .kreuzberg/embeddings/
            .build();

        // Example 3: Alternative Custom ONNX Model
        // For advanced users wanting different ONNX embedding models.
        EmbeddingConfig alternativeEmbedding = EmbeddingConfig.builder()
            .model(EmbeddingModelType.custom("sentence-transformers/all-mpnet-base-v2", 768))
            .batchSize(16) // Larger model requires smaller batch size
            .normalize(true)
            .showDownloadProgress(true)
            .cacheDir("/var/cache/embeddings")
            .build();

        // Integration with ChunkingConfig
        // The embedding settings are built on their own and then attached to the chunking settings.
        EmbeddingConfig chunkEmbedding = EmbeddingConfig.builder()
            .model(EmbeddingModelType.preset("balanced"))
            .batchSize(32)
            .normalize(true)
            .build();
        ChunkingConfig chunkingConfig = ChunkingConfig.builder()
            .maxChars(1024)
            .maxOverlap(100)
            .preset("balanced")
            .embedding(chunkEmbedding)
            .build();

        ExtractionConfig extractionConfig = ExtractionConfig.builder()
            .chunking(chunkingConfig)
            .build();
    }
}
// Key parameter explanations:
//
// batchSize: Number of texts to embed at once (32-128 typical)
// - Larger batches are faster but use more memory
// - Smaller batches for resource-constrained environments
//
// normalize: Whether to normalize vectors (L2 norm)
// - true (recommended): Enables cosine similarity in vector DBs
// - false: Raw embedding values
//
// cacheDir: Where to store downloaded models
// - null: Uses .kreuzberg/embeddings/ in current directory
// - String path: Custom directory for model storage
//
// showDownloadProgress: Display download progress bar
// - Useful for monitoring large model downloads

View File

@@ -0,0 +1,21 @@
```java title="Java"
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import dev.kreuzberg.ExtractionConfig;
// Model descriptor: a preset referenced by name.
var presetModel = EmbeddingModelType.builder()
    .type("preset")
    .name("all-mpnet-base-v2")
    .build();

// Embedding options that use that model.
var embeddingSettings = EmbeddingConfig.builder()
    .model(presetModel)
    .batchSize(16)
    .normalize(true)
    .showDownloadProgress(true)
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .chunking(ChunkingConfig.builder()
        .maxChars(1000)
        .embedding(embeddingSettings)
        .build())
    .build();
```

View File

@@ -0,0 +1,90 @@
import kreuzberg.config.HierarchyConfig;
import kreuzberg.config.PdfConfig;
import kreuzberg.config.ExtractionConfig;
import kreuzberg.Kreuzberg;
/** Builds three HierarchyConfig variants and wraps each one in PDF options for extraction. */
public class HierarchyConfigExample {
    public static void main(String[] args) throws Exception {
        // Example 1: Basic hierarchy extraction
        // Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
        // Extract bounding box information for spatial layout awareness.
        HierarchyConfig basicHierarchy = HierarchyConfig.builder()
            .enabled(true)
            .kClusters(6) // Default: creates 6 font size clusters (H1-H6 structure)
            .includeBbox(true) // Include bounding box coordinates
            .ocrCoverageThreshold(null) // No OCR coverage threshold
            .build();
        ExtractionConfig basicExtraction = ExtractionConfig.builder()
            .pdfOptions(PdfConfig.builder()
                .hierarchy(basicHierarchy)
                .build())
            .build();
        Kreuzberg kreuzberg = new Kreuzberg(basicExtraction);
        // var result = kreuzberg.extractFileSync("document.pdf");

        // Example 2: Custom kClusters for minimal structure
        // Use 3 clusters for simpler hierarchy with minimal structure.
        // Useful when you only need major section divisions (Main, Subsection, Detail).
        HierarchyConfig minimalHierarchy = HierarchyConfig.builder()
            .enabled(true)
            .kClusters(3) // Minimal clustering: just 3 levels
            .includeBbox(true)
            .ocrCoverageThreshold(null)
            .build();
        ExtractionConfig minimalExtraction = ExtractionConfig.builder()
            .pdfOptions(PdfConfig.builder()
                .hierarchy(minimalHierarchy)
                .build())
            .build();

        // Example 3: With OCR coverage threshold
        // Trigger OCR if less than 50% of text has font data.
        // Useful for documents with mixed digital and scanned content.
        HierarchyConfig ocrAwareHierarchy = HierarchyConfig.builder()
            .enabled(true)
            .kClusters(6)
            .includeBbox(true)
            .ocrCoverageThreshold(0.5f) // Trigger OCR if text coverage < 50%
            .build();
        ExtractionConfig ocrAwareExtraction = ExtractionConfig.builder()
            .pdfOptions(PdfConfig.builder()
                .hierarchy(ocrAwareHierarchy)
                .build())
            .build();
    }
}
// Field descriptions:
//
// enabled: boolean (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// kClusters: int (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// includeBbox: boolean (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// ocrCoverageThreshold: Float (default: null)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5f means "run OCR if less than 50% of page has text data"
// - null means no OCR coverage-based triggering

View File

@@ -0,0 +1,27 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.HtmlOutputConfig;
import dev.kreuzberg.HtmlTheme;
import dev.kreuzberg.OutputFormat;
import java.nio.file.Path;
import java.util.Optional;
/** Extracts a PDF as HTML using the GitHub theme with embedded CSS and prints the result. */
public class HtmlOutput {
    public static void main(String[] args) throws Exception {
        // Theme and CSS embedding are configured on the HTML output settings.
        HtmlOutputConfig themedHtml = HtmlOutputConfig.builder()
            .withTheme(HtmlTheme.GitHub)
            .withEmbedCss(true)
            .build();

        ExtractionConfig htmlConfig = ExtractionConfig.builder()
            .withOutputFormat(OutputFormat.Html)
            .withHtmlOutput(Optional.of(themedHtml))
            .build();

        Path source = Path.of("document.pdf");
        ExtractionResult extraction = Kreuzberg.extractFileSync(source, htmlConfig);

        // HTML with kb-* classes
        System.out.println(extraction.content());
    }
}
```

View File

@@ -0,0 +1,75 @@
import com.kreuzberg.Kreuzberg;
import com.kreuzberg.config.ExtractionConfig;
import com.kreuzberg.config.KeywordConfig;
import com.kreuzberg.keywords.YakeParams;
import com.kreuzberg.keywords.RakeParams;
import com.kreuzberg.result.ExtractionResult;
// Example 1: Basic YAKE configuration
// Uses YAKE algorithm with default parameters and English stopword filtering
public class KeywordConfigExample {
    /** Example 1: YAKE with no algorithm-specific parameter objects. */
    public static void basicYake() throws Exception {
        KeywordConfig yakeDefaults = new KeywordConfig.Builder()
            .algorithm("yake")
            .maxKeywords(10)
            .minScore(0.0f)
            .ngramRange(1, 3)
            .language("en")
            .yakeParams(null)
            .rakeParams(null)
            .build();
        extractAndPrintKeywords(yakeDefaults);
    }

    /**
     * Example 2: Advanced YAKE with custom parameters.
     * Fine-tunes YAKE with custom window size for co-occurrence analysis.
     */
    public static void advancedYake() throws Exception {
        KeywordConfig tunedYake = new KeywordConfig.Builder()
            .algorithm("yake")
            .maxKeywords(15)
            .minScore(0.1f)
            .ngramRange(1, 2)
            .language("en")
            .yakeParams(new YakeParams.Builder()
                .windowSize(1)
                .build())
            .rakeParams(null)
            .build();
        extractAndPrintKeywords(tunedYake);
    }

    /**
     * Example 3: RAKE configuration.
     * Uses RAKE algorithm for rapid keyword extraction with phrase constraints.
     */
    public static void rakeConfig() throws Exception {
        KeywordConfig rakeSettings = new KeywordConfig.Builder()
            .algorithm("rake")
            .maxKeywords(10)
            .minScore(5.0f)
            .ngramRange(1, 3)
            .language("en")
            .yakeParams(null)
            .rakeParams(new RakeParams.Builder()
                .minWordLength(1)
                .maxWordsPerPhrase(3)
                .build())
            .build();
        extractAndPrintKeywords(rakeSettings);
    }

    public static void main(String[] args) throws Exception {
        basicYake();
    }

    /** Wraps the keyword settings in an ExtractionConfig, extracts document.pdf and prints the keywords. */
    private static void extractAndPrintKeywords(KeywordConfig keywordSettings) throws Exception {
        ExtractionConfig config = new ExtractionConfig.Builder()
            .keywords(keywordSettings)
            .build();
        ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
        System.out.println("Keywords: " + result.getKeywords());
    }
}

View File

@@ -0,0 +1,4 @@
```java title="Java"
// Note: Keyword extraction is not yet available in Java bindings
// This feature requires the 'keywords' feature flag and is planned for a future release
```

View File

@@ -0,0 +1,11 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.LanguageDetectionConfig;
// Language detection settings, built on their own before being attached.
var detectionSettings = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(0.8)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(detectionSettings)
    .build();
```

View File

@@ -0,0 +1,18 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.ImagePreprocessingConfig;
// OCR backend selection.
var ocrSettings = OcrConfig.builder()
    .backend("tesseract")
    .build();

// Image preprocessing with a target DPI of 300.
var preprocessing = ImagePreprocessingConfig.builder()
    .targetDpi(300)
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .ocr(ocrSettings)
    .imagePreprocessing(preprocessing)
    .build();

ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
```

View File

@@ -0,0 +1,15 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.PdfConfig;
import dev.kreuzberg.HierarchyConfig;
import java.util.Arrays;
// PDF options: image and metadata extraction, two passwords, and a default hierarchy config.
var pdfSettings = PdfConfig.builder()
    .extractImages(true)
    .extractMetadata(true)
    .passwords(Arrays.asList("password1", "password2"))
    .hierarchyConfig(HierarchyConfig.builder().build())
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .pdfOptions(pdfSettings)
    .build();
```

View File

@@ -0,0 +1,17 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.PdfConfig;
import dev.kreuzberg.HierarchyConfig;
// Hierarchy detection settings for PDF extraction.
var hierarchySettings = HierarchyConfig.builder()
    .enabled(true)
    .detectionThreshold(0.75)
    .ocrCoverageThreshold(0.8)
    .minLevel(1)
    .maxLevel(5)
    .build();
var pdfSettings = PdfConfig.builder()
    .hierarchyConfig(hierarchySettings)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .pdfOptions(pdfSettings)
    .build();
```

View File

@@ -0,0 +1,13 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.PostProcessorConfig;
import java.util.Arrays;
// Post-processing: two processors listed as enabled, one listed as disabled.
var postProcessing = PostProcessorConfig.builder()
    .enabled(true)
    .enabledProcessors(Arrays.asList("deduplication", "whitespace_normalization"))
    .disabledProcessors(Arrays.asList("mojibake_fix"))
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .postprocessor(postProcessing)
    .build();
```

View File

@@ -0,0 +1,7 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
// Quality processing is set explicitly here; the snippet marks true as the default.
var qualityBuilder = ExtractionConfig.builder()
    .enableQualityProcessing(true); // Default
ExtractionConfig config = qualityBuilder.build();
```

View File

@@ -0,0 +1,18 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.TesseractConfig;
// Tesseract options, including a character whitelist and table detection.
var tesseractOptions = TesseractConfig.builder()
    .psm(6)
    .oem(1)
    .minConfidence(0.8)
    .tesseditCharWhitelist("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?")
    .enableTableDetection(true)
    .build();
var ocrSettings = OcrConfig.builder()
    .language("eng+fra+deu")
    .tesseractConfig(tesseractOptions)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .ocr(ocrSettings)
    .build();
```

View File

@@ -0,0 +1,11 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.TokenReductionConfig;
// Token reduction in "moderate" mode with preserveImportantWords enabled.
var reductionSettings = TokenReductionConfig.builder()
    .mode("moderate")
    .preserveImportantWords(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .tokenReduction(reductionSettings)
    .build();
```

View File

@@ -0,0 +1,86 @@
```java title="Usage.java"
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.UUID;
import com.google.gson.JsonParser;
/**
 * Runs the Kreuzberg API in a Docker container and uploads a file to its extract endpoint.
 *
 * <p>All text written to or read from the HTTP connection uses UTF-8 explicitly, so the example
 * behaves the same regardless of the platform default charset.
 */
public final class Usage {
    private static final String BOUNDARY = "----WebKitFormBoundary" + UUID.randomUUID();
    // Fully qualified so the snippet needs no additional import line.
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    private final String containerName;
    private final int apiPort;

    /**
     * @param containerName name given to the Docker container
     * @param apiPort host port that is mapped to the container's port 8000
     */
    public Usage(String containerName, int apiPort) {
        this.containerName = containerName;
        this.apiPort = apiPort;
    }

    /**
     * Starts the container in detached mode via {@code docker run}.
     *
     * @param image Docker image to run
     * @throws RuntimeException if {@code docker run} exits with a non-zero status
     */
    public void startContainer(String image) throws IOException, InterruptedException {
        System.out.println("Starting Kreuzberg Docker container...");
        ProcessBuilder pb = new ProcessBuilder("docker", "run", "-d",
            "--name", containerName,
            "-p", apiPort + ":8000",
            image);
        Process process = pb.start();
        if (process.waitFor() != 0) {
            throw new RuntimeException("Failed to start container");
        }
        System.out.println("Container started on http://localhost:" + apiPort);
    }

    /**
     * Posts the file as multipart/form-data to {@code /api/extract} and returns the
     * {@code content} member of the JSON response.
     *
     * @param filePath path of the file to upload
     * @return the string stored under {@code content} in the response object
     * @throws IOException if the file cannot be read or the HTTP exchange fails
     */
    public String extractFile(String filePath) throws IOException {
        byte[] fileBytes = Files.readAllBytes(Paths.get(filePath));
        String fileName = Paths.get(filePath).getFileName().toString();
        URL url = new URL("http://localhost:" + apiPort + "/api/extract");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("POST");
            conn.setRequestProperty("Content-Type", "multipart/form-data; boundary=" + BOUNDARY);
            conn.setDoOutput(true);
            try (OutputStream os = conn.getOutputStream()) {
                os.write(("--" + BOUNDARY + "\r\n").getBytes(UTF_8));
                os.write(("Content-Disposition: form-data; name=\"file\"; filename=\"" + fileName + "\"\r\n")
                    .getBytes(UTF_8));
                os.write("Content-Type: application/octet-stream\r\n\r\n".getBytes(UTF_8));
                os.write(fileBytes);
                os.write(("\r\n--" + BOUNDARY + "--\r\n").getBytes(UTF_8));
            }
            StringBuilder response = new StringBuilder();
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(conn.getInputStream(), UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    response.append(line);
                }
            }
            return JsonParser.parseString(response.toString())
                .getAsJsonObject()
                .get("content")
                .getAsString();
        } finally {
            // Release the connection whether or not the exchange succeeded.
            conn.disconnect();
        }
    }

    /** Stops and then removes the container. */
    public void stopContainer() throws IOException, InterruptedException {
        System.out.println("Stopping Kreuzberg Docker container...");
        new ProcessBuilder("docker", "stop", containerName).start().waitFor();
        new ProcessBuilder("docker", "rm", containerName).start().waitFor();
        System.out.println("Container stopped and removed");
    }

    public static void main(String[] args) throws Exception {
        Usage docker = new Usage("kreuzberg-api", 8000);
        try {
            docker.startContainer("kreuzberg:latest");
            // Fixed pause before the first request.
            Thread.sleep(2000);
            String content = docker.extractFile("document.pdf");
            System.out.println("Extracted content:\n" + content);
        } finally {
            docker.stopContainer();
        }
    }
}
```

View File

@@ -0,0 +1,25 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.io.IOException;
import java.util.Map;
/** Extracts document.pdf and prints its content, two metadata entries, and table/image counts. */
public class BasicUsage {
    public static void main(String[] args) throws IOException {
        ExtractionResult extraction = Kreuzberg.extractFile("document.pdf");

        System.out.println("Content:");
        System.out.println(extraction.getContent());

        System.out.println("\nMetadata:");
        Map<String, Object> docInfo = extraction.getMetadata();
        if (docInfo != null) {
            Object title = docInfo.get("title");
            System.out.println("Title: " + title);
            Object author = docInfo.get("author");
            System.out.println("Author: " + author);
        }

        var tableCount = extraction.getTables().size();
        System.out.println("\nTables found: " + tableCount);
        var imageCount = extraction.getImages().size();
        System.out.println("Images found: " + imageCount);
    }
}
```

View File

@@ -0,0 +1,21 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.io.IOException;
/** Extracts contract.pdf with caching and quality processing enabled and prints a short summary. */
public class ExtractFile {
    public static void main(String[] args) throws IOException {
        var configBuilder = ExtractionConfig.builder()
            .useCache(true)
            .enableQualityProcessing(true);
        ExtractionConfig config = configBuilder.build();

        ExtractionResult extraction = Kreuzberg.extractFile("contract.pdf", config);

        var characterCount = extraction.getContent().length();
        System.out.println("Extracted " + characterCount + " characters");

        var quality = extraction.getQualityScore();
        System.out.println("Quality score: " + quality);

        // The processing time is read from the metadata entry named "processing_time".
        var elapsed = extraction.getMetadata().get("processing_time");
        System.out.println("Processing time: " + elapsed + "ms");
    }
}
```

View File

@@ -0,0 +1,26 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import java.io.IOException;
/** Extracts a scanned PDF with the Tesseract OCR backend and prints the resulting text. */
public class ExtractWithOCR {
    public static void main(String[] args) throws IOException {
        // OCR settings are built inline and attached to the extraction configuration.
        ExtractionConfig config = ExtractionConfig.builder()
            .ocr(OcrConfig.builder()
                .backend("tesseract")
                .language("eng")
                .build())
            .build();

        ExtractionResult extraction = Kreuzberg.extractFile("scanned.pdf", config);

        System.out.println("Extracted text from scanned document:");
        var text = extraction.getContent();
        System.out.println(text);
        System.out.println("Used OCR backend: tesseract");
    }
}
```

View File

@@ -0,0 +1,13 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.io.IOException;
/** Extracts document.pdf and prints at most the first 200 characters of its content. */
public class HelloWorld {
    public static void main(String[] args) throws IOException {
        ExtractionResult extraction = Kreuzberg.extractFile("document.pdf");
        System.out.println("Extracted content:");

        var content = extraction.getContent();
        int previewEnd = Math.min(200, content.length());
        System.out.println(content.substring(0, previewEnd));
    }
}
```

View File

@@ -0,0 +1,15 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.io.IOException;
/** Smoke test for an installation: extracts sample.pdf and reports the extracted length. */
public class InstallVerify {
    public static void main(String[] args) throws IOException {
        System.out.println("Kreuzberg FFI bindings loaded successfully");

        ExtractionResult extraction = Kreuzberg.extractFile("sample.pdf");
        System.out.println("Installation verified!");

        var characterCount = extraction.getContent().length();
        System.out.println("Extracted " + characterCount + " characters");
    }
}
```

View File

@@ -0,0 +1,24 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.io.IOException;
import java.util.Map;
/** Reads the main parts of an extraction result and prints their sizes. */
public class ReadContent {
    public static void main(String[] args) throws IOException {
        ExtractionResult extraction = Kreuzberg.extractFile("document.pdf");

        // Pull every part out of the result first, then report on each one.
        String text = extraction.getContent();
        var tableList = extraction.getTables();
        var imageList = extraction.getImages();
        Map<String, Object> docInfo = extraction.getMetadata();

        System.out.println("Content: " + text.length() + " characters");
        System.out.println("Tables: " + tableList.size());
        System.out.println("Images: " + imageList.size());
        if (docInfo == null) {
            return;
        }
        System.out.println("Metadata keys: " + docInfo.keySet());
    }
}
```

View File

@@ -0,0 +1,46 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.LlmConfig;
import dev.kreuzberg.StructuredExtractionConfig;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
/** Runs a structured extraction against paper.pdf using a JSON-schema-shaped map and an LLM config. */
public class StructuredExtractionExample {
    public static void main(String[] args) throws Exception {
        // Property definitions of the schema object: title, authors (array of strings), date.
        Map<String, Object> properties = Map.of(
            "title", Map.of("type", "string"),
            "authors", Map.of("type", "array", "items", Map.of("type", "string")),
            "date", Map.of("type", "string")
        );
        Map<String, Object> schema = Map.of(
            "type", "object",
            "properties", properties,
            "required", List.of("title", "authors", "date"),
            "additionalProperties", false
        );

        LlmConfig llm = LlmConfig.builder()
            .withModel("openai/gpt-4o-mini")
            .build();

        // Positional constructor; the argument order is kept exactly as in the original example.
        StructuredExtractionConfig structured = new StructuredExtractionConfig(
            schema,
            "PaperMetadata",
            null,
            true,
            null,
            llm
        );

        ExtractionConfig config = ExtractionConfig.builder()
            .withStructuredExtraction(java.util.Optional.of(structured))
            .build();

        Path paper = Path.of("paper.pdf");
        ExtractionResult extraction = Kreuzberg.extractFile(paper, config);
        System.out.println(extraction.structuredOutput());
    }
}
```

View File

@@ -0,0 +1,59 @@
```java title="Java"
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Map;
/**
 * Line-oriented client that talks to a {@code kreuzberg mcp} child process over its stdin/stdout.
 *
 * <p>Implements {@link AutoCloseable} so that it can be used in try-with-resources, which is how
 * {@link #main(String[])} uses it.
 */
public class McpClient implements AutoCloseable {
    private final Process mcpProcess;
    private final BufferedWriter stdin;
    private final BufferedReader stdout;
    private final ObjectMapper mapper = new ObjectMapper();

    /**
     * Starts {@code kreuzberg mcp} and wraps its standard streams.
     *
     * @throws IOException if the process cannot be started
     */
    public McpClient() throws IOException {
        ProcessBuilder pb = new ProcessBuilder("kreuzberg", "mcp");
        mcpProcess = pb.start();
        // Explicit UTF-8 (fully qualified to avoid a new import) instead of the platform default.
        stdin = new BufferedWriter(new OutputStreamWriter(
            mcpProcess.getOutputStream(), java.nio.charset.StandardCharsets.UTF_8));
        stdout = new BufferedReader(new InputStreamReader(
            mcpProcess.getInputStream(), java.nio.charset.StandardCharsets.UTF_8));
    }

    /**
     * Sends one {@code tools/call} request for {@code extract_file} and reads one response line.
     *
     * @param path value sent as the {@code path} argument
     * @return the {@code content} entry of the response's {@code result} object
     * @throws IOException if writing or reading fails, the process closes its output before
     *     responding, or the response carries no {@code result} object
     */
    public String extractFile(String path) throws IOException {
        Map<String, Object> request = Map.of(
            "method", "tools/call",
            "params", Map.of(
                "name", "extract_file",
                "arguments", Map.of("path", path, "async", true)
            )
        );
        stdin.write(mapper.writeValueAsString(request));
        stdin.newLine();
        stdin.flush();

        String response = stdout.readLine();
        if (response == null) {
            // readLine() returns null at end of stream, i.e. the process produced no response.
            throw new IOException("MCP process closed its output before sending a response");
        }

        @SuppressWarnings("unchecked")
        Map<String, Object> result = mapper.readValue(response, Map.class);
        @SuppressWarnings("unchecked")
        Map<String, Object> resultData = (Map<String, Object>) result.get("result");
        if (resultData == null) {
            throw new IOException("MCP response contains no result: " + response);
        }
        return (String) resultData.get("content");
    }

    /**
     * Closes both streams and destroys the child process. The process is destroyed even when
     * closing a stream fails.
     */
    @Override
    public void close() throws IOException {
        try {
            try {
                stdin.close();
            } finally {
                stdout.close();
            }
        } finally {
            mcpProcess.destroy();
        }
    }

    public static void main(String[] args) {
        try (McpClient client = new McpClient()) {
            String content = client.extractFile("contract.pdf");
            System.out.println("Extracted content: " + content);
        } catch (IOException e) {
            System.err.println("Error: " + e.getMessage());
        }
    }
}
```

View File

@@ -0,0 +1,40 @@
```java title="Java"
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Map;
/**
 * Starts {@code kreuzberg mcp}, sends a single {@code extract_file} request over its stdin and
 * prints the first line the process writes back.
 */
public class McpCustomClient {
    public static void main(String[] args) throws IOException, InterruptedException {
        ProcessBuilder pb = new ProcessBuilder("kreuzberg", "mcp");
        Process mcp = pb.start();
        ObjectMapper mapper = new ObjectMapper();
        // Explicit UTF-8 (fully qualified to avoid a new import) instead of the platform default.
        try (BufferedWriter stdin = new BufferedWriter(new OutputStreamWriter(
                 mcp.getOutputStream(), java.nio.charset.StandardCharsets.UTF_8));
             BufferedReader stdout = new BufferedReader(new InputStreamReader(
                 mcp.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            Map<String, Object> request = Map.of(
                "method", "tools/call",
                "params", Map.of(
                    "name", "extract_file",
                    "arguments", Map.of("path", "document.pdf", "async", true)
                )
            );
            stdin.write(mapper.writeValueAsString(request));
            stdin.newLine();
            stdin.flush();

            // readLine() returns null when the process closes its output without responding.
            String line = stdout.readLine();
            if (line != null) {
                System.out.println(line);
            }
        }
        // Both streams are closed at this point; wait for the child process to exit.
        mcp.waitFor();
    }
}
```

View File

@@ -0,0 +1,17 @@
```java title="Java"
import java.io.IOException;
/** Launches {@code kreuzberg mcp} as a child process that shares this process's standard streams. */
public class McpServer {
    public static void main(String[] args) {
        try {
            // Start MCP server using CLI
            ProcessBuilder pb = new ProcessBuilder("kreuzberg", "mcp");
            pb.inheritIO();
            Process process = pb.start();
            process.waitFor();
        } catch (IOException e) {
            System.err.println("Failed to start MCP server: " + e.getMessage());
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            System.err.println("Failed to start MCP server: " + e.getMessage());
        }
    }
}
```

View File

@@ -0,0 +1,25 @@
import dev.kreuzberg.*;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

var result = Kreuzberg.extractFileSync("document.pdf");

// Page boundaries are byte offsets, so slice the UTF-8 bytes of the content, not the String.
if (result.metadata().pages() != null
        && result.metadata().pages().boundaries() != null) {
    var contentBytes = result.content().getBytes(StandardCharsets.UTF_8);
    var boundaries = result.metadata().pages().boundaries();

    // Show at most the first three pages; shorter documents must not fail in subList.
    for (var boundary : boundaries.subList(0, Math.min(3, boundaries.size()))) {
        var pageBytes = Arrays.copyOfRange(
            contentBytes,
            boundary.byteStart(),
            boundary.byteEnd()
        );
        var pageText = new String(pageBytes, StandardCharsets.UTF_8);

        System.out.println("Page " + boundary.pageNumber() + ":");
        System.out.println(" Byte range: " + boundary.byteStart() +
            "-" + boundary.byteEnd());
        // Cap the preview at the page length so pages under 100 characters do not throw.
        System.out.println(" Preview: "
            + pageText.substring(0, Math.min(100, pageText.length())) + "...");
    }
}

View File

@@ -0,0 +1,18 @@
import dev.kreuzberg.*;

// Request per-page extraction.
var config = ExtractionConfig.builder()
    .pages(PageConfig.builder()
        .extractPages(true)
        .build())
    .build();

var result = Kreuzberg.extractFileSync("document.pdf", config);

// Print a short summary for each page when pages were returned.
if (result.pages() != null) {
    for (var page : result.pages()) {
        System.out.println("Page " + page.pageNumber() + ":");
        System.out.println(" Content: " + page.content().length() + " chars");
        System.out.println(" Tables: " + page.tables().size());
        System.out.println(" Images: " + page.images().size());
    }
}

View File

@@ -0,0 +1,12 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.LanguageDetectionConfig;
// Language detection with a 0.9 minimum confidence and detectMultiple enabled.
var detectionSettings = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(0.9)
    .detectMultiple(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(detectionSettings)
    .build();
```

View File

@@ -0,0 +1,17 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.LanguageDetectionConfig;
// Enable language detection, extract, then print whatever languages the result reports.
var detectionSettings = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(0.8)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(detectionSettings)
    .build();

ExtractionResult result = Kreuzberg.extractFile("multilingual_document.pdf", config);

var detected = result.getDetectedLanguages();
System.out.println("Detected languages: " + detected);
```

View File

@@ -0,0 +1,111 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.Metadata;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Map;
import java.util.List;
/** Walks through the metadata of a PDF and an HTML page and prints the entries that are present. */
public class Main {
    public static void main(String[] args) {
        try {
            ExtractionResult pdfResult = Kreuzberg.extractFileSync("document.pdf");

            // Metadata is flat — format-specific fields are at the top level
            Metadata pdfMeta = pdfResult.getMetadata();
            pdfMeta.getTitle().ifPresent(title -> System.out.println("Title: " + title));
            pdfMeta.getAuthors().ifPresent(authors ->
                System.out.println("Authors: " + String.join(", ", authors)));

            // Format-specific fields are in the additional map
            Map<String, Object> pdfExtra = pdfMeta.getAdditional();
            Object pageCount = pdfExtra.get("page_count");
            if (pageCount != null) {
                System.out.println("Pages: " + pageCount);
            }

            // Access HTML metadata
            ExtractionResult htmlResult = Kreuzberg.extractFileSync("page.html");
            Metadata htmlMeta = htmlResult.getMetadata();
            htmlMeta.getTitle().ifPresent(title -> System.out.println("Title: " + title));
            Map<String, Object> htmlExtra = htmlMeta.getAdditional();
            printStringEntry(htmlExtra, "description", "Description: ");

            // Access keywords as array
            htmlMeta.getKeywords().ifPresent(keywords ->
                System.out.println("Keywords: " + keywords));

            // Access canonical URL (renamed from canonical)
            printStringEntry(htmlExtra, "canonical_url", "Canonical URL: ");

            // Access Open Graph fields from map
            @SuppressWarnings("unchecked")
            Map<String, String> openGraph = (Map<String, String>) htmlExtra.get("open_graph");
            if (openGraph != null) {
                System.out.println("Open Graph Image: " + openGraph.get("image"));
                System.out.println("Open Graph Title: " + openGraph.get("title"));
                System.out.println("Open Graph Type: " + openGraph.get("type"));
            }

            // Access Twitter Card fields from map
            @SuppressWarnings("unchecked")
            Map<String, String> twitterCard = (Map<String, String>) htmlExtra.get("twitter_card");
            if (twitterCard != null) {
                System.out.println("Twitter Card Type: " + twitterCard.get("card"));
                System.out.println("Twitter Creator: " + twitterCard.get("creator"));
            }

            // Access new fields
            htmlMeta.getLanguage().ifPresent(language -> System.out.println("Language: " + language));
            printStringEntry(htmlExtra, "text_direction", "Text Direction: ");

            // Access headers: every header text on one line, each followed by ", "
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> headers = (List<Map<String, Object>>) htmlExtra.get("headers");
            if (headers != null) {
                for (Map<String, Object> header : headers) {
                    System.out.print(header.get("text") + ", ");
                }
                System.out.println();
            }

            // Access links
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> links = (List<Map<String, Object>>) htmlExtra.get("links");
            if (links != null) {
                links.forEach(link ->
                    System.out.println("Link: " + link.get("href") + " (" + link.get("text") + ")"));
            }

            // Access images
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> images = (List<Map<String, Object>>) htmlExtra.get("images");
            if (images != null) {
                images.forEach(image -> System.out.println("Image: " + image.get("src")));
            }

            // Access structured data
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> structuredData =
                (List<Map<String, Object>>) htmlExtra.get("structured_data");
            if (structuredData != null) {
                System.out.println("Structured data items: " + structuredData.size());
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }

    /** Prints {@code label} followed by the String stored under {@code key}, if there is one. */
    private static void printStringEntry(Map<String, Object> source, String key, String label) {
        String value = (String) source.get(key);
        if (value != null) {
            System.out.println(label + value);
        }
    }
}
```

View File

@@ -0,0 +1,27 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.Table;
import java.io.IOException;
import java.util.List;
public class Main {
    /** Extracts a PDF and prints, for every table, its row count, its markdown form and each row. */
    public static void main(String[] args) {
        try {
            ExtractionResult extraction = Kreuzberg.extractFile("document.pdf");
            for (Table table : extraction.getTables()) {
                int rowCount = table.cells().size();
                System.out.println("Table with " + rowCount + " rows");
                System.out.println(table.markdown());
                // One line per row, using the list's own string form.
                table.cells().forEach(row -> System.out.println(row));
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
```

View File

@@ -0,0 +1,18 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
// Chunking settings are built first, then attached to the extraction config.
ChunkingConfig chunking = ChunkingConfig.builder()
        .maxChars(512)
        .maxOverlap(50)
        .embedding("balanced")
        .build();
ExtractionConfig config = ExtractionConfig.builder().chunking(chunking).build();

ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
int extractedLength = result.getContent().length();
System.out.println("Extracted content: " + extractedLength + " characters");
```

View File

@@ -0,0 +1,60 @@
```java title="Java"
import dev.kreuzberg.*;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.net.http.*;
import java.net.URI;
public class CloudOcrExample {
    /** Shared by every OCR callback invocation; an HttpClient is meant to be created once and reused. */
    private static final HttpClient HTTP_CLIENT = HttpClient.newHttpClient();

    /**
     * Registers a callback-based OCR backend that posts each image to a cloud OCR
     * endpoint, then runs an extraction.
     *
     * <p>The callback returns {@link MemorySegment#NULL} on any failure and reports
     * the reason on stderr instead of discarding it.
     */
    public static void main(String[] args) {
        // Strings handed back to native code are allocated from this arena.
        Arena callbackArena = Arena.ofAuto();
        String apiKey = "your-api-key";
        OcrBackend cloudOcr = (imageBytes, imageLength, configJson) -> {
            try {
                // Copy the image out of native memory.
                byte[] image = imageBytes.reinterpret(imageLength)
                        .toArray(ValueLayout.JAVA_BYTE);
                // Read the config JSON as a null-terminated string. It is read only to show
                // how; this simplified backend does not use it.
                String config = configJson.reinterpret(Long.MAX_VALUE)
                        .getString(0);
                // Call the cloud OCR API.
                HttpRequest request = HttpRequest.newBuilder()
                        .uri(URI.create("https://api.example.com/ocr"))
                        .header("Authorization", "Bearer " + apiKey)
                        .POST(HttpRequest.BodyPublishers.ofByteArray(image))
                        .build();
                HttpResponse<String> response = HTTP_CLIENT.send(request,
                        HttpResponse.BodyHandlers.ofString());
                // An error page must not be returned as if it were recognized text.
                int status = response.statusCode();
                if (status < 200 || status >= 300) {
                    System.err.println("Cloud OCR request failed with HTTP status " + status);
                    return MemorySegment.NULL;
                }
                String text = parseTextFromResponse(response.body());
                // Return the result as a C string.
                return callbackArena.allocateFrom(text);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so the calling thread can still observe it.
                Thread.currentThread().interrupt();
                System.err.println("Cloud OCR request interrupted");
                return MemorySegment.NULL;
            } catch (Exception e) {
                System.err.println("Cloud OCR request failed: " + e);
                return MemorySegment.NULL;
            }
        };
        try (Arena arena = Arena.ofConfined()) {
            Kreuzberg.registerOcrBackend("cloud-ocr", cloudOcr, arena);
            // Use custom OCR backend in extraction
            // Note: Requires ExtractionConfig with OCR enabled
            ExtractionResult result = Kreuzberg.extractFileSync("scanned.pdf");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Returns the recognized text from the service response; simplified to return the body as-is. */
    private static String parseTextFromResponse(String json) {
        // Parse JSON response and extract text field
        return json; // Simplified
    }
}
```

View File

@@ -0,0 +1,14 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ImageExtractionConfig;
// Image extraction settings, built separately and then attached to the extraction config.
ImageExtractionConfig imageExtraction = ImageExtractionConfig.builder()
        .extractImages(true)
        .targetDpi(200)
        .maxImageDimension(2048)
        // Set to false to extract images without markdown references.
        .injectPlaceholders(true)
        .autoAdjustDpi(true)
        .build();
ExtractionConfig config = ExtractionConfig.builder().imageExtraction(imageExtraction).build();
```

View File

@@ -0,0 +1,20 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ImagePreprocessingConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.TesseractConfig;
// Built inside-out: image preprocessing -> Tesseract -> OCR -> extraction config.
ImagePreprocessingConfig preprocessing = ImagePreprocessingConfig.builder()
        .targetDpi(300)
        .denoise(true)
        .deskew(true)
        .contrastEnhance(true)
        .binarizationMethod("otsu")
        .build();
TesseractConfig tesseract = TesseractConfig.builder().preprocessing(preprocessing).build();
OcrConfig ocr = OcrConfig.builder().tesseractConfig(tesseract).build();
ExtractionConfig config = ExtractionConfig.builder().ocr(ocr).build();
```

View File

@@ -0,0 +1 @@
EasyOCR is only available in Python.

View File

@@ -0,0 +1,38 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.types.OcrElement;
import java.io.IOException;
public class Main {
    /**
     * Runs OCR with the "paddle-ocr" backend and prints text, recognition
     * confidence, geometry and (when present) rotation for every OCR element.
     */
    public static void main(String[] args) {
        try {
            OcrConfig paddleOcr = OcrConfig.builder()
                    .backend("paddle-ocr")
                    .language("en")
                    .build();
            ExtractionConfig config = ExtractionConfig.builder().ocr(paddleOcr).build();
            ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
            // Nothing to print when the result carries no OCR elements.
            if (result.getOcrElements() == null) {
                return;
            }
            for (OcrElement element : result.getOcrElements()) {
                System.out.printf("Text: %s%n", element.getText());
                System.out.printf("Confidence: %.2f%n", element.getConfidence().getRecognition());
                System.out.printf("Geometry: %s%n", element.getGeometry());
                // Rotation is optional.
                if (element.getRotation() != null) {
                    System.out.printf("Rotation: %.1f°%n", element.getRotation().getAngle());
                }
                System.out.println();
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
```

View File

@@ -0,0 +1,26 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import java.io.IOException;
public class Main {
    /** Extracts a scanned PDF with the "tesseract" backend (language "eng") and prints the content. */
    public static void main(String[] args) {
        try {
            OcrConfig tesseractOcr = OcrConfig.builder()
                    .backend("tesseract")
                    .language("eng")
                    .build();
            ExtractionConfig config = ExtractionConfig.builder().ocr(tesseractOcr).build();
            String content = Kreuzberg.extractFile("scanned.pdf", config).getContent();
            System.out.println(content);
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
```

View File

@@ -0,0 +1,16 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
// OCR backend selection, built separately from the extraction config.
OcrConfig tesseractOcr = OcrConfig.builder().backend("tesseract").build();
// forceOcr(true) is set on the extraction config itself, next to the OCR settings.
ExtractionConfig config = ExtractionConfig.builder()
        .ocr(tesseractOcr)
        .forceOcr(true)
        .build();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
System.out.println(result.getContent());
```

View File

@@ -0,0 +1,16 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
// Several Tesseract languages are requested in one string, joined with '+'.
OcrConfig multilingualOcr = OcrConfig.builder()
        .backend("tesseract")
        .language("eng+deu+fra")
        .build();
ExtractionConfig config = ExtractionConfig.builder().ocr(multilingualOcr).build();
ExtractionResult result = Kreuzberg.extractFile("multilingual.pdf", config);
System.out.println(result.getContent());
```

View File

@@ -0,0 +1,27 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import java.io.IOException;
public class Main {
    /** Extracts a scanned PDF with the "paddle-ocr" backend (language "en") and prints the content. */
    public static void main(String[] args) {
        try {
            OcrConfig paddleOcr = OcrConfig.builder()
                    .backend("paddle-ocr")
                    .language("en")
                    // .paddleOcrConfig(PaddleOcrConfig.builder().modelTier("server").build()) // for max accuracy
                    .build();
            ExtractionConfig config = ExtractionConfig.builder().ocr(paddleOcr).build();
            String content = Kreuzberg.extractFile("scanned.pdf", config).getContent();
            System.out.println(content);
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
```

View File

@@ -0,0 +1,4 @@
```java title="Java"
// Java does not provide bulk clearing functionality in v4.0.0
// Unregister plugins individually using unregisterPostProcessor() and unregisterValidator()
```

View File

@@ -0,0 +1,79 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.EmbeddingBackendBridge;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import dev.kreuzberg.IEmbeddingBackend;
import dev.kreuzberg.KreuzbergRsException;
import java.util.ArrayList;
import java.util.List;
public class EmbeddingBackendExample {
    /**
     * Wrap an already-loaded embedder so kreuzberg can call back into it during
     * chunking and standalone embed requests.
     */
    static final class MyEmbedder implements IEmbeddingBackend {
        /**
         * Width of every vector this backend produces. Declared once so that
         * {@link #dimensions()} and {@link #embed(List)} cannot drift apart.
         */
        private static final int DIMENSIONS = 768;

        @Override
        public String name() {
            return "my-embedder";
        }

        @Override
        public String version() {
            return "1.0.0";
        }

        @Override
        public void initialize() {
            // Optional warm-up; runs once at registration before dimensions() is cached.
        }

        @Override
        public void shutdown() {
            // Optional cleanup.
        }

        @Override
        public long dimensions() {
            // Captured once at registration; the dispatcher uses this for shape validation.
            return DIMENSIONS;
        }

        /**
         * Returns one vector per input text, each {@code DIMENSIONS} wide.
         * This placeholder fills every vector with zeros; a real backend would
         * delegate to the already-loaded host model here.
         */
        @Override
        public List<List<Float>> embed(List<String> texts) {
            List<List<Float>> out = new ArrayList<>(texts.size());
            for (int i = 0; i < texts.size(); i++) {
                List<Float> row = new ArrayList<>(DIMENSIONS);
                for (int j = 0; j < DIMENSIONS; j++) {
                    row.add(0.0f);
                }
                out.add(row);
            }
            return out;
        }
    }

    /**
     * Registers the backend, embeds two texts through it, and unregisters it again
     * in a {@code finally} block so the registration does not outlive the example.
     */
    public static void main(String[] args) throws Exception {
        // Register once at startup.
        EmbeddingBackendBridge.registerEmbeddingBackend(new MyEmbedder());
        try {
            EmbeddingConfig config = EmbeddingConfig.builder()
                    .model(new EmbeddingModelType.Plugin("my-embedder"))
                    // Optional: bound the wait on a hung backend (default 60s; null disables).
                    .maxEmbedDurationSecs(30L)
                    .build();
            List<String> texts = List.of("Hello, world!", "Second text");
            List<List<Float>> vectors = Kreuzberg.embedTexts(texts, config);
            System.out.println("Generated " + vectors.size() + " vectors");
        } catch (KreuzbergRsException e) {
            e.printStackTrace();
        } finally {
            EmbeddingBackendBridge.unregisterEmbeddingBackend("my-embedder");
        }
    }
}
```

View File

@@ -0,0 +1,17 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
public class CustomExtractorExample {
    /** Extracts "document.json" and prints the length of the extracted content. */
    public static void main(String[] args) {
        try {
            ExtractionResult extraction = Kreuzberg.extractFile("document.json");
            int contentLength = extraction.getContent().length();
            System.out.println("Extracted content length: " + contentLength);
        } catch (IOException | KreuzbergException e) {
            e.printStackTrace();
        }
    }
}
```

View File

@@ -0,0 +1,4 @@
```java title="Java"
// Java does not provide plugin listing functionality in v4.0.0
// Plugins are registered and managed through the FFI layer
```

View File

@@ -0,0 +1,34 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.Validator;
import dev.kreuzberg.ValidationException;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
public class MinLengthValidatorExample {
    /**
     * Registers a validator that rejects results whose content is shorter than
     * {@code minLength}, then extracts a document and reports the outcome.
     */
    public static void main(String[] args) {
        int minLength = 100;
        Validator minLengthValidator = extracted -> {
            int actualLength = extracted.getContent().length();
            // Long enough: nothing to report.
            if (actualLength >= minLength) {
                return;
            }
            throw new ValidationException("Content too short: " + actualLength + " < " + minLength);
        };
        try {
            Kreuzberg.registerValidator("min-length", minLengthValidator, 100);
            Kreuzberg.extractFile("document.pdf");
            System.out.println("Validation passed!");
        } catch (ValidationException validationFailure) {
            System.err.println("Validation failed: " + validationFailure.getMessage());
        } catch (IOException | KreuzbergException e) {
            e.printStackTrace();
        }
    }
}
```

View File

@@ -0,0 +1,50 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.PostProcessor;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
// Example: registers a post-processor that counts and logs PDF results, then
// extracts a document and prints the "pdf_processed" metadata entry.
public class PdfMetadataExtractorExample {
private static final Logger logger = Logger.getLogger(
PdfMetadataExtractorExample.class.getName()
);
public static void main(String[] args) {
// Number of PDF results the post-processor has handled so far.
AtomicInteger processedCount = new AtomicInteger(0);
PostProcessor pdfMetadata = result -> {
// Results that are not PDFs pass through untouched and are not counted.
if (!result.getMimeType().equals("application/pdf")) {
return result;
}
processedCount.incrementAndGet();
// NOTE(review): the entries below are written to a fresh HashMap copy of the
// metadata, and the original `result` is returned afterwards, so this copy is
// never attached to the returned value. As written, "pdf_processed" will
// presumably be missing when main reads it back. Confirm how ExtractionResult
// is meant to carry updated metadata and return that instead.
Map<String, Object> metadata = new HashMap<>(result.getMetadata());
metadata.put("pdf_processed", true);
metadata.put("processing_timestamp", System.currentTimeMillis());
logger.info("Processed PDF: " + processedCount.get());
return result;
};
try {
// Third argument is 50 here; presumably the processor's priority (TODO confirm).
Kreuzberg.registerPostProcessor("pdf-metadata-extractor", pdfMetadata, 50);
logger.info("PDF metadata extractor initialized");
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
System.out.println("PDF processed: " + result.getMetadata().get("pdf_processed"));
logger.info("Processed " + processedCount.get() + " PDFs");
} catch (IOException | KreuzbergException e) {
e.printStackTrace();
}
}
}
```

View File

@@ -0,0 +1,16 @@
```java title="Java"
import dev.kreuzberg.PostProcessor;
import java.util.HashMap;
import java.util.Map;
// Post-processor that only acts on PDF results; every other MIME type is
// returned untouched.
PostProcessor pdfOnly = result -> {
if (!result.getMimeType().equals("application/pdf")) {
return result;
}
// NOTE(review): "pdf_processed" is written to a fresh HashMap copy of the
// metadata, and the original `result` is returned, so the copy is discarded.
// Confirm how ExtractionResult is meant to carry updated metadata.
Map<String, Object> metadata = new HashMap<>(result.getMetadata());
metadata.put("pdf_processed", true);
return result;
};
```

View File

@@ -0,0 +1,17 @@
<!-- snippet:skip reason="The Java binding generates IDocumentExtractor + DocumentExtractorBridge but the InternalDocument Java class referenced by the interface is not generated by the alef Java backend. Custom DocumentExtractor implementations cannot construct return values until the alef-generated Panama type for InternalDocument lands." -->
```java title="Java"
import dev.kreuzberg.IDocumentExtractor;
import dev.kreuzberg.DocumentExtractorBridge;
// Java's Panama FFM binding exposes the IDocumentExtractor interface and the
// DocumentExtractorBridge.registerDocumentExtractor / unregisterDocumentExtractor
// helpers, but the InternalDocument return type is referenced from the
// interface signature without a corresponding generated Java class. Until the
// alef Java backend emits dev.kreuzberg.InternalDocument, custom Java
// DocumentExtractor implementations cannot return a value from extract_bytes /
// extract_file.
//
// Implement the extractor in Rust as `Plugin + DocumentExtractor` and register
// it via `register_document_extractor` in a Rust shim crate that links
// kreuzberg before the JVM loads the native library.
```

View File

@@ -0,0 +1,22 @@
```java title="Java"
import java.util.logging.Logger;
import java.util.logging.Level;
class MyPlugin implements PostProcessor {
    private static final Logger logger = Logger.getLogger(MyPlugin.class.getName());

    /**
     * Logs the MIME type and content length of the result, warns when the content
     * is empty, and returns the result unchanged.
     */
    @Override
    public ExtractionResult process(ExtractionResult result) {
        // length() is a character count (UTF-16 code units for a String), not a byte
        // count, so the message is labelled "characters".
        logger.info("Processing " + result.mimeType() +
                " (" + result.content().length() + " characters)");
        // Processing...
        if (result.content().isEmpty()) {
            logger.warning("Processing resulted in empty content");
        }
        return result;
    }
}
```

View File

@@ -0,0 +1,37 @@
```java title="Java"
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.PostProcessor;
import org.junit.jupiter.api.Test;
import java.util.HashMap;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.*;
class PostProcessorTest {
    /** Runs a word-counting post-processor over a three-word plain-text result and checks the recorded count. */
    @Test
    void testWordCountProcessor() {
        PostProcessor processor = result -> {
            long count = result.getContent().split("\\s+").length;
            // NOTE(review): "word_count" is written to a fresh HashMap copy of the
            // metadata, and the original `result` is returned, so the copy is discarded
            // and the assertion below will presumably still see no "word_count" entry.
            // Confirm how ExtractionResult is meant to carry updated metadata.
            Map<String, Object> metadata = new HashMap<>(result.getMetadata());
            metadata.put("word_count", count);
            return result;
        };
        ExtractionResult input = new ExtractionResult(
                "Hello world test",
                "text/plain",
                new HashMap<>(),
                java.util.List.of(),
                java.util.List.of(),
                java.util.List.of(),
                java.util.List.of(),
                true
        );
        ExtractionResult output = processor.process(input);
        // count is a long, so the map stores a boxed Long. An int literal would be
        // boxed to Integer, and Integer.equals(Long) is always false.
        assertEquals(3L, output.getMetadata().get("word_count"));
    }
}
```

View File

@@ -0,0 +1,61 @@
```java title="Java"
import dev.kreuzberg.IValidator;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ValidatorBridge;
// Generic validator pattern: every IValidator has the same shape.
// name() keys the registry, priority() orders execution (higher = earlier),
// should_validate() is a fast skip-check, and validate() throws on failure.
public class GenericValidator implements IValidator {
    private final String registryName;
    private final int executionPriority;

    /** Creates a validator that reports the given name and priority. */
    public GenericValidator(String pluginName, int pluginPriority) {
        registryName = pluginName;
        executionPriority = pluginPriority;
    }

    /** Registers a validator named "non-empty-content" with priority 200. */
    public static void registerGenericValidator() {
        ValidatorBridge.registerValidator(new GenericValidator("non-empty-content", 200));
    }

    @Override
    public String name() {
        return registryName;
    }

    @Override
    public String version() {
        return "1.0.0";
    }

    @Override
    public int priority() {
        return executionPriority;
    }

    @Override
    public void initialize() {
        // Optional: open resources, load config files, etc.
    }

    @Override
    public void shutdown() {
        // Optional: release resources held in initialize().
    }

    /** Always opts in; a real validator can return false here to skip cheaply. */
    @Override
    public boolean should_validate(ExtractionResult _result, ExtractionConfig _config) {
        return true;
    }

    /** Fails with {@link IllegalArgumentException} when the extracted content is null or blank. */
    @Override
    public void validate(ExtractionResult result, ExtractionConfig config) throws Exception {
        boolean hasContent = result.content() != null && !result.content().isBlank();
        if (!hasContent) {
            throw new IllegalArgumentException("Extracted content is blank");
        }
    }
}
```

View File

@@ -0,0 +1,11 @@
```java title="Java"
// Rejects results whose quality score is below 0.5; a missing score counts as 0.0.
Validator qualityValidator = result -> {
    double score = 0.0;
    if (result.getQualityScore() != null) {
        score = result.getQualityScore();
    }
    if (score < 0.5) {
        String message = String.format("Quality score too low: %.2f < 0.50", score);
        throw new ValidationException(message);
    }
};
```

View File

@@ -0,0 +1,27 @@
```java title="Java"
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
class StatefulPlugin implements PostProcessor {
    // Simple counter: an atomic integer is enough.
    private final AtomicInteger invocations = new AtomicInteger(0);
    // Keyed state: a concurrent map.
    private final ConcurrentHashMap<String, String> lastSeen = new ConcurrentHashMap<>();

    /** Counts the call, records the result's MIME type under "last_mime", and returns the result unchanged. */
    @Override
    public ExtractionResult process(ExtractionResult result) {
        invocations.incrementAndGet();
        lastSeen.put("last_mime", result.mimeType());
        return result;
    }

    /** Returns how many times {@code process} has been called. */
    public int getCallCount() {
        return invocations.get();
    }
}
```

View File

@@ -0,0 +1,11 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
// Remove one post-processor and one validator, each identified by name.
try {
    Kreuzberg.unregisterPostProcessor("word-count");
    Kreuzberg.unregisterValidator("min-length");
} catch (KreuzbergException unregisterFailure) {
    String reason = unregisterFailure.getMessage();
    System.err.println("Failed to unregister: " + reason);
}
```

View File

@@ -0,0 +1,31 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.PostProcessor;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class WordCountExample {
    /**
     * Registers a word-counting post-processor, extracts a document, and prints the
     * "word_count" metadata entry.
     */
    public static void main(String[] args) {
        PostProcessor wordCount = result -> {
            String content = result.getContent();
            // split("\\s+") alone reports 1 word for empty content and an extra empty
            // token when the text starts with whitespace, so blank content is
            // short-circuited and surrounding whitespace is stripped first.
            long count = content.isBlank() ? 0 : content.strip().split("\\s+").length;
            // NOTE(review): "word_count" is written to a fresh HashMap copy of the
            // metadata, and the original `result` is returned, so the copy is discarded
            // and the lookup in the try block below will presumably find nothing.
            // Confirm how ExtractionResult is meant to carry updated metadata.
            Map<String, Object> metadata = new HashMap<>(result.getMetadata());
            metadata.put("word_count", count);
            return result;
        };
        try {
            Kreuzberg.registerPostProcessor("word-count", wordCount, 50);
            ExtractionResult result = Kreuzberg.extractFile("document.pdf");
            System.out.println("Word count: " + result.getMetadata().get("word_count"));
        } catch (IOException | KreuzbergException e) {
            e.printStackTrace();
        }
    }
}
```

View File

@@ -0,0 +1,19 @@
```java title="Java"
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import dev.kreuzberg.ExtractionConfig;
// Built inside-out: embedding model -> embedding config -> chunking -> extraction config.
EmbeddingModelType model = EmbeddingModelType.builder()
        .type("preset")
        .name("text-embedding-all-minilm-l6-v2")
        .build();
EmbeddingConfig embedding = EmbeddingConfig.builder().model(model).build();
ChunkingConfig chunking = ChunkingConfig.builder()
        .maxChars(1500)
        .maxOverlap(200)
        .embedding(embedding)
        .build();
ExtractionConfig config = ExtractionConfig.builder().chunking(chunking).build();
```

View File

@@ -0,0 +1,19 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
ChunkingConfig chunking = ChunkingConfig.builder()
        .maxChars(500)
        .maxOverlap(50)
        .embedding("balanced")
        .build();
ExtractionConfig config = ExtractionConfig.builder().chunking(chunking).build();
ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);
// Preview: at most the first 100 characters, so short content does not overrun.
int previewLength = Math.min(100, result.getContent().length());
String preview = result.getContent().substring(0, previewLength);
System.out.println("Content: " + preview + "...");
```

View File

@@ -0,0 +1,12 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
// Chunking settings are built first, then attached to the extraction config.
ChunkingConfig chunking = ChunkingConfig.builder()
        .maxChars(1024)
        .maxOverlap(100)
        .embedding("balanced")
        .build();
ExtractionConfig config = ExtractionConfig.builder().chunking(chunking).build();
```

View File

@@ -0,0 +1,4 @@
```java title="Java"
// Note: Keyword extraction is not yet available in Java bindings
// This feature requires the 'keywords' feature flag and is planned for a future release
```

View File

@@ -0,0 +1,20 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
ExtractionConfig config = ExtractionConfig.builder().enableQualityProcessing(true).build();
ExtractionResult result = Kreuzberg.extractFile("scanned_document.pdf", config);
// A missing quality score is treated as 0.0.
double qualityScore = 0.0;
if (result.getQualityScore() != null) {
    qualityScore = result.getQualityScore();
}
boolean lowQuality = qualityScore < 0.5;
if (lowQuality) {
    System.out.printf("Warning: Low quality extraction (%.2f)%n", qualityScore);
    System.out.println("Consider re-scanning or adjusting OCR settings");
} else {
    System.out.printf("Quality score: %.2f%n", qualityScore);
}
```

Some files were not shown because too many files have changed in this diff Show More