Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
```java title="Element-Based Output (Java)"
import io.kreuzberg.Kreuzberg;
import io.kreuzberg.ExtractionConfig;
import io.kreuzberg.ExtractionResult;
import io.kreuzberg.Element;
import io.kreuzberg.OutputFormat;
/**
 * Demonstrates element-based output: extracts "document.pdf", prints every
 * element (type, a text preview, and page/coordinates when present), then
 * lists the elements whose type is "title" together with their "level" entry.
 */
public class ElementBasedOutput {
    /** Maximum number of characters of an element's text that is printed. */
    private static final int PREVIEW_LENGTH = 100;

    public static void main(String[] args) {
        // Configure element-based output
        ExtractionConfig config = new ExtractionConfig();
        config.setOutputFormat(OutputFormat.ELEMENT_BASED);

        // Extract document
        ExtractionResult result = Kreuzberg.extractFileSync("document.pdf", config);

        // Access elements
        for (Element element : result.getElements()) {
            System.out.println("Type: " + element.getElementType());

            String text = element.getText();
            if (text.length() > PREVIEW_LENGTH) {
                text = text.substring(0, PREVIEW_LENGTH);
            }
            System.out.println("Text: " + text);

            // Look the metadata up once instead of once per field access.
            var metadata = element.getMetadata();
            if (metadata.getPageNumber() != null) {
                System.out.println("Page: " + metadata.getPageNumber());
            }
            var coords = metadata.getCoordinates();
            if (coords != null) {
                System.out.printf("Coords: (%f, %f) - (%f, %f)%n",
                    coords.getLeft(), coords.getTop(),
                    coords.getRight(), coords.getBottom());
            }
            System.out.println("---");
        }

        // Filter by element type
        result.getElements().stream()
            .filter(e -> "title".equals(e.getElementType()))
            .forEach(title -> {
                // String.valueOf instead of a (String) cast: a non-String
                // "level" value (e.g. a number) no longer raises ClassCastException.
                String level = String.valueOf(title.getMetadata()
                    .getAdditional()
                    .getOrDefault("level", "unknown"));
                System.out.printf("[%s] %s%n", level, title.getText());
            });
    }
}
```

View File

@@ -0,0 +1,41 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.*;
import java.io.IOException;
/**
 * Builds an extraction configuration with OCR, chunking, token reduction and
 * language detection, extracts "document.pdf", and prints the detected
 * languages when there are any.
 */
public class Main {
    public static void main(String[] args) {
        try {
            // Each sub-configuration is built on its own, then combined below.
            OcrConfig ocr = OcrConfig.builder()
                .backend("tesseract")
                .language("eng+deu")
                .build();
            ChunkingConfig chunking = ChunkingConfig.builder()
                .maxChars(1000)
                .maxOverlap(100)
                .build();
            TokenReductionConfig tokenReduction = TokenReductionConfig.builder()
                .mode("moderate")
                .preserveImportantWords(true)
                .build();
            LanguageDetectionConfig languageDetection = LanguageDetectionConfig.builder()
                .enabled(true)
                .build();

            ExtractionConfig config = ExtractionConfig.builder()
                .ocr(ocr)
                .chunking(chunking)
                .tokenReduction(tokenReduction)
                .languageDetection(languageDetection)
                .useCache(true)
                .enableQualityProcessing(true)
                .build();

            ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);

            var languages = result.getDetectedLanguages();
            if (!languages.isEmpty()) {
                System.out.println("Languages: " + languages);
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
```

View File

@@ -0,0 +1,60 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
// Chunking settings are built first, then attached to the extraction config.
ChunkingConfig chunking = ChunkingConfig.builder()
    .maxChars(1000)
    .maxOverlap(200)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunking)
    .build();
```
```java title="Java - Markdown with Heading Context"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.HeadingContext;
import dev.kreuzberg.HeadingLevel;
// Markdown chunker settings, built separately from the extraction config.
ChunkingConfig markdownChunking = ChunkingConfig.builder()
    .chunkerType("markdown")
    .maxChars(500)
    .maxOverlap(50)
    .sizingTokenizer("Xenova/gpt-4o")
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(markdownChunking)
    .build();
ExtractionResult result = KreuzbergClient.extractFile("document.md", config);
// Print the heading trail of every chunk that carries a heading context.
result.getChunks().forEach(chunk ->
    chunk.getMetadata().getHeadingContext().ifPresent(context -> {
        System.out.println("Headings:");
        context.getHeadings().forEach(heading ->
            System.out.println(" Level " + heading.getLevel() + ": " + heading.getText())
        );
    })
);
```
```java title="Java - Prepend Heading Context"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
// Markdown chunking with the heading context prepended to each chunk.
ChunkingConfig markdownChunking = ChunkingConfig.builder()
    .chunkerType("markdown")
    .maxChars(500)
    .maxOverlap(50)
    .prependHeadingContext(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(markdownChunking)
    .build();
ExtractionResult result = KreuzbergClient.extractFile("document.md", config);
result.getChunks().forEach(chunk -> {
    // Each chunk's content is prefixed with its heading breadcrumb;
    // print at most the first 100 characters of it.
    var content = chunk.getContent();
    int previewEnd = Math.min(100, content.length());
    System.out.println(content.substring(0, previewEnd));
});
```

View File

@@ -0,0 +1,11 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
// Switch on both the cache flag and the quality-processing flag.
ExtractionConfig config = ExtractionConfig.builder()
    .enableQualityProcessing(true)
    .useCache(true)
    .build();
String documentPath = "document.pdf";
ExtractionResult result = Kreuzberg.extractFile(documentPath, config);
```

View File

@@ -0,0 +1,8 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
// The configuration comes from Kreuzberg's discovery call rather than a builder.
ExtractionConfig config = Kreuzberg.discoverExtractionConfig();
String documentPath = "document.pdf";
ExtractionResult result = Kreuzberg.extractFile(documentPath, config);
```

View File

@@ -0,0 +1,14 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Path;
/**
 * Loads an extraction configuration from "kreuzberg.toml", extracts
 * "document.pdf" with it, and prints the MIME type reported by the result.
 */
public final class ConfigFileExample {
    public static void main(String[] args) throws Exception {
        Path configFile = Path.of("kreuzberg.toml");
        Path document = Path.of("document.pdf");

        ExtractionConfig config = Kreuzberg.loadExtractionConfigFromFile(configFile);
        ExtractionResult result = Kreuzberg.extractFile(document, config);

        System.out.printf("Detected MIME: %s%n", result.getMimeType());
    }
}
```

View File

@@ -0,0 +1,15 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.TesseractConfig;
// Tesseract-specific settings, nested into the OCR config below.
TesseractConfig tesseract = TesseractConfig.builder()
    .psm(3)
    .build();
OcrConfig ocr = OcrConfig.builder()
    .backend("tesseract")
    .language("eng+fra")
    .tesseractConfig(tesseract)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .ocr(ocr)
    .build();
```

View File

@@ -0,0 +1,31 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.TesseractConfig;
/**
 * Assembles an extraction configuration entirely in code (OCR with Tesseract
 * settings, chunking, cache and quality-processing flags), extracts
 * "document.pdf", and prints the length of the extracted content.
 */
public final class ProgrammaticConfigExample {
    public static void main(String[] args) throws Exception {
        // OCR: Tesseract backend with its own nested settings.
        TesseractConfig tesseract = TesseractConfig.builder()
            .psm(6)
            .build();
        OcrConfig ocr = OcrConfig.builder()
            .backend("tesseract")
            .language("eng+deu")
            .tesseractConfig(tesseract)
            .build();

        // Chunking limits.
        ChunkingConfig chunking = ChunkingConfig.builder()
            .maxChars(1000)
            .maxOverlap(200)
            .build();

        ExtractionConfig config = ExtractionConfig.builder()
            .ocr(ocr)
            .chunking(chunking)
            .useCache(true)
            .enableQualityProcessing(true)
            .build();

        ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
        int contentLength = result.getContent().length();
        System.out.printf("Content length: %d%n", contentLength);
    }
}
```

View File

@@ -0,0 +1,18 @@
```java title="Document Structure Config (Java)"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ExtractionResult;
// Request the document structure as part of the extraction result.
ExtractionConfig config = ExtractionConfig.builder()
    .includeDocumentStructure(true)
    .build();
ExtractionResult result = Kreuzberg.extractFileSync("document.pdf", config);
// Print the node type of every node, but only when a structure is present.
result.getDocumentStructure().ifPresent(document -> {
    for (var node : document.nodes()) {
        System.out.println("[" + node.content().nodeType() + "]");
    }
});
```

View File

@@ -0,0 +1,53 @@
```java title="Element-Based Output (Java)"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.Element;
import dev.kreuzberg.ResultFormat;
import java.nio.file.Path;
import java.util.List;
/**
 * Extracts "document.pdf" in element-based result format, prints each element
 * (type, up to 100 characters of text, page number and coordinates when set),
 * then prints the elements whose type reads "Title" (case-insensitive).
 */
public class ElementBasedOutput {
    public static void main(String[] args) throws Exception {
        // Configure element-based output
        ExtractionConfig config = ExtractionConfig.builder()
            .withResultFormat(ResultFormat.ElementBased)
            .build();

        // Extract document
        Path document = Path.of("document.pdf");
        ExtractionResult result = Kreuzberg.extractFileSync(document, config);

        // Access elements; nothing to print when the result carries none.
        List<Element> elements = result.elements();
        if (elements == null) {
            return;
        }

        for (Element element : elements) {
            System.out.println("Type: " + element.elementType());

            String text = element.text();
            String preview = text.length() > 100 ? text.substring(0, 100) : text;
            System.out.println("Text: " + preview);

            var metadata = element.metadata();
            var pageNumber = metadata.pageNumber();
            if (pageNumber != null) {
                System.out.println("Page: " + pageNumber);
            }
            var coordinates = metadata.coordinates();
            if (coordinates != null) {
                System.out.println("Coords: " + coordinates);
            }
            System.out.println("---");
        }

        // Filter by element type
        elements.stream()
            .filter(e -> "Title".equalsIgnoreCase(String.valueOf(e.elementType())))
            .forEach(title -> {
                String level = title.metadata().additional().getOrDefault("level", "unknown");
                System.out.printf("[%s] %s%n", level, title.text());
            });
    }
}
```

View File

@@ -0,0 +1,87 @@
import kreuzberg.config.EmbeddingConfig;
import kreuzberg.config.EmbeddingModelType;
import kreuzberg.config.ChunkingConfig;
import kreuzberg.config.ExtractionConfig;
/**
 * Shows three ways to build an {@code EmbeddingConfig} (a preset model and two
 * custom ONNX models) and how to attach an embedding configuration to a
 * {@code ChunkingConfig} and, through it, to an {@code ExtractionConfig}.
 */
public class EmbeddingConfigExample {
    public static void main(String[] args) {
        // Java hands "~" to the callee literally (there is no shell-style
        // expansion), so the per-user cache path is resolved explicitly
        // against the "user.home" system property.
        String userCacheDir = System.getProperty("user.home") + "/.cache/kreuzberg/embeddings";

        // Example 1: Preset model (recommended)
        // Fast, balanced, or quality preset configurations optimized for common use cases.
        EmbeddingConfig embeddingConfig = EmbeddingConfig.builder()
            .model(EmbeddingModelType.preset("balanced"))
            .batchSize(32)
            .normalize(true)
            .showDownloadProgress(true)
            .cacheDir(userCacheDir)
            .build();
        // Available presets:
        // - "fast" (384 dims): Quick prototyping, development, resource-constrained
        // - "balanced" (768 dims): Production, general-purpose RAG, English documents
        // - "quality" (1024 dims): Complex documents, maximum accuracy
        // - "multilingual" (768 dims): International documents, 100+ languages

        // Example 2: Custom ONNX model (requires embeddings feature)
        // Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
        embeddingConfig = EmbeddingConfig.builder()
            .model(EmbeddingModelType.custom("BAAI/bge-small-en-v1.5", 384))
            .batchSize(32)
            .normalize(true)
            .showDownloadProgress(true)
            .cacheDir(null) // Uses default: .kreuzberg/embeddings/
            .build();
        // Popular ONNX-compatible models:
        // - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
        // - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
        // - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
        // - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support

        // Example 3: Alternative Custom ONNX Model
        // For advanced users wanting different ONNX embedding models.
        embeddingConfig = EmbeddingConfig.builder()
            .model(EmbeddingModelType.custom("sentence-transformers/all-mpnet-base-v2", 768))
            .batchSize(16) // Larger model requires smaller batch size
            .normalize(true)
            .showDownloadProgress(true)
            .cacheDir("/var/cache/embeddings")
            .build();

        // Integration with ChunkingConfig
        // Add embeddings to your chunking configuration:
        ChunkingConfig chunkingConfig = ChunkingConfig.builder()
            .maxChars(1024)
            .maxOverlap(100)
            .preset("balanced")
            .embedding(EmbeddingConfig.builder()
                .model(EmbeddingModelType.preset("balanced"))
                .batchSize(32)
                .normalize(true)
                .build())
            .build();
        ExtractionConfig extractionConfig = ExtractionConfig.builder()
            .chunking(chunkingConfig)
            .build();
    }
}
// Key parameter explanations:
//
// batchSize: Number of texts to embed at once (32-128 typical)
// - Larger batches are faster but use more memory
// - Smaller batches for resource-constrained environments
//
// normalize: Whether to normalize vectors (L2 norm)
// - true (recommended): Enables cosine similarity in vector DBs
// - false: Raw embedding values
//
// cacheDir: Where to store downloaded models
// - null: Uses .kreuzberg/embeddings/ in current directory
// - String path: Custom directory for model storage
//
// showDownloadProgress: Display download progress bar
// - Useful for monitoring large model downloads

View File

@@ -0,0 +1,21 @@
```java title="Java"
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import dev.kreuzberg.ExtractionConfig;
// Model selection for the embeddings.
EmbeddingModelType model = EmbeddingModelType.builder()
    .type("preset")
    .name("all-mpnet-base-v2")
    .build();
// Embedding settings that use the model above.
EmbeddingConfig embedding = EmbeddingConfig.builder()
    .model(model)
    .batchSize(16)
    .normalize(true)
    .showDownloadProgress(true)
    .build();
// Chunking carries the embedding settings; extraction carries the chunking.
ChunkingConfig chunking = ChunkingConfig.builder()
    .maxChars(1000)
    .embedding(embedding)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunking)
    .build();
```

View File

@@ -0,0 +1,90 @@
import kreuzberg.config.HierarchyConfig;
import kreuzberg.config.PdfConfig;
import kreuzberg.config.ExtractionConfig;
import kreuzberg.Kreuzberg;
/**
 * Builds three extraction configurations that differ only in their PDF
 * hierarchy settings (default clustering, minimal clustering, and clustering
 * with an OCR coverage threshold).
 */
public class HierarchyConfigExample {
    public static void main(String[] args) throws Exception {
        // Example 1: Basic hierarchy extraction
        // Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
        // Extract bounding box information for spatial layout awareness.
        ExtractionConfig basicExtraction = ExtractionConfig.builder()
            .pdfOptions(PdfConfig.builder()
                .hierarchy(HierarchyConfig.builder()
                    .enabled(true)
                    .kClusters(6) // Default: creates 6 font size clusters (H1-H6 structure)
                    .includeBbox(true) // Include bounding box coordinates
                    .ocrCoverageThreshold(null) // No OCR coverage threshold
                    .build())
                .build())
            .build();
        Kreuzberg kreuzberg = new Kreuzberg(basicExtraction);
        // var result = kreuzberg.extractFileSync("document.pdf");

        // Example 2: Custom kClusters for minimal structure
        // Use 3 clusters for simpler hierarchy with minimal structure.
        // Useful when you only need major section divisions (Main, Subsection, Detail).
        ExtractionConfig minimalExtraction = ExtractionConfig.builder()
            .pdfOptions(PdfConfig.builder()
                .hierarchy(HierarchyConfig.builder()
                    .enabled(true)
                    .kClusters(3) // Minimal clustering: just 3 levels
                    .includeBbox(true)
                    .ocrCoverageThreshold(null)
                    .build())
                .build())
            .build();

        // Example 3: With OCR coverage threshold
        // Trigger OCR if less than 50% of text has font data.
        // Useful for documents with mixed digital and scanned content.
        ExtractionConfig ocrExtraction = ExtractionConfig.builder()
            .pdfOptions(PdfConfig.builder()
                .hierarchy(HierarchyConfig.builder()
                    .enabled(true)
                    .kClusters(6)
                    .includeBbox(true)
                    .ocrCoverageThreshold(0.5f) // Trigger OCR if text coverage < 50%
                    .build())
                .build())
            .build();
    }
}
// Field descriptions:
//
// enabled: boolean (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// kClusters: int (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// includeBbox: boolean (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// ocrCoverageThreshold: Float (default: null)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5f means "run OCR if less than 50% of page has text data"
// - null means no OCR coverage-based triggering

View File

@@ -0,0 +1,27 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.HtmlOutputConfig;
import dev.kreuzberg.HtmlTheme;
import dev.kreuzberg.OutputFormat;
import java.nio.file.Path;
import java.util.Optional;
/**
 * Extracts "document.pdf" as HTML using the GitHub theme with embedded CSS
 * and prints the resulting content.
 */
public class HtmlOutput {
    public static void main(String[] args) throws Exception {
        Path document = Path.of("document.pdf");

        // Theme and CSS embedding for the generated HTML.
        HtmlOutputConfig themedHtml = HtmlOutputConfig.builder()
            .withEmbedCss(true)
            .withTheme(HtmlTheme.GitHub)
            .build();

        ExtractionConfig config = ExtractionConfig.builder()
            .withHtmlOutput(Optional.of(themedHtml))
            .withOutputFormat(OutputFormat.Html)
            .build();

        ExtractionResult result = Kreuzberg.extractFileSync(document, config);
        System.out.println(result.content()); // HTML with kb-* classes
    }
}
```

View File

@@ -0,0 +1,75 @@
import com.kreuzberg.Kreuzberg;
import com.kreuzberg.config.ExtractionConfig;
import com.kreuzberg.config.KeywordConfig;
import com.kreuzberg.keywords.YakeParams;
import com.kreuzberg.keywords.RakeParams;
import com.kreuzberg.result.ExtractionResult;
// Example 1: Basic YAKE configuration
// Uses YAKE algorithm with default parameters and English stopword filtering
/**
 * Three keyword-extraction configurations (basic YAKE, tuned YAKE, RAKE).
 * Each one extracts "document.pdf" and prints the keywords from the result;
 * {@code main} runs only the basic YAKE example.
 */
public class KeywordConfigExample {
    /** Extracts "document.pdf" with the given configuration and prints its keywords. */
    private static void extractAndPrintKeywords(ExtractionConfig config) throws Exception {
        ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
        System.out.println("Keywords: " + result.getKeywords());
    }

    public static void basicYake() throws Exception {
        KeywordConfig keywords = new KeywordConfig.Builder()
            .algorithm("yake")
            .maxKeywords(10)
            .minScore(0.0f)
            .ngramRange(1, 3)
            .language("en")
            .yakeParams(null)
            .rakeParams(null)
            .build();
        extractAndPrintKeywords(new ExtractionConfig.Builder()
            .keywords(keywords)
            .build());
    }

    // Example 2: Advanced YAKE with custom parameters
    // Fine-tunes YAKE with custom window size for co-occurrence analysis
    public static void advancedYake() throws Exception {
        YakeParams yake = new YakeParams.Builder()
            .windowSize(1)
            .build();
        KeywordConfig keywords = new KeywordConfig.Builder()
            .algorithm("yake")
            .maxKeywords(15)
            .minScore(0.1f)
            .ngramRange(1, 2)
            .language("en")
            .yakeParams(yake)
            .rakeParams(null)
            .build();
        extractAndPrintKeywords(new ExtractionConfig.Builder()
            .keywords(keywords)
            .build());
    }

    // Example 3: RAKE configuration
    // Uses RAKE algorithm for rapid keyword extraction with phrase constraints
    public static void rakeConfig() throws Exception {
        RakeParams rake = new RakeParams.Builder()
            .minWordLength(1)
            .maxWordsPerPhrase(3)
            .build();
        KeywordConfig keywords = new KeywordConfig.Builder()
            .algorithm("rake")
            .maxKeywords(10)
            .minScore(5.0f)
            .ngramRange(1, 3)
            .language("en")
            .yakeParams(null)
            .rakeParams(rake)
            .build();
        extractAndPrintKeywords(new ExtractionConfig.Builder()
            .keywords(keywords)
            .build());
    }

    public static void main(String[] args) throws Exception {
        basicYake();
    }
}

View File

@@ -0,0 +1,4 @@
```java title="Java"
// Note: Keyword extraction is not yet available in the Java bindings.
// This feature requires the 'keywords' feature flag and is planned for a future release.
```

View File

@@ -0,0 +1,11 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.LanguageDetectionConfig;
// Language detection is enabled with a minimum confidence of 0.8.
LanguageDetectionConfig languageDetection = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(0.8)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(languageDetection)
    .build();
```

View File

@@ -0,0 +1,18 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.ImagePreprocessingConfig;
// OCR backend and image preprocessing are configured separately, then combined.
OcrConfig ocr = OcrConfig.builder()
    .backend("tesseract")
    .build();
ImagePreprocessingConfig preprocessing = ImagePreprocessingConfig.builder()
    .targetDpi(300)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .ocr(ocr)
    .imagePreprocessing(preprocessing)
    .build();
String scannedDocument = "scanned.pdf";
ExtractionResult result = Kreuzberg.extractFile(scannedDocument, config);
```

View File

@@ -0,0 +1,15 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.PdfConfig;
import dev.kreuzberg.HierarchyConfig;
import java.util.Arrays;
// PDF options: image and metadata extraction, two candidate passwords,
// and a hierarchy config left at its builder defaults.
HierarchyConfig hierarchy = HierarchyConfig.builder().build();
PdfConfig pdf = PdfConfig.builder()
    .extractImages(true)
    .extractMetadata(true)
    .passwords(Arrays.asList("password1", "password2"))
    .hierarchyConfig(hierarchy)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .pdfOptions(pdf)
    .build();
```

View File

@@ -0,0 +1,17 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.PdfConfig;
import dev.kreuzberg.HierarchyConfig;
// Hierarchy settings are built first, then wrapped in the PDF options.
HierarchyConfig hierarchy = HierarchyConfig.builder()
    .enabled(true)
    .detectionThreshold(0.75)
    .ocrCoverageThreshold(0.8)
    .minLevel(1)
    .maxLevel(5)
    .build();
PdfConfig pdf = PdfConfig.builder()
    .hierarchyConfig(hierarchy)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .pdfOptions(pdf)
    .build();
```

View File

@@ -0,0 +1,13 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.PostProcessorConfig;
import java.util.Arrays;
// Post-processing: two processors listed as enabled, one listed as disabled.
PostProcessorConfig postProcessing = PostProcessorConfig.builder()
    .enabled(true)
    .enabledProcessors(Arrays.asList("deduplication", "whitespace_normalization"))
    .disabledProcessors(Arrays.asList("mojibake_fix"))
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .postprocessor(postProcessing)
    .build();
```

View File

@@ -0,0 +1,7 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
// Quality processing, set explicitly to the value the docs mark as the default.
boolean qualityProcessing = true; // Default
ExtractionConfig config = ExtractionConfig.builder()
    .enableQualityProcessing(qualityProcessing)
    .build();
```

View File

@@ -0,0 +1,18 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.TesseractConfig;
// Tesseract tuning: segmentation/engine modes, confidence floor,
// a character whitelist, and table detection.
TesseractConfig tesseract = TesseractConfig.builder()
    .psm(6)
    .oem(1)
    .minConfidence(0.8)
    .tesseditCharWhitelist("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?")
    .enableTableDetection(true)
    .build();
OcrConfig ocr = OcrConfig.builder()
    .language("eng+fra+deu")
    .tesseractConfig(tesseract)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .ocr(ocr)
    .build();
```

View File

@@ -0,0 +1,11 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.TokenReductionConfig;
// Token reduction in "moderate" mode, keeping important words.
TokenReductionConfig tokenReduction = TokenReductionConfig.builder()
    .mode("moderate")
    .preserveImportantWords(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .tokenReduction(tokenReduction)
    .build();
```