This commit is contained in:
52
docs/snippets/java/config/ElementBasedOutput.md
Normal file
52
docs/snippets/java/config/ElementBasedOutput.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```java title="Element-Based Output (Java)"
|
||||
import io.kreuzberg.Kreuzberg;
|
||||
import io.kreuzberg.ExtractionConfig;
|
||||
import io.kreuzberg.ExtractionResult;
|
||||
import io.kreuzberg.Element;
|
||||
import io.kreuzberg.OutputFormat;
|
||||
|
||||
public class ElementBasedOutput {
|
||||
public static void main(String[] args) {
|
||||
// Configure element-based output
|
||||
ExtractionConfig config = new ExtractionConfig();
|
||||
config.setOutputFormat(OutputFormat.ELEMENT_BASED);
|
||||
|
||||
// Extract document
|
||||
ExtractionResult result = Kreuzberg.extractFileSync("document.pdf", config);
|
||||
|
||||
// Access elements
|
||||
for (Element element : result.getElements()) {
|
||||
System.out.println("Type: " + element.getElementType());
|
||||
|
||||
String text = element.getText();
|
||||
if (text.length() > 100) {
|
||||
text = text.substring(0, 100);
|
||||
}
|
||||
System.out.println("Text: " + text);
|
||||
|
||||
if (element.getMetadata().getPageNumber() != null) {
|
||||
System.out.println("Page: " + element.getMetadata().getPageNumber());
|
||||
}
|
||||
|
||||
if (element.getMetadata().getCoordinates() != null) {
|
||||
var coords = element.getMetadata().getCoordinates();
|
||||
System.out.printf("Coords: (%f, %f) - (%f, %f)%n",
|
||||
coords.getLeft(), coords.getTop(),
|
||||
coords.getRight(), coords.getBottom());
|
||||
}
|
||||
|
||||
System.out.println("---");
|
||||
}
|
||||
|
||||
// Filter by element type
|
||||
result.getElements().stream()
|
||||
.filter(e -> "title".equals(e.getElementType()))
|
||||
.forEach(title -> {
|
||||
String level = (String) title.getMetadata()
|
||||
.getAdditional()
|
||||
.getOrDefault("level", "unknown");
|
||||
System.out.printf("[%s] %s%n", level, title.getText());
|
||||
});
|
||||
}
|
||||
}
|
||||
```
|
||||
41
docs/snippets/java/config/advanced_config.md
Normal file
41
docs/snippets/java/config/advanced_config.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import dev.kreuzberg.*;
|
||||
import java.io.IOException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.language("eng+deu")
|
||||
.build())
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1000)
|
||||
.maxOverlap(100)
|
||||
.build())
|
||||
.tokenReduction(TokenReductionConfig.builder()
|
||||
.mode("moderate")
|
||||
.preserveImportantWords(true)
|
||||
.build())
|
||||
.languageDetection(LanguageDetectionConfig.builder()
|
||||
.enabled(true)
|
||||
.build())
|
||||
.useCache(true)
|
||||
.enableQualityProcessing(true)
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
|
||||
if (!result.getDetectedLanguages().isEmpty()) {
|
||||
System.out.println("Languages: " + result.getDetectedLanguages());
|
||||
}
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
60
docs/snippets/java/config/chunking_config.md
Normal file
60
docs/snippets/java/config/chunking_config.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1000)
|
||||
.maxOverlap(200)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
|
||||
```java title="Java - Markdown with Heading Context"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.HeadingContext;
|
||||
import dev.kreuzberg.HeadingLevel;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.chunkerType("markdown")
|
||||
.maxChars(500)
|
||||
.maxOverlap(50)
|
||||
.sizingTokenizer("Xenova/gpt-4o")
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = KreuzbergClient.extractFile("document.md", config);
|
||||
|
||||
result.getChunks().forEach(chunk -> {
|
||||
var headingContext = chunk.getMetadata().getHeadingContext();
|
||||
if (headingContext.isPresent()) {
|
||||
System.out.println("Headings:");
|
||||
headingContext.get().getHeadings().forEach(heading ->
|
||||
System.out.println(" Level " + heading.getLevel() + ": " + heading.getText())
|
||||
);
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
```java title="Java - Prepend Heading Context"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.chunkerType("markdown")
|
||||
.maxChars(500)
|
||||
.maxOverlap(50)
|
||||
.prependHeadingContext(true)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = KreuzbergClient.extractFile("document.md", config);
|
||||
|
||||
result.getChunks().forEach(chunk -> {
|
||||
// Each chunk's content is prefixed with its heading breadcrumb
|
||||
System.out.println(chunk.getContent().substring(0, Math.min(100, chunk.getContent().length())));
|
||||
});
|
||||
```
|
||||
11
docs/snippets/java/config/config_basic.md
Normal file
11
docs/snippets/java/config/config_basic.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.useCache(true)
|
||||
.enableQualityProcessing(true)
|
||||
.build();
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
```
|
||||
8
docs/snippets/java/config/config_discover.md
Normal file
8
docs/snippets/java/config/config_discover.md
Normal file
@@ -0,0 +1,8 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
ExtractionConfig config = Kreuzberg.discoverExtractionConfig();
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
```
|
||||
14
docs/snippets/java/config/config_file.md
Normal file
14
docs/snippets/java/config/config_file.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public final class ConfigFileExample {
|
||||
public static void main(String[] args) throws Exception {
|
||||
ExtractionConfig config = Kreuzberg.loadExtractionConfigFromFile(Path.of("kreuzberg.toml"));
|
||||
ExtractionResult result = Kreuzberg.extractFile(Path.of("document.pdf"), config);
|
||||
System.out.printf("Detected MIME: %s%n", result.getMimeType());
|
||||
}
|
||||
}
|
||||
```
|
||||
15
docs/snippets/java/config/config_ocr.md
Normal file
15
docs/snippets/java/config/config_ocr.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.TesseractConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.language("eng+fra")
|
||||
.tesseractConfig(TesseractConfig.builder()
|
||||
.psm(3)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
31
docs/snippets/java/config/config_programmatic.md
Normal file
31
docs/snippets/java/config/config_programmatic.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.TesseractConfig;
|
||||
|
||||
public final class ProgrammaticConfigExample {
|
||||
public static void main(String[] args) throws Exception {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.language("eng+deu")
|
||||
.tesseractConfig(TesseractConfig.builder()
|
||||
.psm(6)
|
||||
.build())
|
||||
.build())
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1000)
|
||||
.maxOverlap(200)
|
||||
.build())
|
||||
.useCache(true)
|
||||
.enableQualityProcessing(true)
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
System.out.printf("Content length: %d%n", result.getContent().length());
|
||||
}
|
||||
}
|
||||
```
|
||||
18
docs/snippets/java/config/document_structure_config.md
Normal file
18
docs/snippets/java/config/document_structure_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```java title="Document Structure Config (Java)"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
|
||||
// Ask the extractor to include the document structure in the result.
ExtractionConfig config = ExtractionConfig.builder()
    .includeDocumentStructure(true)
    .build();

ExtractionResult result = Kreuzberg.extractFileSync("document.pdf", config);

// When a structure is present, print the type of every node in it.
result.getDocumentStructure().ifPresent(document -> {
    for (var node : document.nodes()) {
        System.out.println("[" + node.content().nodeType() + "]");
    }
});
|
||||
```
|
||||
53
docs/snippets/java/config/element_based_output.md
Normal file
53
docs/snippets/java/config/element_based_output.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```java title="Element-Based Output (Java)"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.Element;
|
||||
import dev.kreuzberg.ResultFormat;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
public class ElementBasedOutput {
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Configure element-based output
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.withResultFormat(ResultFormat.ElementBased)
|
||||
.build();
|
||||
|
||||
// Extract document
|
||||
ExtractionResult result = Kreuzberg.extractFileSync(Path.of("document.pdf"), config);
|
||||
|
||||
// Access elements
|
||||
List<Element> elements = result.elements();
|
||||
if (elements != null) {
|
||||
for (Element element : elements) {
|
||||
System.out.println("Type: " + element.elementType());
|
||||
|
||||
String text = element.text();
|
||||
if (text.length() > 100) {
|
||||
text = text.substring(0, 100);
|
||||
}
|
||||
System.out.println("Text: " + text);
|
||||
|
||||
if (element.metadata().pageNumber() != null) {
|
||||
System.out.println("Page: " + element.metadata().pageNumber());
|
||||
}
|
||||
|
||||
if (element.metadata().coordinates() != null) {
|
||||
System.out.println("Coords: " + element.metadata().coordinates());
|
||||
}
|
||||
|
||||
System.out.println("---");
|
||||
}
|
||||
|
||||
// Filter by element type
|
||||
elements.stream()
|
||||
.filter(e -> "Title".equalsIgnoreCase(String.valueOf(e.elementType())))
|
||||
.forEach(title -> {
|
||||
String level = title.metadata().additional().getOrDefault("level", "unknown");
|
||||
System.out.printf("[%s] %s%n", level, title.text());
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
87
docs/snippets/java/config/embedding_config.java
Normal file
87
docs/snippets/java/config/embedding_config.java
Normal file
@@ -0,0 +1,87 @@
|
||||
import kreuzberg.config.EmbeddingConfig;
|
||||
import kreuzberg.config.EmbeddingModelType;
|
||||
import kreuzberg.config.ChunkingConfig;
|
||||
import kreuzberg.config.ExtractionConfig;
|
||||
|
||||
public class EmbeddingConfigExample {
|
||||
public static void main(String[] args) {
|
||||
// Example 1: Preset model (recommended)
|
||||
// Fast, balanced, or quality preset configurations optimized for common use cases.
|
||||
EmbeddingConfig embeddingConfig = EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("balanced"))
|
||||
.batchSize(32)
|
||||
.normalize(true)
|
||||
.showDownloadProgress(true)
|
||||
.cacheDir("~/.cache/kreuzberg/embeddings")
|
||||
.build();
|
||||
|
||||
// Available presets:
|
||||
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
// - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
// - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
|
||||
// Example 2: Custom ONNX model (requires embeddings feature)
|
||||
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||||
embeddingConfig = EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.custom("BAAI/bge-small-en-v1.5", 384))
|
||||
.batchSize(32)
|
||||
.normalize(true)
|
||||
.showDownloadProgress(true)
|
||||
.cacheDir(null) // Uses default: .kreuzberg/embeddings/
|
||||
.build();
|
||||
|
||||
// Popular ONNX-compatible models:
|
||||
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
|
||||
// Example 3: Alternative Custom ONNX Model
|
||||
// For advanced users wanting different ONNX embedding models.
|
||||
embeddingConfig = EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.custom("sentence-transformers/all-mpnet-base-v2", 768))
|
||||
.batchSize(16) // Larger model requires smaller batch size
|
||||
.normalize(true)
|
||||
.showDownloadProgress(true)
|
||||
.cacheDir("/var/cache/embeddings")
|
||||
.build();
|
||||
|
||||
|
||||
// Integration with ChunkingConfig
|
||||
// Add embeddings to your chunking configuration:
|
||||
ChunkingConfig chunkingConfig = ChunkingConfig.builder()
|
||||
.maxChars(1024)
|
||||
.maxOverlap(100)
|
||||
.preset("balanced")
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.preset("balanced"))
|
||||
.batchSize(32)
|
||||
.normalize(true)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionConfig extractionConfig = ExtractionConfig.builder()
|
||||
.chunking(chunkingConfig)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
// Key parameter explanations:
|
||||
//
|
||||
// batchSize: Number of texts to embed at once (32-128 typical)
|
||||
// - Larger batches are faster but use more memory
|
||||
// - Smaller batches for resource-constrained environments
|
||||
//
|
||||
// normalize: Whether to normalize vectors (L2 norm)
|
||||
// - true (recommended): Enables cosine similarity in vector DBs
|
||||
// - false: Raw embedding values
|
||||
//
|
||||
// cacheDir: Where to store downloaded models
|
||||
// - null: Uses .kreuzberg/embeddings/ in current directory
|
||||
// - String path: Custom directory for model storage
|
||||
//
|
||||
// showDownloadProgress: Display download progress bar
|
||||
// - Useful for monitoring large model downloads
|
||||
21
docs/snippets/java/config/embedding_config.md
Normal file
21
docs/snippets/java/config/embedding_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
import dev.kreuzberg.EmbeddingConfig;
|
||||
import dev.kreuzberg.EmbeddingModelType;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.chunking(ChunkingConfig.builder()
|
||||
.maxChars(1000)
|
||||
.embedding(EmbeddingConfig.builder()
|
||||
.model(EmbeddingModelType.builder()
|
||||
.type("preset")
|
||||
.name("all-mpnet-base-v2")
|
||||
.build())
|
||||
.batchSize(16)
|
||||
.normalize(true)
|
||||
.showDownloadProgress(true)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
90
docs/snippets/java/config/hierarchy_config.java
Normal file
90
docs/snippets/java/config/hierarchy_config.java
Normal file
@@ -0,0 +1,90 @@
|
||||
import kreuzberg.config.HierarchyConfig;
|
||||
import kreuzberg.config.PdfConfig;
|
||||
import kreuzberg.config.ExtractionConfig;
|
||||
import kreuzberg.Kreuzberg;
|
||||
|
||||
public class HierarchyConfigExample {
|
||||
public static void main(String[] args) throws Exception {
|
||||
// Example 1: Basic hierarchy extraction
|
||||
// Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
|
||||
// Extract bounding box information for spatial layout awareness.
|
||||
HierarchyConfig hierarchyConfigBasic = HierarchyConfig.builder()
|
||||
.enabled(true)
|
||||
.kClusters(6) // Default: creates 6 font size clusters (H1-H6 structure)
|
||||
.includeBbox(true) // Include bounding box coordinates
|
||||
.ocrCoverageThreshold(null) // No OCR coverage threshold
|
||||
.build();
|
||||
|
||||
PdfConfig pdfConfigBasic = PdfConfig.builder()
|
||||
.hierarchy(hierarchyConfigBasic)
|
||||
.build();
|
||||
|
||||
ExtractionConfig extractionConfigBasic = ExtractionConfig.builder()
|
||||
.pdfOptions(pdfConfigBasic)
|
||||
.build();
|
||||
|
||||
Kreuzberg kreuzberg = new Kreuzberg(extractionConfigBasic);
|
||||
// var result = kreuzberg.extractFileSync("document.pdf");
|
||||
|
||||
|
||||
// Example 2: Custom kClusters for minimal structure
|
||||
// Use 3 clusters for simpler hierarchy with minimal structure.
|
||||
// Useful when you only need major section divisions (Main, Subsection, Detail).
|
||||
HierarchyConfig hierarchyConfigMinimal = HierarchyConfig.builder()
|
||||
.enabled(true)
|
||||
.kClusters(3) // Minimal clustering: just 3 levels
|
||||
.includeBbox(true)
|
||||
.ocrCoverageThreshold(null)
|
||||
.build();
|
||||
|
||||
PdfConfig pdfConfigMinimal = PdfConfig.builder()
|
||||
.hierarchy(hierarchyConfigMinimal)
|
||||
.build();
|
||||
|
||||
ExtractionConfig extractionConfigMinimal = ExtractionConfig.builder()
|
||||
.pdfOptions(pdfConfigMinimal)
|
||||
.build();
|
||||
|
||||
|
||||
// Example 3: With OCR coverage threshold
|
||||
// Trigger OCR if less than 50% of text has font data.
|
||||
// Useful for documents with mixed digital and scanned content.
|
||||
HierarchyConfig hierarchyConfigOcr = HierarchyConfig.builder()
|
||||
.enabled(true)
|
||||
.kClusters(6)
|
||||
.includeBbox(true)
|
||||
.ocrCoverageThreshold(0.5f) // Trigger OCR if text coverage < 50%
|
||||
.build();
|
||||
|
||||
PdfConfig pdfConfigOcr = PdfConfig.builder()
|
||||
.hierarchy(hierarchyConfigOcr)
|
||||
.build();
|
||||
|
||||
ExtractionConfig extractionConfigOcr = ExtractionConfig.builder()
|
||||
.pdfOptions(pdfConfigOcr)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// enabled: boolean (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// kClusters: int (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// includeBbox: boolean (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// ocrCoverageThreshold: Float (default: null)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: 0.5f means "run OCR if less than 50% of page has text data"
|
||||
// - null means no OCR coverage-based triggering
|
||||
27
docs/snippets/java/config/html_output.md
Normal file
27
docs/snippets/java/config/html_output.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.HtmlOutputConfig;
|
||||
import dev.kreuzberg.HtmlTheme;
|
||||
import dev.kreuzberg.OutputFormat;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Optional;
|
||||
|
||||
public class HtmlOutput {
|
||||
public static void main(String[] args) throws Exception {
|
||||
HtmlOutputConfig htmlOutput = HtmlOutputConfig.builder()
|
||||
.withTheme(HtmlTheme.GitHub)
|
||||
.withEmbedCss(true)
|
||||
.build();
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.withOutputFormat(OutputFormat.Html)
|
||||
.withHtmlOutput(Optional.of(htmlOutput))
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFileSync(Path.of("document.pdf"), config);
|
||||
System.out.println(result.content()); // HTML with kb-* classes
|
||||
}
|
||||
}
|
||||
```
|
||||
75
docs/snippets/java/config/keyword_config.java
Normal file
75
docs/snippets/java/config/keyword_config.java
Normal file
@@ -0,0 +1,75 @@
|
||||
import com.kreuzberg.Kreuzberg;
|
||||
import com.kreuzberg.config.ExtractionConfig;
|
||||
import com.kreuzberg.config.KeywordConfig;
|
||||
import com.kreuzberg.keywords.YakeParams;
|
||||
import com.kreuzberg.keywords.RakeParams;
|
||||
import com.kreuzberg.result.ExtractionResult;
|
||||
|
||||
// Example 1: Basic YAKE configuration
|
||||
// Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
public class KeywordConfigExample {
|
||||
|
||||
public static void basicYake() throws Exception {
|
||||
ExtractionConfig config = new ExtractionConfig.Builder()
|
||||
.keywords(new KeywordConfig.Builder()
|
||||
.algorithm("yake")
|
||||
.maxKeywords(10)
|
||||
.minScore(0.0f)
|
||||
.ngramRange(1, 3)
|
||||
.language("en")
|
||||
.yakeParams(null)
|
||||
.rakeParams(null)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
System.out.println("Keywords: " + result.getKeywords());
|
||||
}
|
||||
|
||||
// Example 2: Advanced YAKE with custom parameters
|
||||
// Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
public static void advancedYake() throws Exception {
|
||||
ExtractionConfig config = new ExtractionConfig.Builder()
|
||||
.keywords(new KeywordConfig.Builder()
|
||||
.algorithm("yake")
|
||||
.maxKeywords(15)
|
||||
.minScore(0.1f)
|
||||
.ngramRange(1, 2)
|
||||
.language("en")
|
||||
.yakeParams(new YakeParams.Builder()
|
||||
.windowSize(1)
|
||||
.build())
|
||||
.rakeParams(null)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
System.out.println("Keywords: " + result.getKeywords());
|
||||
}
|
||||
|
||||
// Example 3: RAKE configuration
|
||||
// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
public static void rakeConfig() throws Exception {
|
||||
ExtractionConfig config = new ExtractionConfig.Builder()
|
||||
.keywords(new KeywordConfig.Builder()
|
||||
.algorithm("rake")
|
||||
.maxKeywords(10)
|
||||
.minScore(5.0f)
|
||||
.ngramRange(1, 3)
|
||||
.language("en")
|
||||
.yakeParams(null)
|
||||
.rakeParams(new RakeParams.Builder()
|
||||
.minWordLength(1)
|
||||
.maxWordsPerPhrase(3)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
|
||||
System.out.println("Keywords: " + result.getKeywords());
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
basicYake();
|
||||
}
|
||||
}
|
||||
4
docs/snippets/java/config/keyword_extraction_config.md
Normal file
4
docs/snippets/java/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,4 @@
|
||||
```java title="Java"
|
||||
// Note: Keyword extraction is not yet available in Java bindings
|
||||
// This feature requires the 'keywords' feature flag and is planned for a future release
|
||||
```
|
||||
11
docs/snippets/java/config/language_detection_config.md
Normal file
11
docs/snippets/java/config/language_detection_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.LanguageDetectionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.languageDetection(LanguageDetectionConfig.builder()
|
||||
.enabled(true)
|
||||
.minConfidence(0.8)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
18
docs/snippets/java/config/ocr_dpi_config.md
Normal file
18
docs/snippets/java/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.ImagePreprocessingConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.build())
|
||||
.imagePreprocessing(ImagePreprocessingConfig.builder()
|
||||
.targetDpi(300)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
|
||||
```
|
||||
15
docs/snippets/java/config/pdf_config.md
Normal file
15
docs/snippets/java/config/pdf_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.PdfConfig;
|
||||
import dev.kreuzberg.HierarchyConfig;
|
||||
import java.util.Arrays;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.pdfOptions(PdfConfig.builder()
|
||||
.extractImages(true)
|
||||
.extractMetadata(true)
|
||||
.passwords(Arrays.asList("password1", "password2"))
|
||||
.hierarchyConfig(HierarchyConfig.builder().build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
17
docs/snippets/java/config/pdf_hierarchy_config.md
Normal file
17
docs/snippets/java/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.PdfConfig;
|
||||
import dev.kreuzberg.HierarchyConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.pdfOptions(PdfConfig.builder()
|
||||
.hierarchyConfig(HierarchyConfig.builder()
|
||||
.enabled(true)
|
||||
.detectionThreshold(0.75)
|
||||
.ocrCoverageThreshold(0.8)
|
||||
.minLevel(1)
|
||||
.maxLevel(5)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
13
docs/snippets/java/config/postprocessor_config.md
Normal file
13
docs/snippets/java/config/postprocessor_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.PostProcessorConfig;
|
||||
import java.util.Arrays;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.postprocessor(PostProcessorConfig.builder()
|
||||
.enabled(true)
|
||||
.enabledProcessors(Arrays.asList("deduplication", "whitespace_normalization"))
|
||||
.disabledProcessors(Arrays.asList("mojibake_fix"))
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
7
docs/snippets/java/config/quality_processing_config.md
Normal file
7
docs/snippets/java/config/quality_processing_config.md
Normal file
@@ -0,0 +1,7 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.enableQualityProcessing(true) // Default
|
||||
.build();
|
||||
```
|
||||
18
docs/snippets/java/config/tesseract_config.md
Normal file
18
docs/snippets/java/config/tesseract_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.TesseractConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.language("eng+fra+deu")
|
||||
.tesseractConfig(TesseractConfig.builder()
|
||||
.psm(6)
|
||||
.oem(1)
|
||||
.minConfidence(0.8)
|
||||
.tesseditCharWhitelist("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?")
|
||||
.enableTableDetection(true)
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
11
docs/snippets/java/config/token_reduction_config.md
Normal file
11
docs/snippets/java/config/token_reduction_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.TokenReductionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.tokenReduction(TokenReductionConfig.builder()
|
||||
.mode("moderate")
|
||||
.preserveImportantWords(true)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
Reference in New Issue
Block a user