Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
import dev.kreuzberg.*;

// Enable chunking (chunkSize 500, overlap 50) together with page extraction,
// then print a short preview of every chunk along with its page range.
var config = ExtractionConfig.builder()
    .chunking(ChunkingConfig.builder()
        .chunkSize(500)
        .overlap(50)
        .build())
    .pages(PageConfig.builder()
        .extractPages(true)
        .build())
    .build();

var result = Kreuzberg.extractFileSync("document.pdf", config);

if (result.chunks() != null) {
    for (var chunk : result.chunks()) {
        var firstPage = chunk.metadata().firstPage();
        var lastPage = chunk.metadata().lastPage();
        // Require both ends of the range so the label never reads "Pages N-null".
        if (firstPage != null && lastPage != null) {
            var pageRange = firstPage.equals(lastPage)
                ? "Page " + firstPage
                : "Pages " + firstPage + "-" + lastPage;
            // Clamp the preview length: substring(0, 50) throws on text shorter than 50 chars.
            var text = chunk.text();
            var preview = text.substring(0, Math.min(50, text.length()));
            System.out.println("Chunk: " + preview + "... (" + pageRange + ")");
        }
    }
}

View File

@@ -0,0 +1,36 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.PageConfig;
import java.nio.file.Path;
import java.util.Optional;
// Chunking settings: max 500 characters per chunk with an overlap of 50.
ChunkingConfig chunkingConfig = ChunkingConfig.builder()
    .withMaxCharacters(500L)
    .withOverlap(50L)
    .build();
// Page settings: page extraction switched on.
PageConfig pageConfig = PageConfig.builder()
    .withExtractPages(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .withChunking(Optional.of(chunkingConfig))
    .withPages(Optional.of(pageConfig))
    .build();

var result = Kreuzberg.extractFileSync(Path.of("document.pdf"), config);
var extractedChunks = result.chunks();
if (extractedChunks != null) {
    for (var chunk : extractedChunks) {
        Long firstPage = chunk.metadata().firstPage();
        Long lastPage = chunk.metadata().lastPage();
        // Chunks without a complete page range are not printed.
        if (firstPage == null || lastPage == null) {
            continue;
        }
        String pageRange;
        if (firstPage.equals(lastPage)) {
            pageRange = "Page " + firstPage;
        } else {
            pageRange = "Pages " + firstPage + "-" + lastPage;
        }
        // Preview is at most the first 50 characters of the chunk content.
        String content = chunk.content();
        int previewLength = Math.min(50, content.length());
        System.out.println("Chunk: " + content.substring(0, previewLength) + "... (" + pageRange + ")");
    }
}
```

View File

@@ -0,0 +1,18 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
// Embedding settings: the "all-minilm-l6-v2" preset, normalization on, batch size 32.
EmbeddingConfig embeddingConfig = EmbeddingConfig.builder()
    .model(EmbeddingModelType.preset("all-minilm-l6-v2"))
    .normalize(true)
    .batchSize(32)
    .build();
// Chunking settings: maxChars 1000, maxOverlap 200, using the embedding settings above.
ChunkingConfig chunkingConfig = ChunkingConfig.builder()
    .maxChars(1000)
    .maxOverlap(200)
    .embedding(embeddingConfig)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunkingConfig)
    .build();
```

View File

@@ -0,0 +1,35 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import java.util.List;
// Embedding settings: the "all-mpnet-base-v2" preset, normalization on, batch size 16.
EmbeddingConfig embeddingConfig = EmbeddingConfig.builder()
    .model(EmbeddingModelType.preset("all-mpnet-base-v2"))
    .normalize(true)
    .batchSize(16)
    .build();
ChunkingConfig chunkingConfig = ChunkingConfig.builder()
    .maxChars(500)
    .maxOverlap(50)
    .embedding(embeddingConfig)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunkingConfig)
    .build();

try {
    ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);
    // Treat a missing chunk list as empty.
    List<Object> chunks = result.getChunks();
    if (chunks == null) {
        chunks = List.of();
    }
    System.out.println("Found " + chunks.size() + " chunks for RAG pipeline");

    // Preview at most the first three chunks, 80 characters each.
    int previewCount = Math.min(3, chunks.size());
    for (int i = 0; i < previewCount; i++) {
        String chunkText = chunks.get(i).toString();
        String preview = chunkText.substring(0, Math.min(80, chunkText.length()));
        System.out.println("Chunk " + i + ": " + preview + "...");
    }
} catch (Exception ex) {
    System.err.println("RAG extraction failed: " + ex.getMessage());
}
```

View File

@@ -0,0 +1,38 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import java.util.List;
// Chunk the document (maxChars 512, maxOverlap 50) and request embeddings from the
// "balanced" preset, then print a preview and the embedding size for each chunk.
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(ChunkingConfig.builder()
        .maxChars(512)
        .maxOverlap(50)
        .embedding(EmbeddingConfig.builder()
            .model(EmbeddingModelType.preset("balanced"))
            .normalize(true)
            .batchSize(32)
            .showDownloadProgress(false)
            .build())
        .build())
    .build();

ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
// Treat a missing chunk list as empty.
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();

for (int index = 0; index < chunks.size(); index++) {
    Object chunk = chunks.get(index);
    String chunkId = "doc_chunk_" + index;
    // Render the chunk once and show at most its first 50 characters.
    String chunkText = chunk.toString();
    System.out.println("Chunk " + chunkId + ": " + chunkText.substring(0, Math.min(50, chunkText.length())));

    // A wildcard cast avoids the unchecked Map<String, Object> cast; only get() is needed.
    if (chunk instanceof java.util.Map<?, ?>) {
        Object embedding = ((java.util.Map<?, ?>) chunk).get("embedding");
        if (embedding != null) {
            System.out.println(" Embedding dimensions: " + ((float[]) embedding).length);
        }
    }
}
```

View File

@@ -0,0 +1,15 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.KeywordConfig;
import dev.kreuzberg.KeywordAlgorithm;
// Keyword settings: YAKE algorithm, up to 10 keywords, minimum score 0.3,
// n-gram range 1 to 3, language "en".
KeywordConfig keywordConfig = KeywordConfig.builder()
    .algorithm(KeywordAlgorithm.YAKE)
    .maxKeywords(10)
    .minScore(0.3)
    .ngramRange(1, 3)
    .language("en")
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .keywords(keywordConfig)
    .build();
```

View File

@@ -0,0 +1,30 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.KeywordConfig;
import dev.kreuzberg.KeywordAlgorithm;
import java.util.List;
import java.util.Map;
// Extract keywords with YAKE (up to 10, minimum score 0.3) and print each
// keyword with its score formatted to three decimals.
ExtractionConfig config = ExtractionConfig.builder()
    .keywords(KeywordConfig.builder()
        .algorithm(KeywordAlgorithm.YAKE)
        .maxKeywords(10)
        .minScore(0.3)
        .build())
    .build();

ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);
// Treat missing metadata as empty.
Map<String, Object> metadata = result.getMetadata() != null ? result.getMetadata() : Map.of();

// Inspect the value's runtime type instead of using an unchecked cast; this also
// covers a "keywords" key that is present but mapped to null.
Object rawKeywords = metadata.get("keywords");
if (rawKeywords instanceof List<?>) {
    for (Object entry : (List<?>) rawKeywords) {
        if (!(entry instanceof Map<?, ?>)) {
            continue;
        }
        Map<?, ?> kw = (Map<?, ?>) entry;
        Object text = kw.get("text");
        Object score = kw.get("score");
        // Entries without a numeric score are skipped rather than failing with a NullPointerException.
        if (score instanceof Number) {
            double value = ((Number) score).doubleValue();
            System.out.println(text + ": " + String.format("%.3f", value));
        }
    }
}
```

View File

@@ -0,0 +1,13 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.LanguageDetectionConfig;
import java.math.BigDecimal;
// Language detection settings: enabled, minimum confidence 0.8, single-language mode.
LanguageDetectionConfig languageDetection = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(new BigDecimal("0.8"))
    .detectMultiple(false)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(languageDetection)
    .build();
```

View File

@@ -0,0 +1,35 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.LanguageDetectionConfig;
import java.math.BigDecimal;
import java.util.List;
// Language detection settings: enabled, minimum confidence 0.8, multiple languages allowed.
LanguageDetectionConfig languageDetection = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(new BigDecimal("0.8"))
    .detectMultiple(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(languageDetection)
    .build();

try {
    ExtractionResult result = Kreuzberg.extractFile("multilingual_document.pdf", config);
    // Treat a missing language list as empty.
    List<String> languages = result.getDetectedLanguages();
    if (languages == null) {
        languages = List.of();
    }

    if (languages.isEmpty()) {
        System.out.println("No languages detected");
    } else {
        String joined = String.join(", ", languages);
        System.out.println("Detected " + languages.size() + " language(s): " + joined);
    }

    int contentLength = result.getContent().length();
    System.out.println("Total content: " + contentLength + " characters");
    System.out.println("MIME type: " + result.getMimeType());
} catch (Exception ex) {
    System.err.println("Processing failed: " + ex.getMessage());
}
```

View File

@@ -0,0 +1,7 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
// Build an ExtractionConfig with the enableQualityProcessing option set to true.
ExtractionConfig config = ExtractionConfig.builder()
.enableQualityProcessing(true)
.build();
```

View File

@@ -0,0 +1,21 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.util.Map;
// Extract with quality processing enabled and report the resulting quality score.
ExtractionConfig config = ExtractionConfig.builder()
    .enableQualityProcessing(true)
    .build();
ExtractionResult result = Kreuzberg.extractFile("scanned_document.pdf", config);

// A missing score is reported as 0.0.
double qualityScore = 0.0;
if (result.getQualityScore() != null) {
    qualityScore = result.getQualityScore();
}

// Scores below 0.5 get a warning plus a remediation hint.
if (qualityScore < 0.5) {
    System.out.printf("Warning: Low quality extraction (%.2f)%n", qualityScore);
    System.out.println("Consider re-scanning with higher DPI or adjusting OCR settings");
} else {
    System.out.printf("Quality score: %.2f%n", qualityScore);
}
```

View File

@@ -0,0 +1,13 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.TokenReductionConfig;
// Token reduction settings: "moderate" mode, keep markdown and code, language hint "eng".
TokenReductionConfig tokenReduction = TokenReductionConfig.builder()
    .mode("moderate")
    .preserveMarkdown(true)
    .preserveCode(true)
    .languageHint("eng")
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .tokenReduction(tokenReduction)
    .build();
```

View File

@@ -0,0 +1,33 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.TokenReductionConfig;
import java.util.Map;
// Token reduction settings: "moderate" mode with markdown preserved.
TokenReductionConfig tokenReduction = TokenReductionConfig.builder()
    .mode("moderate")
    .preserveMarkdown(true)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .tokenReduction(tokenReduction)
    .build();

ExtractionResult result = Kreuzberg.extractFile("verbose_document.pdf", config);
// Treat missing metadata as empty.
Map<String, Object> metadata = result.getMetadata();
if (metadata == null) {
    metadata = Map.of();
}

// Each statistic falls back to zero when its key is absent from the metadata.
int original = ((Number) metadata.getOrDefault("original_token_count", 0)).intValue();
int reduced = ((Number) metadata.getOrDefault("token_count", 0)).intValue();
double ratio = ((Number) metadata.getOrDefault("token_reduction_ratio", 0.0)).doubleValue();

System.out.println("Reduced from " + original + " to " + reduced + " tokens");
System.out.printf("Reduction: %.1f%%%n", ratio * 100);
```

View File

@@ -0,0 +1,67 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Example of turning an extracted document into records for a vector store:
 * the document is chunked with embeddings enabled, each chunk becomes a
 * {@link VectorRecord}, and the records are handed to a storage routine.
 */
public class VectorDatabaseIntegration {

    /** One chunk prepared for storage: identifier, embedding vector, text, and string metadata. */
    public static class VectorRecord {
        public String id;
        public float[] embedding;
        public String content;
        public Map<String, String> metadata;
    }

    /**
     * Extracts {@code documentPath} with chunking and embeddings enabled and builds one
     * {@link VectorRecord} per chunk, then passes the records to the storage routine.
     *
     * @param documentPath path of the document to extract
     * @param documentId   identifier used as the prefix of every record id and stored
     *                     under the {@code document_id} metadata key
     * @return the records built from the extracted chunks, in chunk order
     * @throws Exception if extraction fails
     */
    public static List<VectorRecord> extractAndVectorize(String documentPath, String documentId) throws Exception {
        ExtractionConfig config = ExtractionConfig.builder()
            .chunking(ChunkingConfig.builder()
                .maxChars(512)
                .maxOverlap(50)
                .embedding(EmbeddingConfig.builder()
                    .model(EmbeddingModelType.preset("balanced"))
                    .normalize(true)
                    .batchSize(32)
                    .build())
                .build())
            .build();

        ExtractionResult result = Kreuzberg.extractFile(documentPath, config);
        // Treat a missing chunk list as empty.
        List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();

        List<VectorRecord> vectorRecords = new java.util.ArrayList<>(chunks.size());
        for (int index = 0; index < chunks.size(); index++) {
            // The original loop referenced `chunk` without ever declaring it.
            Object chunk = chunks.get(index);

            VectorRecord record = new VectorRecord();
            record.id = documentId + "_chunk_" + index;
            record.metadata = new HashMap<>();
            record.metadata.put("document_id", documentId);
            record.metadata.put("chunk_index", String.valueOf(index));

            // A wildcard cast avoids an unchecked Map<String, Object> cast; only get() is needed.
            if (chunk instanceof Map<?, ?>) {
                Map<?, ?> chunkMap = (Map<?, ?>) chunk;
                record.content = (String) chunkMap.get("content");
                record.embedding = (float[]) chunkMap.get("embedding");
                // A chunk map without "content" leaves content null; skip the length entry then.
                if (record.content != null) {
                    record.metadata.put("content_length", String.valueOf(record.content.length()));
                }
            }
            vectorRecords.add(record);
        }

        storeInVectorDatabase(vectorRecords);
        return vectorRecords;
    }

    /**
     * Placeholder storage step: prints a summary line for every record that carries
     * a non-empty embedding. Records without an embedding are skipped.
     */
    private static void storeInVectorDatabase(List<VectorRecord> records) {
        for (VectorRecord record : records) {
            if (record.embedding != null && record.embedding.length > 0) {
                // Content may be null when the chunk map had no "content" entry.
                int contentLength = record.content == null ? 0 : record.content.length();
                System.out.println("Storing " + record.id + ": " + contentLength
                    + " chars, " + record.embedding.length + " dims");
            }
        }
    }
}
```