// Example: extract a document with chunking and per-chunk embeddings configured, then print each chunk.
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import java.util.List;
// Embedding settings applied to each chunk: the "balanced" model preset,
// with the normalize flag on, a batch size of 32, and download progress off.
EmbeddingConfig embeddingConfig = EmbeddingConfig.builder()
    .model(EmbeddingModelType.preset("balanced"))
    .normalize(true)
    .batchSize(32)
    .showDownloadProgress(false)
    .build();

// Chunking settings: maxChars of 512 and maxOverlap of 50, carrying the
// embedding settings built above.
ChunkingConfig chunkingConfig = ChunkingConfig.builder()
    .maxChars(512)
    .maxOverlap(50)
    .embedding(embeddingConfig)
    .build();

// Top-level extraction configuration; chunking (and therefore embedding)
// is the only option set here.
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunkingConfig)
    .build();
// Extract "document.pdf" using the `config` defined earlier in this snippet.
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);

// Fall back to an empty list when no chunks are returned, so the loop below
// needs no separate null check.
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();

for (int index = 0; index < chunks.size(); index++) {
    Object chunk = chunks.get(index);
    String chunkId = "doc_chunk_" + index;

    // Render the chunk once (String.valueOf also tolerates a null element)
    // and show at most its first 50 characters.
    String chunkText = String.valueOf(chunk);
    String preview = chunkText.substring(0, Math.min(50, chunkText.length()));
    System.out.println("Chunk " + chunkId + ": " + preview);

    if (chunk instanceof java.util.Map) {
        // A wildcard view is enough for a lookup and avoids an unchecked cast.
        Object embedding = ((java.util.Map<?, ?>) chunk).get("embedding");

        // Determine the vector length without assuming one concrete
        // representation: any array (float[], double[], ...) or any
        // collection is accepted; anything else is skipped rather than
        // failing with a ClassCastException.
        int dimensions = -1;
        if (embedding != null && embedding.getClass().isArray()) {
            dimensions = java.lang.reflect.Array.getLength(embedding);
        } else if (embedding instanceof java.util.Collection) {
            dimensions = ((java.util.Collection<?>) embedding).size();
        }

        if (dimensions >= 0) {
            System.out.println(" Embedding dimensions: " + dimensions);
        }
    }
}