This commit is contained in:
25
docs/snippets/java/metadata/PageBoundaries.md
Normal file
25
docs/snippets/java/metadata/PageBoundaries.md
Normal file
@@ -0,0 +1,25 @@
|
||||
import dev.kreuzberg.*;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
|
||||
|
||||
// Extract the document; page boundary data is read from the result metadata below.
var result = Kreuzberg.extractFileSync("document.pdf");

// Both the page structure and its boundary list are checked for null before use.
if (result.metadata().pages() != null &&
    result.metadata().pages().boundaries() != null) {

    // Boundary offsets are applied to the UTF-8 bytes of the content, not to char indices.
    var contentBytes = result.content().getBytes(StandardCharsets.UTF_8);

    // Show at most the first three pages; an unclamped subList(0, 3) throws
    // IndexOutOfBoundsException for documents with fewer than three boundaries.
    var boundaries = result.metadata().pages().boundaries();
    for (var boundary : boundaries.subList(0, Math.min(3, boundaries.size()))) {
        var pageBytes = Arrays.copyOfRange(
            contentBytes,
            boundary.byteStart(),
            boundary.byteEnd()
        );
        var pageText = new String(pageBytes, StandardCharsets.UTF_8);

        System.out.println("Page " + boundary.pageNumber() + ":");
        System.out.println(" Byte range: " + boundary.byteStart() +
            "-" + boundary.byteEnd());
        // Clamp the preview so pages shorter than 100 characters do not throw.
        var previewLength = Math.min(100, pageText.length());
        System.out.println(" Preview: " + pageText.substring(0, previewLength) + "...");
    }

}
|
||||
18
docs/snippets/java/metadata/PageTrackingBasic.md
Normal file
18
docs/snippets/java/metadata/PageTrackingBasic.md
Normal file
@@ -0,0 +1,18 @@
|
||||
import dev.kreuzberg.*;
|
||||
|
||||
// Build an extraction config with page extraction switched on via PageConfig.
var config = ExtractionConfig.builder()
    .pages(PageConfig.builder()
        .extractPages(true)
        .build())
    .build();

var result = Kreuzberg.extractFileSync("document.pdf", config);

// The snippet treats a null pages() as "no per-page data" and prints nothing.
if (result.pages() != null) {
    for (var page : result.pages()) {
        // One summary per page: content length plus table and image counts.
        System.out.println("Page " + page.pageNumber() + ":");
        System.out.println(" Content: " + page.content().length() + " chars");
        System.out.println(" Tables: " + page.tables().size());
        System.out.println(" Images: " + page.images().size());
    }
}
|
||||
12
docs/snippets/java/metadata/language_detection.md
Normal file
12
docs/snippets/java/metadata/language_detection.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.LanguageDetectionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.languageDetection(LanguageDetectionConfig.builder()
|
||||
.enabled(true)
|
||||
.minConfidence(0.9)
|
||||
.detectMultiple(true)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
@@ -0,0 +1,17 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.LanguageDetectionConfig;
|
||||
|
||||
// Language detection settings: enabled, with a 0.8 minimum confidence.
LanguageDetectionConfig languageDetection = LanguageDetectionConfig.builder()
    .enabled(true)
    .minConfidence(0.8)
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .languageDetection(languageDetection)
    .build();

// Run the extraction with the config above and print whatever languages were reported.
ExtractionResult result = Kreuzberg.extractFile("multilingual_document.pdf", config);

System.out.println("Detected languages: " + result.getDetectedLanguages());
|
||||
```
|
||||
111
docs/snippets/java/metadata/metadata.md
Normal file
111
docs/snippets/java/metadata/metadata.md
Normal file
@@ -0,0 +1,111 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.Metadata;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionResult result = Kreuzberg.extractFileSync("document.pdf");
|
||||
|
||||
// Metadata is flat — format-specific fields are at the top level
|
||||
Metadata metadata = result.getMetadata();
|
||||
metadata.getTitle().ifPresent(t -> System.out.println("Title: " + t));
|
||||
metadata.getAuthors().ifPresent(a -> System.out.println("Authors: " + String.join(", ", a)));
|
||||
|
||||
// Format-specific fields are in the additional map
|
||||
Map<String, Object> extra = metadata.getAdditional();
|
||||
if (extra.get("page_count") != null) {
|
||||
System.out.println("Pages: " + extra.get("page_count"));
|
||||
}
|
||||
|
||||
// Access HTML metadata
|
||||
ExtractionResult htmlResult = Kreuzberg.extractFileSync("page.html");
|
||||
Metadata htmlMeta = htmlResult.getMetadata();
|
||||
htmlMeta.getTitle().ifPresent(t -> System.out.println("Title: " + t));
|
||||
|
||||
Map<String, Object> htmlExtra = htmlMeta.getAdditional();
|
||||
String description = (String) htmlExtra.get("description");
|
||||
if (description != null) {
|
||||
System.out.println("Description: " + description);
|
||||
}
|
||||
|
||||
// Access keywords as array
|
||||
htmlMeta.getKeywords().ifPresent(keywords ->
|
||||
System.out.println("Keywords: " + keywords));
|
||||
|
||||
// Access canonical URL (renamed from canonical)
|
||||
String canonicalUrl = (String) htmlExtra.get("canonical_url");
|
||||
if (canonicalUrl != null) {
|
||||
System.out.println("Canonical URL: " + canonicalUrl);
|
||||
}
|
||||
|
||||
// Access Open Graph fields from map
|
||||
@SuppressWarnings("unchecked")
|
||||
Map<String, String> openGraph = (Map<String, String>) htmlExtra.get("open_graph");
|
||||
if (openGraph != null) {
|
||||
System.out.println("Open Graph Image: " + openGraph.get("image"));
|
||||
System.out.println("Open Graph Title: " + openGraph.get("title"));
|
||||
System.out.println("Open Graph Type: " + openGraph.get("type"));
|
||||
}
|
||||
|
||||
// Access Twitter Card fields from map
|
||||
@SuppressWarnings("unchecked")
|
||||
Map<String, String> twitterCard = (Map<String, String>) htmlExtra.get("twitter_card");
|
||||
if (twitterCard != null) {
|
||||
System.out.println("Twitter Card Type: " + twitterCard.get("card"));
|
||||
System.out.println("Twitter Creator: " + twitterCard.get("creator"));
|
||||
}
|
||||
|
||||
// Access new fields
|
||||
htmlMeta.getLanguage().ifPresent(l -> System.out.println("Language: " + l));
|
||||
|
||||
String textDirection = (String) htmlExtra.get("text_direction");
|
||||
if (textDirection != null) {
|
||||
System.out.println("Text Direction: " + textDirection);
|
||||
}
|
||||
|
||||
// Access headers
|
||||
@SuppressWarnings("unchecked")
|
||||
List<Map<String, Object>> headers = (List<Map<String, Object>>) htmlExtra.get("headers");
|
||||
if (headers != null) {
|
||||
headers.stream()
|
||||
.map(h -> h.get("text"))
|
||||
.forEach(text -> System.out.print(text + ", "));
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
// Access links
|
||||
@SuppressWarnings("unchecked")
|
||||
List<Map<String, Object>> links = (List<Map<String, Object>>) htmlExtra.get("links");
|
||||
if (links != null) {
|
||||
for (Map<String, Object> link : links) {
|
||||
System.out.println("Link: " + link.get("href") + " (" + link.get("text") + ")");
|
||||
}
|
||||
}
|
||||
|
||||
// Access images
|
||||
@SuppressWarnings("unchecked")
|
||||
List<Map<String, Object>> images = (List<Map<String, Object>>) htmlExtra.get("images");
|
||||
if (images != null) {
|
||||
for (Map<String, Object> image : images) {
|
||||
System.out.println("Image: " + image.get("src"));
|
||||
}
|
||||
}
|
||||
|
||||
// Access structured data
|
||||
@SuppressWarnings("unchecked")
|
||||
List<Map<String, Object>> structuredData = (List<Map<String, Object>>) htmlExtra.get("structured_data");
|
||||
if (structuredData != null) {
|
||||
System.out.println("Structured data items: " + structuredData.size());
|
||||
}
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
27
docs/snippets/java/metadata/tables.md
Normal file
27
docs/snippets/java/metadata/tables.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import dev.kreuzberg.Table;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
|
||||
|
||||
for (Table table : result.getTables()) {
|
||||
System.out.println("Table with " + table.cells().size() + " rows");
|
||||
System.out.println(table.markdown());
|
||||
|
||||
for (List<String> row : table.cells()) {
|
||||
System.out.println(row);
|
||||
}
|
||||
}
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
18
docs/snippets/java/metadata/vector_database_integration.md
Normal file
18
docs/snippets/java/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ChunkingConfig;
|
||||
|
||||
// Chunking settings: maxChars 512, maxOverlap 50, and the "balanced" embedding option.
ChunkingConfig chunking = ChunkingConfig.builder()
    .maxChars(512)
    .maxOverlap(50)
    .embedding("balanced")
    .build();

// Attach the chunking settings to the extraction config.
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunking)
    .build();

ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);

// Report how many characters of content were extracted.
int characterCount = result.getContent().length();
System.out.println("Extracted content: " + characterCount + " characters");
|
||||
```
|
||||
Reference in New Issue
Block a user