Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,60 @@
```java title="Java"
import dev.kreuzberg.*;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.lang.foreign.ValueLayout;
import java.net.http.*;
import java.net.URI;
/**
 * Example: registering a custom OCR backend that forwards each image to a
 * cloud OCR HTTP endpoint and hands the recognized text back as a C string.
 *
 * <p>The backend is registered under the name {@code "cloud-ocr"} and then a
 * file extraction is run. The endpoint URL and API key are placeholders.
 */
public class CloudOcrExample {

    /**
     * Registers the cloud OCR backend and extracts {@code scanned.pdf}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Arena that owns the C strings returned from the callback.
        Arena callbackArena = Arena.ofAuto();
        String apiKey = "your-api-key";
        // Build the client once and reuse it for every callback invocation
        // instead of constructing a new one per image.
        HttpClient client = HttpClient.newHttpClient();

        OcrBackend cloudOcr = (imageBytes, imageLength, configJson) -> {
            try {
                // Read image bytes from native memory
                byte[] image = imageBytes.reinterpret(imageLength)
                        .toArray(ValueLayout.JAVA_BYTE);

                // Read config JSON as a null-terminated string; the length is
                // unknown up front, so the segment bounds are widened first.
                // Not consumed by this simplified example.
                String config = configJson.reinterpret(Long.MAX_VALUE)
                        .getString(0);

                // Call cloud OCR API
                HttpRequest request = HttpRequest.newBuilder()
                        .uri(URI.create("https://api.example.com/ocr"))
                        .header("Authorization", "Bearer " + apiKey)
                        .POST(HttpRequest.BodyPublishers.ofByteArray(image))
                        .build();
                HttpResponse<String> response = client.send(request,
                        HttpResponse.BodyHandlers.ofString());

                // A non-2xx body is an error payload, not OCR text.
                int status = response.statusCode();
                if (status < 200 || status >= 300) {
                    System.err.println("Cloud OCR request failed with HTTP status " + status);
                    return MemorySegment.NULL;
                }

                String text = parseTextFromResponse(response.body());

                // Return result as C string
                return callbackArena.allocateFrom(text);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so the calling thread can observe it.
                Thread.currentThread().interrupt();
                System.err.println("Cloud OCR request interrupted");
                return MemorySegment.NULL;
            } catch (Exception e) {
                // Exceptions are not propagated out of the callback; report the
                // failure and signal it with a NULL result.
                System.err.println("Cloud OCR request failed: " + e.getMessage());
                return MemorySegment.NULL;
            }
        };

        try (Arena arena = Arena.ofConfined()) {
            Kreuzberg.registerOcrBackend("cloud-ocr", cloudOcr, arena);

            // Use custom OCR backend in extraction
            // Note: Requires ExtractionConfig with OCR enabled
            ExtractionResult result = Kreuzberg.extractFileSync("scanned.pdf");
            System.out.println(result.getContent());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the recognized text from the OCR service response.
     *
     * @param json raw response body returned by the OCR endpoint
     * @return the recognized text; this simplified version returns the body unchanged
     */
    private static String parseTextFromResponse(String json) {
        // Parse JSON response and extract text field
        return json; // Simplified
    }
}
```

View File

@@ -0,0 +1,14 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ImageExtractionConfig;
// Image extraction settings, assembled on their own before being attached
// to the overall extraction configuration.
ImageExtractionConfig imageSettings = ImageExtractionConfig.builder()
        .extractImages(true)
        .targetDpi(200)
        .maxImageDimension(2048)
        // Set to false to extract images without markdown references.
        .injectPlaceholders(true)
        .autoAdjustDpi(true)
        .build();

ExtractionConfig config = ExtractionConfig.builder()
        .imageExtraction(imageSettings)
        .build();
```

View File

@@ -0,0 +1,20 @@
```java title="Java"
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ImagePreprocessingConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.TesseractConfig;
// Step 1: image preprocessing options applied before recognition.
ImagePreprocessingConfig preprocessing = ImagePreprocessingConfig.builder()
        .targetDpi(300)
        .denoise(true)
        .deskew(true)
        .contrastEnhance(true)
        .binarizationMethod("otsu")
        .build();

// Step 2: Tesseract settings carrying the preprocessing options.
TesseractConfig tesseract = TesseractConfig.builder()
        .preprocessing(preprocessing)
        .build();

// Step 3: OCR settings carrying the Tesseract settings.
OcrConfig ocr = OcrConfig.builder()
        .tesseractConfig(tesseract)
        .build();

// Step 4: top-level extraction configuration.
ExtractionConfig config = ExtractionConfig.builder()
        .ocr(ocr)
        .build();
```

View File

@@ -0,0 +1 @@
EasyOCR is only available in Python.

View File

@@ -0,0 +1,38 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.types.OcrElement;
import java.io.IOException;
/**
 * Example: run OCR with the {@code "paddle-ocr"} backend and print every
 * OCR element returned for the document.
 */
public class Main {

    /**
     * Extracts {@code scanned.pdf} and prints each OCR element, if any.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            OcrConfig ocr = OcrConfig.builder()
                    .backend("paddle-ocr")
                    .language("en")
                    .build();
            ExtractionConfig config = ExtractionConfig.builder()
                    .ocr(ocr)
                    .build();

            ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);

            var elements = result.getOcrElements();
            if (elements == null) {
                // Nothing to report when no OCR elements are present.
                return;
            }
            for (OcrElement element : elements) {
                printElement(element);
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }

    /**
     * Prints text, recognition confidence, geometry and (when present)
     * rotation of one OCR element, followed by a blank line.
     *
     * @param element the OCR element to print
     */
    private static void printElement(OcrElement element) {
        System.out.printf("Text: %s%n", element.getText());
        System.out.printf("Confidence: %.2f%n", element.getConfidence().getRecognition());
        System.out.printf("Geometry: %s%n", element.getGeometry());

        var rotation = element.getRotation();
        if (rotation != null) {
            System.out.printf("Rotation: %.1f°%n", rotation.getAngle());
        }
        System.out.println();
    }
}
```

View File

@@ -0,0 +1,26 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import java.io.IOException;
/**
 * Example: extract text from a scanned PDF using the {@code "tesseract"}
 * backend with the {@code "eng"} language.
 */
public class Main {

    /**
     * Runs the extraction and prints the resulting content, or an error
     * message on failure.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // Build the OCR settings first, then wrap them in the extraction config.
            OcrConfig tesseractEnglish = OcrConfig.builder()
                    .backend("tesseract")
                    .language("eng")
                    .build();
            ExtractionConfig config = ExtractionConfig.builder()
                    .ocr(tesseractEnglish)
                    .build();

            ExtractionResult extracted = Kreuzberg.extractFile("scanned.pdf", config);
            System.out.println(extracted.getContent());
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
```

View File

@@ -0,0 +1,16 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
// OCR settings: Tesseract backend.
OcrConfig ocr = OcrConfig.builder()
        .backend("tesseract")
        .build();

// Extraction config with the force-OCR flag enabled.
ExtractionConfig config = ExtractionConfig.builder()
        .ocr(ocr)
        .forceOcr(true)
        .build();

// Extract the document and print its content.
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
System.out.println(result.getContent());
```

View File

@@ -0,0 +1,16 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
// OCR settings: Tesseract with three languages joined by '+'.
OcrConfig multilingualOcr = OcrConfig.builder()
        .backend("tesseract")
        .language("eng+deu+fra")
        .build();

ExtractionConfig config = ExtractionConfig.builder()
        .ocr(multilingualOcr)
        .build();

// Extract the document and print its content.
ExtractionResult result = Kreuzberg.extractFile("multilingual.pdf", config);
System.out.println(result.getContent());
```

View File

@@ -0,0 +1,27 @@
```java title="Java"
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import java.io.IOException;
/**
 * Example: extract text from a scanned PDF using the {@code "paddle-ocr"}
 * backend with the {@code "en"} language.
 */
public class Main {

    /**
     * Runs the extraction and prints the resulting content, or an error
     * message on failure.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            OcrConfig paddleOcr = OcrConfig.builder()
                    .backend("paddle-ocr")
                    .language("en")
                    // For max accuracy, additionally chain:
                    // .paddleOcrConfig(PaddleOcrConfig.builder().modelTier("server").build())
                    .build();
            ExtractionConfig config = ExtractionConfig.builder()
                    .ocr(paddleOcr)
                    .build();

            ExtractionResult extracted = Kreuzberg.extractFile("scanned.pdf", config);
            System.out.println(extracted.getContent());
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
```