This commit is contained in:
60
docs/snippets/java/ocr/cloud_ocr_backend.md
Normal file
60
docs/snippets/java/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.*;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.lang.foreign.MemorySegment;
|
||||
import java.lang.foreign.ValueLayout;
|
||||
import java.net.http.*;
|
||||
import java.net.URI;
|
||||
|
||||
public class CloudOcrExample {
|
||||
public static void main(String[] args) {
|
||||
Arena callbackArena = Arena.ofAuto();
|
||||
String apiKey = "your-api-key";
|
||||
|
||||
OcrBackend cloudOcr = (imageBytes, imageLength, configJson) -> {
|
||||
try {
|
||||
// Read image bytes from native memory
|
||||
byte[] image = imageBytes.reinterpret(imageLength)
|
||||
.toArray(ValueLayout.JAVA_BYTE);
|
||||
|
||||
// Read config JSON
|
||||
String config = configJson.reinterpret(Long.MAX_VALUE)
|
||||
.getString(0);
|
||||
|
||||
// Call cloud OCR API
|
||||
HttpClient client = HttpClient.newHttpClient();
|
||||
HttpRequest request = HttpRequest.newBuilder()
|
||||
.uri(URI.create("https://api.example.com/ocr"))
|
||||
.header("Authorization", "Bearer " + apiKey)
|
||||
.POST(HttpRequest.BodyPublishers.ofByteArray(image))
|
||||
.build();
|
||||
|
||||
HttpResponse<String> response = client.send(request,
|
||||
HttpResponse.BodyHandlers.ofString());
|
||||
|
||||
String text = parseTextFromResponse(response.body());
|
||||
|
||||
// Return result as C string
|
||||
return callbackArena.allocateFrom(text);
|
||||
} catch (Exception e) {
|
||||
return MemorySegment.NULL;
|
||||
}
|
||||
};
|
||||
|
||||
try (Arena arena = Arena.ofConfined()) {
|
||||
Kreuzberg.registerOcrBackend("cloud-ocr", cloudOcr, arena);
|
||||
|
||||
// Use custom OCR backend in extraction
|
||||
// Note: Requires ExtractionConfig with OCR enabled
|
||||
ExtractionResult result = Kreuzberg.extractFileSync("scanned.pdf");
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
private static String parseTextFromResponse(String json) {
|
||||
// Parse JSON response and extract text field
|
||||
return json; // Simplified
|
||||
}
|
||||
}
|
||||
```
|
||||
14
docs/snippets/java/ocr/image_extraction.md
Normal file
14
docs/snippets/java/ocr/image_extraction.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ImageExtractionConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.imageExtraction(ImageExtractionConfig.builder()
|
||||
.extractImages(true)
|
||||
.targetDpi(200)
|
||||
.maxImageDimension(2048)
|
||||
.injectPlaceholders(true) // set to false to extract images without markdown references
|
||||
.autoAdjustDpi(true)
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
20
docs/snippets/java/ocr/image_preprocessing.md
Normal file
20
docs/snippets/java/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.ImagePreprocessingConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.TesseractConfig;
|
||||
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.tesseractConfig(TesseractConfig.builder()
|
||||
.preprocessing(ImagePreprocessingConfig.builder()
|
||||
.targetDpi(300)
|
||||
.denoise(true)
|
||||
.deskew(true)
|
||||
.contrastEnhance(true)
|
||||
.binarizationMethod("otsu")
|
||||
.build())
|
||||
.build())
|
||||
.build())
|
||||
.build();
|
||||
```
|
||||
1
docs/snippets/java/ocr/ocr_easyocr.md
Normal file
1
docs/snippets/java/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1 @@
|
||||
EasyOCR is only available in Python.
|
||||
38
docs/snippets/java/ocr/ocr_elements.md
Normal file
38
docs/snippets/java/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import dev.kreuzberg.types.OcrElement;
|
||||
import java.io.IOException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("paddle-ocr")
|
||||
.language("en")
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
|
||||
|
||||
if (result.getOcrElements() != null) {
|
||||
for (OcrElement element : result.getOcrElements()) {
|
||||
System.out.printf("Text: %s%n", element.getText());
|
||||
System.out.printf("Confidence: %.2f%n", element.getConfidence().getRecognition());
|
||||
System.out.printf("Geometry: %s%n", element.getGeometry());
|
||||
if (element.getRotation() != null) {
|
||||
System.out.printf("Rotation: %.1f°%n", element.getRotation().getAngle());
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
26
docs/snippets/java/ocr/ocr_extraction.md
Normal file
26
docs/snippets/java/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import java.io.IOException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("tesseract")
|
||||
.language("eng")
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
|
||||
System.out.println(result.getContent());
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
16
docs/snippets/java/ocr/ocr_force_all_pages.md
Normal file
16
docs/snippets/java/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
|
||||
// OCR settings selecting the "tesseract" backend.
OcrConfig ocr = OcrConfig.builder()
    .backend("tesseract")
    .build();

// forceOcr(true) is set on the top-level config alongside the OCR settings.
ExtractionConfig config = ExtractionConfig.builder()
    .ocr(ocr)
    .forceOcr(true)
    .build();

ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
System.out.println(result.getContent());
|
||||
```
|
||||
16
docs/snippets/java/ocr/ocr_multi_language.md
Normal file
16
docs/snippets/java/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
|
||||
// Several languages are passed to the backend as one "+"-joined string.
OcrConfig ocr = OcrConfig.builder()
    .backend("tesseract")
    .language("eng+deu+fra")
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .ocr(ocr)
    .build();

ExtractionResult result = Kreuzberg.extractFile("multilingual.pdf", config);
System.out.println(result.getContent());
|
||||
```
|
||||
27
docs/snippets/java/ocr/ocr_paddleocr.md
Normal file
27
docs/snippets/java/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```java title="Java"
|
||||
import dev.kreuzberg.Kreuzberg;
|
||||
import dev.kreuzberg.ExtractionResult;
|
||||
import dev.kreuzberg.KreuzbergException;
|
||||
import dev.kreuzberg.ExtractionConfig;
|
||||
import dev.kreuzberg.OcrConfig;
|
||||
import java.io.IOException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
ExtractionConfig config = ExtractionConfig.builder()
|
||||
.ocr(OcrConfig.builder()
|
||||
.backend("paddle-ocr")
|
||||
.language("en")
|
||||
// .paddleOcrConfig(PaddleOcrConfig.builder().modelTier("server").build()) // for max accuracy
|
||||
.build())
|
||||
.build();
|
||||
|
||||
ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
|
||||
System.out.println(result.getContent());
|
||||
} catch (IOException | KreuzbergException e) {
|
||||
System.err.println("Extraction failed: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user