829 B
829 B
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts "document.pdf" through the kreuzberg bindings using a
// Tesseract-backed OCR configuration, then logs the length of the extracted
// content. Any extraction error is fatal.
func main() {
	// Several Tesseract options are passed by address, so they need
	// addressable locals rather than literals.
	var (
		segMode      int32 = 6 // presumably Tesseract's page segmentation mode — confirm against the bindings
		engineMode   int32 = 1 // presumably Tesseract's OCR engine mode — confirm against the bindings
		detectTables       = true
	)

	// Characters handed to TesseditCharWhitelist: ASCII letters, digits,
	// space and a handful of punctuation marks.
	const allowedChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?"

	// Build the configuration from the innermost struct outwards.
	tess := &kreuzberg.TesseractConfig{
		Psm:                   &segMode,
		Oem:                   &engineMode,
		MinConfidence:         0.8,
		EnableTableDetection:  &detectTables,
		TesseditCharWhitelist: allowedChars,
	}
	ocr := &kreuzberg.OcrConfig{
		Backend:         "tesseract",
		Language:        "eng+fra+deu",
		TesseractConfig: tess,
	}
	cfg := kreuzberg.ExtractionConfig{Ocr: ocr}

	// The second argument is left nil, exactly as in the original call.
	doc, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	log.Println("content length:", len(doc.Content))
}