This commit is contained in:
25
docs/snippets/go/ocr/cloud_ocr_backend.md
Normal file
25
docs/snippets/go/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
// The Go binding does not currently expose plugin OCR backend registration.
|
||||
// Use one of the built-in backends ("tesseract", "paddle-ocr", or VLM via "vlm").
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng",
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
32
docs/snippets/go/ocr/image_extraction.md
Normal file
32
docs/snippets/go/ocr/image_extraction.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
extractImages := true
|
||||
injectPlaceholders := true
|
||||
autoAdjustDpi := true
|
||||
targetDpi := int32(200)
|
||||
maxDim := int32(2048)
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
Images: &kreuzberg.ImageExtractionConfig{
|
||||
ExtractImages: &extractImages,
|
||||
TargetDpi: &targetDpi,
|
||||
MaxImageDimension: &maxDim,
|
||||
InjectPlaceholders: &injectPlaceholders, // set to false to extract images without markdown references
|
||||
AutoAdjustDpi: &autoAdjustDpi,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
36
docs/snippets/go/ocr/image_preprocessing.md
Normal file
36
docs/snippets/go/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
targetDpi := int32(300)
|
||||
deskew := true
|
||||
binarization := "otsu"
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
TesseractConfig: &kreuzberg.TesseractConfig{
|
||||
Preprocessing: &kreuzberg.ImagePreprocessingConfig{
|
||||
TargetDpi: &targetDpi,
|
||||
Denoise: true,
|
||||
Deskew: &deskew,
|
||||
ContrastEnhance: true,
|
||||
BinarizationMethod: &binarization,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
1
docs/snippets/go/ocr/ocr_easyocr.md
Normal file
1
docs/snippets/go/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1 @@
|
||||
EasyOCR is only available in Python.
|
||||
34
docs/snippets/go/ocr/ocr_elements.md
Normal file
34
docs/snippets/go/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
cfg := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "paddle-ocr",
|
||||
Language: "en",
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
for _, element := range result.OcrElements {
|
||||
fmt.Printf("Text: %s\n", element.Text)
|
||||
fmt.Printf("Confidence: %.2f\n", element.Confidence.Recognition)
|
||||
fmt.Printf("Geometry: %+v\n", element.Geometry)
|
||||
if element.Rotation != nil {
|
||||
fmt.Printf("Rotation: %.1f°\n", element.Rotation.AngleDegrees)
|
||||
}
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/ocr/ocr_extraction.md
Normal file
24
docs/snippets/go/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
cfg := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng",
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
log.Println(len(result.Content))
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/ocr/ocr_force_all_pages.md
Normal file
24
docs/snippets/go/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
},
|
||||
ForceOcr: true,
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Println(result.Content)
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/ocr/ocr_multi_language.md
Normal file
23
docs/snippets/go/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
result, err := kreuzberg.ExtractFileSync("multilingual.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng+deu+fra",
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println(result.Content)
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/ocr/ocr_paddleocr.md
Normal file
24
docs/snippets/go/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
cfg := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "paddle-ocr",
|
||||
Language: "en",
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
log.Println(len(result.Content))
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user