This commit is contained in:
32
docs/snippets/go/plugins/clear_plugins.md
Normal file
32
docs/snippets/go/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Clear all plugins of a specific type
|
||||
if err := kreuzberg.ClearPostProcessors(); err != nil {
|
||||
log.Fatalf("clear post processors: %v", err)
|
||||
}
|
||||
log.Println("Post processors cleared")
|
||||
|
||||
if err := kreuzberg.ClearValidators(); err != nil {
|
||||
log.Fatalf("clear validators: %v", err)
|
||||
}
|
||||
log.Println("Validators cleared")
|
||||
|
||||
if err := kreuzberg.ClearOCRBackends(); err != nil {
|
||||
log.Fatalf("clear OCR backends: %v", err)
|
||||
}
|
||||
log.Println("OCR backends cleared")
|
||||
|
||||
if err := kreuzberg.ClearDocumentExtractors(); err != nil {
|
||||
log.Fatalf("clear document extractors: %v", err)
|
||||
}
|
||||
log.Println("Document extractors cleared")
|
||||
}
|
||||
```
|
||||
64
docs/snippets/go/plugins/embedding_backend.md
Normal file
64
docs/snippets/go/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,64 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
// MyEmbedder wraps an already-loaded embedder so kreuzberg can call back into
// it during chunking and standalone embed requests. Its method set is meant
// to satisfy the kreuzberg.EmbeddingBackend interface.
type MyEmbedder struct{}

// Name returns the identifier this backend is registered under.
func (e *MyEmbedder) Name() string { return "my-embedder" }

// Version returns the backend's version string.
func (e *MyEmbedder) Version() string { return "1.0.0" }

// Initialize is an optional warm-up hook; it runs once at registration before
// Dimensions() is cached. This example has nothing to warm up.
func (e *MyEmbedder) Initialize() error {
	return nil
}

// Shutdown is the teardown hook; this example holds no resources.
func (e *MyEmbedder) Shutdown() error { return nil }

// Dimensions reports the width of every vector Embed produces. It is captured
// once at registration; the dispatcher uses this for shape validation.
func (e *MyEmbedder) Dimensions() uint { return 768 }

// Embed returns one vector per input text, in input order. Each vector has
// exactly Dimensions() elements. An empty input yields an empty, non-nil
// slice and a nil error.
func (e *MyEmbedder) Embed(texts []string) ([][]float32, error) {
	// Delegate to the already-loaded host model. This placeholder emits
	// zero-filled vectors.
	//
	// The width comes from Dimensions() rather than a second literal so the
	// advertised shape and the produced shape cannot drift apart.
	dim := e.Dimensions()

	out := make([][]float32, len(texts))
	for i := range texts {
		out[i] = make([]float32, dim)
	}
	return out, nil
}
|
||||
|
||||
func main() {
|
||||
// Register once at startup.
|
||||
if err := kreuzberg.RegisterEmbeddingBackend(&MyEmbedder{}); err != nil {
|
||||
log.Fatalf("failed to register embedding backend: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterEmbeddingBackend("my-embedder"); err != nil {
|
||||
log.Printf("warning: failed to unregister embedding backend: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
maxDuration := uint64(30)
|
||||
embedderName := "my-embedder"
|
||||
config := kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Variant: "plugin",
|
||||
Type: "plugin",
|
||||
Name: &embedderName,
|
||||
},
|
||||
// Optional: bound the wait on a hung backend (default 60s; nil disables).
|
||||
MaxEmbedDurationSecs: &maxDuration,
|
||||
}
|
||||
|
||||
vectors, err := kreuzberg.EmbedTexts([]string{"Hello, world!", "Second text"}, config)
|
||||
if err != nil {
|
||||
log.Fatalf("embed failed: %v", err)
|
||||
}
|
||||
log.Printf("Generated %d vectors", len(vectors))
|
||||
}
|
||||
```
|
||||
22
docs/snippets/go/plugins/extractor_registration.md
Normal file
22
docs/snippets/go/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Register custom extractor with priority 50
|
||||
if err := kreuzberg.RegisterDocumentExtractor("custom-json-extractor", 50); err != nil {
|
||||
log.Fatalf("register extractor failed: %v", err)
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.json", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
log.Printf("Extracted content length: %d", len(result.Content))
|
||||
}
|
||||
```
|
||||
52
docs/snippets/go/plugins/list_plugins.md
Normal file
52
docs/snippets/go/plugins/list_plugins.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// List all registered document extractors
|
||||
extractors, err := kreuzberg.ListDocumentExtractors()
|
||||
if err != nil {
|
||||
log.Fatalf("list document extractors: %v", err)
|
||||
}
|
||||
fmt.Println("Document Extractors:")
|
||||
for _, extractor := range extractors {
|
||||
fmt.Printf(" - %s\n", extractor)
|
||||
}
|
||||
|
||||
// List all registered post-processors
|
||||
processors, err := kreuzberg.ListPostProcessors()
|
||||
if err != nil {
|
||||
log.Fatalf("list post processors: %v", err)
|
||||
}
|
||||
fmt.Println("\nPost-Processors:")
|
||||
for _, processor := range processors {
|
||||
fmt.Printf(" - %s\n", processor)
|
||||
}
|
||||
|
||||
// List all registered OCR backends
|
||||
backends, err := kreuzberg.ListOCRBackends()
|
||||
if err != nil {
|
||||
log.Fatalf("list OCR backends: %v", err)
|
||||
}
|
||||
fmt.Println("\nOCR Backends:")
|
||||
for _, backend := range backends {
|
||||
fmt.Printf(" - %s\n", backend)
|
||||
}
|
||||
|
||||
// List all registered validators
|
||||
validators, err := kreuzberg.ListValidators()
|
||||
if err != nil {
|
||||
log.Fatalf("list validators: %v", err)
|
||||
}
|
||||
fmt.Println("\nValidators:")
|
||||
for _, validator := range validators {
|
||||
fmt.Printf(" - %s\n", validator)
|
||||
}
|
||||
}
|
||||
```
|
||||
72
docs/snippets/go/plugins/min_length_validator.md
Normal file
72
docs/snippets/go/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,72 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// minLengthConfig holds the configuration for the minimum length validator
|
||||
var minLengthConfig = struct {
|
||||
minLength int
|
||||
}{
|
||||
minLength: 100,
|
||||
}
|
||||
|
||||
// minLengthValidator validates that extracted content meets minimum length requirement
|
||||
//export minLengthValidator
|
||||
func minLengthValidator(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("Failed to parse result JSON")
|
||||
}
|
||||
|
||||
content, ok := result["content"].(string)
|
||||
if !ok {
|
||||
return C.CString("Missing content field in result")
|
||||
}
|
||||
|
||||
if len(content) < minLengthConfig.minLength {
|
||||
errMsg := fmt.Sprintf("Content too short: %d < %d", len(content), minLengthConfig.minLength)
|
||||
return C.CString(errMsg)
|
||||
}
|
||||
|
||||
// Validation passed
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the validator with priority 100 (runs early)
|
||||
if err := kreuzberg.RegisterValidator("min_length_validator", 100,
|
||||
(C.ValidatorCallback)(C.minLengthValidator)); err != nil {
|
||||
log.Fatalf("failed to register validator: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterValidator("min_length_validator"); err != nil {
|
||||
log.Printf("warning: failed to unregister validator: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Extract and validate
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Validation passed. Content length: %d", len(result.Content))
|
||||
}
|
||||
```
|
||||
114
docs/snippets/go/plugins/pdf_metadata_extractor.md
Normal file
114
docs/snippets/go/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,114 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"sync/atomic"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// pdfMetadataState tracks statistics about PDF processing
|
||||
var pdfMetadataState = struct {
|
||||
processedCount int64
|
||||
}{
|
||||
processedCount: 0,
|
||||
}
|
||||
|
||||
// pdfMetadataExtractor enriches PDF extraction results with additional metadata
|
||||
//export pdfMetadataExtractor
|
||||
func pdfMetadataExtractor(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("{\"error\":\"Failed to parse result JSON\"}")
|
||||
}
|
||||
|
||||
// Only process PDFs
|
||||
mimeType, ok := result["mime_type"].(string)
|
||||
if !ok || mimeType != "application/pdf" {
|
||||
// Return unchanged for non-PDF documents
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
// Process PDF-specific metadata
|
||||
metadata, ok := result["metadata"].(map[string]interface{})
|
||||
if !ok {
|
||||
metadata = make(map[string]interface{})
|
||||
}
|
||||
|
||||
// Mark as processed by this processor
|
||||
metadata["pdf_processed"] = true
|
||||
|
||||
// Add content statistics
|
||||
content, ok := result["content"].(string)
|
||||
if ok {
|
||||
metadata["content_length"] = len(content)
|
||||
}
|
||||
|
||||
// Increment processed count atomically
|
||||
atomic.AddInt64(&pdfMetadataState.processedCount, 1)
|
||||
metadata["pdf_processor_version"] = "1.0.0"
|
||||
|
||||
result["metadata"] = metadata
|
||||
|
||||
// Serialize back to JSON
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the post-processor with priority 80, early stage
|
||||
if err := kreuzberg.RegisterPostProcessor("pdf_metadata_extractor", 80,
|
||||
(C.PostProcessorCallback)(C.pdfMetadataExtractor)); err != nil {
|
||||
log.Fatalf("failed to register post-processor: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterPostProcessor("pdf_metadata_extractor"); err != nil {
|
||||
log.Printf("warning: failed to unregister post-processor: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Total PDFs processed: %d", atomic.LoadInt64(&pdfMetadataState.processedCount))
|
||||
}()
|
||||
|
||||
// Extract PDF document
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("PDF MIME type: %s", result.MimeType)
|
||||
|
||||
// Parse and display metadata
|
||||
var metadata map[string]interface{}
|
||||
if metaJSON, ok := result.MetadataJSON.(string); ok {
|
||||
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
|
||||
if pdfProcessed, ok := metadata["pdf_processed"].(bool); ok && pdfProcessed {
|
||||
log.Printf("PDF metadata extracted successfully")
|
||||
if contentLen, ok := metadata["content_length"].(float64); ok {
|
||||
log.Printf("Content length: %.0f bytes", contentLen)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
116
docs/snippets/go/plugins/pdf_only_processor.md
Normal file
116
docs/snippets/go/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,116 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// pdfOnlyProcessor applies PDF-specific processing logic only to PDF documents
|
||||
//export pdfOnlyProcessor
|
||||
func pdfOnlyProcessor(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("{\"error\":\"Failed to parse result JSON\"}")
|
||||
}
|
||||
|
||||
// Check MIME type - only process PDFs
|
||||
mimeType, ok := result["mime_type"].(string)
|
||||
if !ok || mimeType != "application/pdf" {
|
||||
// Return unchanged for non-PDF documents
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
// Perform PDF-specific processing
|
||||
metadata, ok := result["metadata"].(map[string]interface{})
|
||||
if !ok {
|
||||
metadata = make(map[string]interface{})
|
||||
}
|
||||
|
||||
// Example PDF-specific processing:
|
||||
// - Extract tables as structured data
|
||||
// - Handle PDF-specific formatting
|
||||
// - Preserve document hierarchy
|
||||
|
||||
metadata["pdf_specific_processing"] = true
|
||||
metadata["processor_type"] = "pdf_only"
|
||||
|
||||
// Check for tables in PDF
|
||||
if tablesJSON, ok := result["tables_json"].(string); ok && tablesJSON != "" {
|
||||
var tables []interface{}
|
||||
if err := json.Unmarshal([]byte(tablesJSON), &tables); err == nil {
|
||||
metadata["table_count"] = len(tables)
|
||||
}
|
||||
}
|
||||
|
||||
result["metadata"] = metadata
|
||||
|
||||
// Serialize back to JSON
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the post-processor with priority 70
|
||||
if err := kreuzberg.RegisterPostProcessor("pdf_only_processor", 70,
|
||||
(C.PostProcessorCallback)(C.pdfOnlyProcessor)); err != nil {
|
||||
log.Fatalf("failed to register post-processor: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterPostProcessor("pdf_only_processor"); err != nil {
|
||||
log.Printf("warning: failed to unregister post-processor: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Process multiple documents - processor will only affect PDFs
|
||||
files := []string{
|
||||
"document.pdf",
|
||||
"image.jpg",
|
||||
"spreadsheet.xlsx",
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
result, err := kreuzberg.ExtractFileSync(file, nil)
|
||||
if err != nil {
|
||||
log.Printf("Warning: extraction failed for %s: %v", file, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse metadata to check if PDF processing occurred
|
||||
var metadata map[string]interface{}
|
||||
if metaJSON, ok := result.MetadataJSON.(string); ok {
|
||||
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
|
||||
if pdfProcessing, ok := metadata["pdf_specific_processing"].(bool); ok && pdfProcessing {
|
||||
log.Printf("PDF-specific processing applied to: %s", file)
|
||||
if tableCount, ok := metadata["table_count"].(float64); ok {
|
||||
log.Printf(" Tables found: %.0f", tableCount)
|
||||
}
|
||||
} else {
|
||||
log.Printf("Skipped PDF processor for: %s (MIME: %s)", file, result.MimeType)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
13
docs/snippets/go/plugins/plugin_extractor.md
Normal file
13
docs/snippets/go/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,13 @@
|
||||
<!-- snippet:skip reason="Go bindings do not support custom document extractor plugins" -->
|
||||
```markdown title="Markdown"
|
||||
!!! note "Not Supported"
|
||||
The Go binding is a thin CGO wrapper and does not currently support
|
||||
custom document extractors. Custom document extractors must be implemented in Rust.
|
||||
|
||||
See the [Rust plugin documentation](../../rust/plugins/plugin_extractor.md) for details on creating custom document extractors.
|
||||
|
||||
Go currently supports:
|
||||
- **PostProcessor** - Transform extraction results
|
||||
- **Validator** - Validate extraction results
|
||||
- **OcrBackend** - Custom OCR implementations
|
||||
```
|
||||
92
docs/snippets/go/plugins/plugin_logging.md
Normal file
92
docs/snippets/go/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,92 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"C"
|
||||
"encoding/json"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
//export loggingPostProcessor
|
||||
func loggingPostProcessor(resultJSON *C.char) *C.char {
|
||||
log.Println("[PostProcessor] Processing extraction result")
|
||||
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
log.Printf("[PostProcessor] Error parsing result: %v", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
if content, ok := result["content"].(string); ok {
|
||||
log.Printf("[PostProcessor] Content length: %d bytes", len(content))
|
||||
if len(content) == 0 {
|
||||
log.Println("[PostProcessor] Warning: Empty content extracted")
|
||||
}
|
||||
}
|
||||
|
||||
if mimeType, ok := result["mime_type"].(string); ok {
|
||||
log.Printf("[PostProcessor] Processing %s", mimeType)
|
||||
}
|
||||
|
||||
// Return NULL to indicate success (no modification)
|
||||
return nil
|
||||
}
|
||||
|
||||
//export loggingValidator
|
||||
func loggingValidator(resultJSON *C.char) *C.char {
|
||||
log.Println("[Validator] Validating extraction result")
|
||||
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
log.Printf("[Validator] Error parsing result: %v", err)
|
||||
errMsg := "Failed to parse validation input"
|
||||
return C.CString(errMsg)
|
||||
}
|
||||
|
||||
if content, ok := result["content"].(string); ok {
|
||||
log.Printf("[Validator] Content length: %d bytes", len(content))
|
||||
if len(content) < 50 {
|
||||
log.Println("[Validator] Error: Content below minimum threshold")
|
||||
errMsg := "Content too short (minimum 50 characters)"
|
||||
return C.CString(errMsg)
|
||||
}
|
||||
}
|
||||
|
||||
// Return NULL to indicate validation passed
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register post processor with logging
|
||||
if err := kreuzberg.RegisterPostProcessor(
|
||||
"logging-processor",
|
||||
100, // priority
|
||||
(C.PostProcessorCallback)(C.loggingPostProcessor),
|
||||
); err != nil {
|
||||
log.Fatalf("register post processor failed: %v", err)
|
||||
}
|
||||
log.Println("[Main] PostProcessor registered with logging enabled")
|
||||
|
||||
// Register validator with logging
|
||||
if err := kreuzberg.RegisterValidator(
|
||||
"logging-validator",
|
||||
50, // priority
|
||||
(C.ValidatorCallback)(C.loggingValidator),
|
||||
); err != nil {
|
||||
log.Fatalf("register validator failed: %v", err)
|
||||
}
|
||||
log.Println("[Main] Validator registered with logging enabled")
|
||||
|
||||
// Extract with logging
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("[Main] Extraction complete: %d bytes content", len(result.Content))
|
||||
}
|
||||
```
|
||||
213
docs/snippets/go/plugins/plugin_testing.md
Normal file
213
docs/snippets/go/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,213 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"C"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"testing"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
// TestPostProcessor tests custom post processor behavior
|
||||
func TestPostProcessor(t *testing.T) {
|
||||
// Create a post processor that adds metadata
|
||||
metricsMap := make(map[string]int64)
|
||||
|
||||
//export testPostProcessor
|
||||
testPostProcessor := func(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if content, ok := result["content"].(string); ok {
|
||||
metricsMap["content_length"] = int64(len(content))
|
||||
metricsMap["processed"] = 1
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Register the processor
|
||||
err := kreuzberg.RegisterPostProcessor(
|
||||
"test-processor",
|
||||
10,
|
||||
(C.PostProcessorCallback)(C.testPostProcessor),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to register post processor: %v", err)
|
||||
}
|
||||
|
||||
// Simulate a mock result
|
||||
mockResult := map[string]interface{}{
|
||||
"content": "Test extraction content",
|
||||
"mime_type": "text/plain",
|
||||
"metadata": map[string]interface{}{},
|
||||
"tables": []interface{}{},
|
||||
"detected_languages": []interface{}{},
|
||||
}
|
||||
|
||||
resultJSON, err := json.Marshal(mockResult)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to marshal mock result: %v", err)
|
||||
}
|
||||
cResultJSON := C.CString(string(resultJSON))
|
||||
defer C.free(unsafe.Pointer(cResultJSON))
|
||||
|
||||
// Call the processor
|
||||
testPostProcessor(cResultJSON)
|
||||
|
||||
// Verify metrics were recorded
|
||||
if metricsMap["content_length"] != 22 {
|
||||
t.Errorf("Expected content_length 22, got %d", metricsMap["content_length"])
|
||||
}
|
||||
if metricsMap["processed"] != 1 {
|
||||
t.Errorf("Expected processed flag to be 1")
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
_ = kreuzberg.UnregisterPostProcessor("test-processor")
|
||||
}
|
||||
|
||||
// TestValidator tests custom validator behavior
|
||||
func TestValidator(t *testing.T) {
|
||||
validatorCalled := false
|
||||
|
||||
//export testValidator
|
||||
testValidator := func(resultJSON *C.char) *C.char {
|
||||
validatorCalled = true
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("Failed to parse validation input")
|
||||
}
|
||||
|
||||
if content, ok := result["content"].(string); ok {
|
||||
if len(content) < 10 {
|
||||
return C.CString("Content too short")
|
||||
}
|
||||
}
|
||||
|
||||
return nil // Success
|
||||
}
|
||||
|
||||
// Register the validator
|
||||
err := kreuzberg.RegisterValidator(
|
||||
"test-validator",
|
||||
50,
|
||||
(C.ValidatorCallback)(C.testValidator),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to register validator: %v", err)
|
||||
}
|
||||
|
||||
// Test 1: Valid content
|
||||
validContent := map[string]interface{}{
|
||||
"content": "This is valid content",
|
||||
"mime_type": "text/plain",
|
||||
"metadata": map[string]interface{}{},
|
||||
"tables": []interface{}{},
|
||||
"detected_languages": []interface{}{},
|
||||
}
|
||||
|
||||
validJSON, err := json.Marshal(validContent)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to marshal valid content: %v", err)
|
||||
}
|
||||
cValidJSON := C.CString(string(validJSON))
|
||||
defer C.free(unsafe.Pointer(cValidJSON))
|
||||
|
||||
result := testValidator(cValidJSON)
|
||||
if result != nil {
|
||||
t.Errorf("Expected nil (success), got error: %s", C.GoString(result))
|
||||
}
|
||||
|
||||
if !validatorCalled {
|
||||
t.Errorf("Validator was not called")
|
||||
}
|
||||
|
||||
// Test 2: Invalid content (too short)
|
||||
invalidContent := map[string]interface{}{
|
||||
"content": "Short",
|
||||
"mime_type": "text/plain",
|
||||
"metadata": map[string]interface{}{},
|
||||
"tables": []interface{}{},
|
||||
"detected_languages": []interface{}{},
|
||||
}
|
||||
|
||||
invalidJSON, err := json.Marshal(invalidContent)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to marshal invalid content: %v", err)
|
||||
}
|
||||
cInvalidJSON := C.CString(string(invalidJSON))
|
||||
defer C.free(unsafe.Pointer(cInvalidJSON))
|
||||
|
||||
result = testValidator(cInvalidJSON)
|
||||
if result == nil {
|
||||
t.Errorf("Expected error for short content, got nil")
|
||||
} else {
|
||||
errorMsg := C.GoString(result)
|
||||
if errorMsg != "Content too short" {
|
||||
t.Errorf("Expected 'Content too short', got: %s", errorMsg)
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
_ = kreuzberg.UnregisterValidator("test-validator")
|
||||
}
|
||||
|
||||
// TestValidatorIntegration tests validator with actual extraction
|
||||
func TestValidatorIntegration(t *testing.T) {
|
||||
//export integrationValidator
|
||||
integrationValidator := func(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString(fmt.Sprintf("Parse error: %v", err))
|
||||
}
|
||||
|
||||
// Validate that mime_type is set
|
||||
if _, ok := result["mime_type"]; !ok {
|
||||
return C.CString("Missing mime_type in result")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Register validator
|
||||
err := kreuzberg.RegisterValidator(
|
||||
"integration-validator",
|
||||
100,
|
||||
(C.ValidatorCallback)(C.integrationValidator),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to register validator: %v", err)
|
||||
}
|
||||
|
||||
// The validator will be called automatically during extraction
|
||||
// This test verifies the registration was successful
|
||||
validators, err := kreuzberg.ListValidators()
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to list validators: %v", err)
|
||||
}
|
||||
|
||||
found := false
|
||||
for _, v := range validators {
|
||||
if v == "integration-validator" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
t.Errorf("Validator not found in registered validators list")
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
_ = kreuzberg.UnregisterValidator("integration-validator")
|
||||
}
|
||||
```
|
||||
35
docs/snippets/go/plugins/plugin_validator.md
Normal file
35
docs/snippets/go/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"log"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
//export customValidator
|
||||
func customValidator(resultJSON *C.char) *C.char {
|
||||
// Inspect resultJSON, return error message or NULL
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
if err := kreuzberg.RegisterValidator("go-validator", 50, (C.ValidatorCallback)(C.customValidator)); err != nil {
|
||||
log.Fatalf("register validator failed: %v", err)
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
log.Printf("Content length: %d", len(result.Content))
|
||||
}
|
||||
```
|
||||
77
docs/snippets/go/plugins/quality_score_validator.md
Normal file
77
docs/snippets/go/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,77 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// qualityThreshold is the minimum acceptable quality score
|
||||
const qualityThreshold = 0.5
|
||||
|
||||
// qualityScoreValidator validates that extraction quality meets minimum threshold
|
||||
//export qualityScoreValidator
|
||||
func qualityScoreValidator(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("Failed to parse result JSON")
|
||||
}
|
||||
|
||||
// Extract metadata object
|
||||
metadata, ok := result["metadata"].(map[string]interface{})
|
||||
if !ok {
|
||||
// No metadata is not an error, just skip quality check
|
||||
return nil
|
||||
}
|
||||
|
||||
// Get quality score from result
|
||||
qualityScore := 0.0
|
||||
if score, ok := result["quality_score"].(float64); ok {
|
||||
qualityScore = score
|
||||
}
|
||||
|
||||
// Validate against threshold
|
||||
if qualityScore < qualityThreshold {
|
||||
errMsg := fmt.Sprintf("Quality score too low: %.0f%% < %.0f%%", qualityScore*100, qualityThreshold*100)
|
||||
return C.CString(errMsg)
|
||||
}
|
||||
|
||||
// Validation passed
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the validator with priority 50
|
||||
if err := kreuzberg.RegisterValidator("quality_score_validator", 50,
|
||||
(C.ValidatorCallback)(C.qualityScoreValidator)); err != nil {
|
||||
log.Fatalf("failed to register validator: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterValidator("quality_score_validator"); err != nil {
|
||||
log.Printf("warning: failed to unregister validator: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Extract and validate
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction or validation failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Quality validation passed for: %s", result.MimeType)
|
||||
}
|
||||
```
|
||||
165
docs/snippets/go/plugins/stateful_plugin.md
Normal file
165
docs/snippets/go/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,165 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// PluginState manages thread-safe state for the stateful plugin. The code in
// this file that touches the other fields does so while holding mu.
type PluginState struct {
	mu           sync.Mutex
	callCount    int               // number of plugin invocations so far
	cache        map[string]string // MIME type -> "processed" marker
	lastMimeType string            // MIME type of the latest result that carried one
}

// globalState holds the plugin's persistent state across calls. The cache map
// is created up front because writing to a nil map panics.
var globalState = &PluginState{cache: map[string]string{}}
|
||||
|
||||
// statefulPlugin demonstrates a thread-safe plugin with persistent state
|
||||
//export statefulPlugin
|
||||
func statefulPlugin(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("{\"error\":\"Failed to parse result JSON\"}")
|
||||
}
|
||||
|
||||
// Acquire lock to safely modify state
|
||||
globalState.mu.Lock()
|
||||
defer globalState.mu.Unlock()
|
||||
|
||||
// Increment call counter
|
||||
globalState.callCount++
|
||||
|
||||
// Extract and store MIME type
|
||||
if mimeType, ok := result["mime_type"].(string); ok {
|
||||
globalState.lastMimeType = mimeType
|
||||
globalState.cache[mimeType] = "processed"
|
||||
}
|
||||
|
||||
// Ensure metadata exists
|
||||
metadata, ok := result["metadata"].(map[string]interface{})
|
||||
if !ok {
|
||||
metadata = make(map[string]interface{})
|
||||
}
|
||||
|
||||
// Add state information to metadata
|
||||
metadata["plugin_call_count"] = globalState.callCount
|
||||
metadata["last_mime_type"] = globalState.lastMimeType
|
||||
metadata["cached_types_count"] = len(globalState.cache)
|
||||
metadata["plugin_version"] = "1.0.0"
|
||||
|
||||
result["metadata"] = metadata
|
||||
|
||||
// Serialize back to JSON
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
// GetPluginStats safely retrieves the current plugin state for logging
|
||||
func GetPluginStats() (int, string, []string) {
|
||||
globalState.mu.Lock()
|
||||
defer globalState.mu.Unlock()
|
||||
|
||||
callCount := globalState.callCount
|
||||
lastMime := globalState.lastMimeType
|
||||
|
||||
mimeTypes := make([]string, 0, len(globalState.cache))
|
||||
for mimeType := range globalState.cache {
|
||||
mimeTypes = append(mimeTypes, mimeType)
|
||||
}
|
||||
|
||||
return callCount, lastMime, mimeTypes
|
||||
}
|
||||
|
||||
// ResetPluginState clears the plugin state - useful for testing
|
||||
func ResetPluginState() {
|
||||
globalState.mu.Lock()
|
||||
defer globalState.mu.Unlock()
|
||||
|
||||
globalState.callCount = 0
|
||||
globalState.lastMimeType = ""
|
||||
globalState.cache = make(map[string]string)
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the stateful post-processor with priority 60
|
||||
if err := kreuzberg.RegisterPostProcessor("stateful_plugin", 60,
|
||||
(C.PostProcessorCallback)(C.statefulPlugin)); err != nil {
|
||||
log.Fatalf("failed to register post-processor: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterPostProcessor("stateful_plugin"); err != nil {
|
||||
log.Printf("warning: failed to unregister post-processor: %v", err)
|
||||
}
|
||||
|
||||
// Print final statistics
|
||||
callCount, lastMime, mimeTypes := GetPluginStats()
|
||||
log.Printf("Plugin Statistics:")
|
||||
log.Printf(" Total calls: %d", callCount)
|
||||
log.Printf(" Last MIME type: %s", lastMime)
|
||||
log.Printf(" Unique MIME types processed: %d", len(mimeTypes))
|
||||
if len(mimeTypes) > 0 {
|
||||
log.Printf(" Processed types: %v", mimeTypes)
|
||||
}
|
||||
}()
|
||||
|
||||
// Process multiple documents to demonstrate state accumulation
|
||||
files := []string{
|
||||
"document1.pdf",
|
||||
"document2.pdf",
|
||||
"image.png",
|
||||
"document3.txt",
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
log.Printf("Processing: %s", file)
|
||||
result, err := kreuzberg.ExtractFileSync(file, nil)
|
||||
if err != nil {
|
||||
log.Printf(" Warning: extraction failed: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse and display metadata
|
||||
var metadata map[string]interface{}
|
||||
if metaJSON, ok := result.MetadataJSON.(string); ok {
|
||||
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
|
||||
if callCount, ok := metadata["plugin_call_count"].(float64); ok {
|
||||
log.Printf(" Plugin call count: %.0f", callCount)
|
||||
}
|
||||
if cachedCount, ok := metadata["cached_types_count"].(float64); ok {
|
||||
log.Printf(" Cached MIME types: %.0f", cachedCount)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Demonstrate thread-safe state access
|
||||
callCount, lastMime, mimeTypes := GetPluginStats()
|
||||
log.Printf("\nFinal Plugin State:")
|
||||
log.Printf(" Total calls: %d", callCount)
|
||||
log.Printf(" Last MIME type: %s", lastMime)
|
||||
log.Printf(" Processed MIME types: %v", mimeTypes)
|
||||
}
|
||||
```
|
||||
55
docs/snippets/go/plugins/unregister_plugins.md
Normal file
55
docs/snippets/go/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Unregister a specific document extractor
|
||||
if err := kreuzberg.UnregisterDocumentExtractor("custom-json-extractor"); err != nil {
|
||||
var validErr *kreuzberg.ValidationError
|
||||
if errors.As(err, &validErr) {
|
||||
log.Printf("validation error: %v", err)
|
||||
} else {
|
||||
log.Fatalf("unregister document extractor: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Unregister a specific post-processor
|
||||
if err := kreuzberg.UnregisterPostProcessor("word_count"); err != nil {
|
||||
var validErr *kreuzberg.ValidationError
|
||||
if errors.As(err, &validErr) {
|
||||
log.Printf("validation error: %v", err)
|
||||
} else {
|
||||
log.Fatalf("unregister post processor: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Unregister a specific OCR backend
|
||||
if err := kreuzberg.UnregisterOCRBackend("cloud-ocr"); err != nil {
|
||||
var validErr *kreuzberg.ValidationError
|
||||
if errors.As(err, &validErr) {
|
||||
log.Printf("validation error: %v", err)
|
||||
} else {
|
||||
log.Fatalf("unregister OCR backend: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Unregister a specific validator
|
||||
if err := kreuzberg.UnregisterValidator("min_length_validator"); err != nil {
|
||||
var validErr *kreuzberg.ValidationError
|
||||
if errors.As(err, &validErr) {
|
||||
log.Printf("validation error: %v", err)
|
||||
} else {
|
||||
log.Fatalf("unregister validator: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println("Plugins unregistered successfully")
|
||||
}
|
||||
```
|
||||
90
docs/snippets/go/plugins/word_count_processor.md
Normal file
90
docs/snippets/go/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,90 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"strings"
|
||||
"unsafe"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
|
||||
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
|
||||
#include <stdlib.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
// wordCountProcessor adds word count metadata to extraction results
|
||||
//export wordCountProcessor
|
||||
func wordCountProcessor(resultJSON *C.char) *C.char {
|
||||
jsonStr := C.GoString(resultJSON)
|
||||
var result map[string]interface{}
|
||||
|
||||
if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
|
||||
return C.CString("{\"error\":\"Failed to parse result JSON\"}")
|
||||
}
|
||||
|
||||
// Extract content
|
||||
content, ok := result["content"].(string)
|
||||
if !ok {
|
||||
return C.CString("{\"error\":\"Missing content field\"}")
|
||||
}
|
||||
|
||||
// Count words by splitting on whitespace
|
||||
words := strings.Fields(content)
|
||||
wordCount := len(words)
|
||||
|
||||
// Ensure metadata exists
|
||||
metadata, ok := result["metadata"].(map[string]interface{})
|
||||
if !ok {
|
||||
metadata = make(map[string]interface{})
|
||||
}
|
||||
|
||||
// Add word count to metadata
|
||||
metadata["word_count"] = wordCount
|
||||
|
||||
// Update result
|
||||
result["metadata"] = metadata
|
||||
|
||||
// Serialize back to JSON
|
||||
outputJSON, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return C.CString("{\"error\":\"Failed to serialize result\"}")
|
||||
}
|
||||
|
||||
return C.CString(string(outputJSON))
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the post-processor with priority 100, early stage
|
||||
if err := kreuzberg.RegisterPostProcessor("word_count_processor", 100,
|
||||
(C.PostProcessorCallback)(C.wordCountProcessor)); err != nil {
|
||||
log.Fatalf("failed to register post-processor: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
if err := kreuzberg.UnregisterPostProcessor("word_count_processor"); err != nil {
|
||||
log.Printf("warning: failed to unregister post-processor: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Extract document
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("extraction failed: %v", err)
|
||||
}
|
||||
|
||||
// Access word count from metadata
|
||||
var metadata map[string]interface{}
|
||||
if metaJSON, ok := result.MetadataJSON.(string); ok {
|
||||
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
|
||||
if wordCount, ok := metadata["word_count"].(float64); ok {
|
||||
log.Printf("Word count: %.0f", wordCount)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user