Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,32 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main empties every plugin registry in turn: post processors, validators,
// OCR backends, then document extractors. The first registry that fails to
// clear aborts the program; each success is logged.
func main() {
	// report handles the outcome of one clearing call: abort with failFmt on
	// error, otherwise log the success message.
	report := func(err error, failFmt, done string) {
		if err != nil {
			log.Fatalf(failFmt, err)
		}
		log.Println(done)
	}

	// Clear all plugins of a specific type, one type per call.
	report(kreuzberg.ClearPostProcessors(), "clear post processors: %v", "Post processors cleared")
	report(kreuzberg.ClearValidators(), "clear validators: %v", "Validators cleared")
	report(kreuzberg.ClearOCRBackends(), "clear OCR backends: %v", "OCR backends cleared")
	report(kreuzberg.ClearDocumentExtractors(), "clear document extractors: %v", "Document extractors cleared")
}
```

View File

@@ -0,0 +1,64 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// MyEmbedder adapts an already-loaded embedding model so kreuzberg can call
// back into it during chunking and standalone embed requests. It is meant to
// implement the kreuzberg.EmbeddingBackend interface.
type MyEmbedder struct{}

// Name reports the identifier this backend is registered under.
func (e *MyEmbedder) Name() string {
	return "my-embedder"
}

// Version reports the backend's version string.
func (e *MyEmbedder) Version() string {
	return "1.0.0"
}

// Initialize is an optional warm-up hook; it runs once at registration,
// before Dimensions() is cached. This example has nothing to prepare.
func (e *MyEmbedder) Initialize() error {
	return nil
}

// Shutdown is the teardown hook; this example has nothing to release.
func (e *MyEmbedder) Shutdown() error {
	return nil
}

// Dimensions reports the length of each vector produced by Embed. The value
// is captured once at registration; the dispatcher uses it for shape
// validation.
func (e *MyEmbedder) Dimensions() uint {
	return 768
}

// Embed returns one vector per input text, each Dimensions() elements long.
// A real backend would delegate to the already-loaded host model; this
// placeholder returns zero-filled vectors and never fails.
func (e *MyEmbedder) Embed(texts []string) ([][]float32, error) {
	width := e.Dimensions()
	vectors := make([][]float32, 0, len(texts))
	for range texts {
		vectors = append(vectors, make([]float32, width))
	}
	return vectors, nil
}
// main registers MyEmbedder as an embedding backend, embeds two sample texts
// through it, and logs how many vectors came back.
//
// The backend is unregistered on every exit path after registration. The
// failure path calls the cleanup explicitly because log.Fatalf terminates the
// process via os.Exit, which does not run deferred functions.
func main() {
	// Register once at startup.
	if err := kreuzberg.RegisterEmbeddingBackend(&MyEmbedder{}); err != nil {
		log.Fatalf("failed to register embedding backend: %v", err)
	}
	unregister := func() {
		if err := kreuzberg.UnregisterEmbeddingBackend("my-embedder"); err != nil {
			log.Printf("warning: failed to unregister embedding backend: %v", err)
		}
	}

	maxDuration := uint64(30)
	embedderName := "my-embedder"
	config := kreuzberg.EmbeddingConfig{
		Model: kreuzberg.EmbeddingModelType{
			Variant: "plugin",
			Type:    "plugin",
			Name:    &embedderName,
		},
		// Optional: bound the wait on a hung backend (default 60s; nil disables).
		MaxEmbedDurationSecs: &maxDuration,
	}

	vectors, err := kreuzberg.EmbedTexts([]string{"Hello, world!", "Second text"}, config)
	if err != nil {
		unregister() // a deferred call would be skipped by log.Fatalf
		log.Fatalf("embed failed: %v", err)
	}
	defer unregister()
	log.Printf("Generated %d vectors", len(vectors))
}
```

View File

@@ -0,0 +1,22 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main registers the "custom-json-extractor" document extractor with
// priority 50, extracts document.json, and logs the extracted content length.
// Either failure aborts the program.
func main() {
	const (
		extractorName     = "custom-json-extractor"
		extractorPriority = 50
	)

	regErr := kreuzberg.RegisterDocumentExtractor(extractorName, extractorPriority)
	if regErr != nil {
		log.Fatalf("register extractor failed: %v", regErr)
	}

	doc, extractErr := kreuzberg.ExtractFileSync("document.json", nil)
	if extractErr != nil {
		log.Fatalf("extract failed: %v", extractErr)
	}
	log.Printf("Extracted content length: %d", len(doc.Content))
}
```

View File

@@ -0,0 +1,52 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main prints the names of all registered plugins, grouped by kind: document
// extractors, post-processors, OCR backends, and validators. A failure to
// list any group aborts the program.
func main() {
	// Document extractors.
	extractorNames, err := kreuzberg.ListDocumentExtractors()
	if err != nil {
		log.Fatalf("list document extractors: %v", err)
	}
	fmt.Println("Document Extractors:")
	for _, name := range extractorNames {
		fmt.Printf(" - %s\n", name)
	}

	// Post-processors.
	processorNames, err := kreuzberg.ListPostProcessors()
	if err != nil {
		log.Fatalf("list post processors: %v", err)
	}
	fmt.Println("\nPost-Processors:")
	for _, name := range processorNames {
		fmt.Printf(" - %s\n", name)
	}

	// OCR backends.
	backendNames, err := kreuzberg.ListOCRBackends()
	if err != nil {
		log.Fatalf("list OCR backends: %v", err)
	}
	fmt.Println("\nOCR Backends:")
	for _, name := range backendNames {
		fmt.Printf(" - %s\n", name)
	}

	// Validators.
	validatorNames, err := kreuzberg.ListValidators()
	if err != nil {
		log.Fatalf("list validators: %v", err)
	}
	fmt.Println("\nValidators:")
	for _, name := range validatorNames {
		fmt.Printf(" - %s\n", name)
	}
}
```

View File

@@ -0,0 +1,72 @@
```go title="Go"
package main
import (
"encoding/json"
"fmt"
"log"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// minLengthConfig is the package-level configuration read by the minimum
// length validator. minLength is the smallest accepted content size and is
// compared against len(content), i.e. a byte count.
var minLengthConfig = struct{ minLength int }{minLength: 100}
// minLengthValidator is the exported validator callback that rejects
// extraction results whose content is shorter than minLengthConfig.minLength
// bytes.
//
// resultJSON is the extraction result serialized as a JSON object. The return
// value is nil when validation passes, otherwise a C string (allocated with
// C.CString) describing the failure.
// NOTE(review): presumably the native caller frees the returned string — confirm.
//
//export minLengthValidator
func minLengthValidator(resultJSON *C.char) *C.char {
	var payload map[string]interface{}
	if json.Unmarshal([]byte(C.GoString(resultJSON)), &payload) != nil {
		return C.CString("Failed to parse result JSON")
	}

	text, isString := payload["content"].(string)
	switch {
	case !isString:
		return C.CString("Missing content field in result")
	case len(text) < minLengthConfig.minLength:
		return C.CString(fmt.Sprintf("Content too short: %d < %d", len(text), minLengthConfig.minLength))
	}

	// Validation passed.
	return nil
}
// main registers the minimum length validator, extracts document.pdf (which
// runs the validator), and logs the resulting content length.
//
// The validator is unregistered on every exit path after registration. The
// failure path calls the cleanup explicitly because log.Fatalf terminates the
// process via os.Exit, which does not run deferred functions.
func main() {
	// Register the validator with priority 100 (runs early)
	if err := kreuzberg.RegisterValidator("min_length_validator", 100,
		(C.ValidatorCallback)(C.minLengthValidator)); err != nil {
		log.Fatalf("failed to register validator: %v", err)
	}
	unregister := func() {
		if err := kreuzberg.UnregisterValidator("min_length_validator"); err != nil {
			log.Printf("warning: failed to unregister validator: %v", err)
		}
	}

	// Extract and validate
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		unregister() // a deferred call would be skipped by log.Fatalf
		log.Fatalf("extraction failed: %v", err)
	}
	defer unregister()
	log.Printf("Validation passed. Content length: %d", len(result.Content))
}
```

View File

@@ -0,0 +1,114 @@
```go title="Go"
package main
import (
"encoding/json"
"log"
"sync/atomic"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// pdfMetadataState holds statistics about PDF processing. processedCount
// starts at zero and is read and updated through sync/atomic by its users.
var pdfMetadataState = struct{ processedCount int64 }{}
// pdfMetadataExtractor is the exported post-processor callback that enriches
// PDF extraction results with additional metadata.
//
// resultJSON is the extraction result serialized as a JSON object. Results
// whose "mime_type" is not "application/pdf" are re-serialized without
// changes. For PDFs the "metadata" object (created when absent) gains
// "pdf_processed", "content_length" (when "content" is a string) and
// "pdf_processor_version", and the processed counter is incremented.
//
// The return value is a C string holding the resulting JSON, or a JSON object
// with an "error" key when parsing or serialization fails.
//
//export pdfMetadataExtractor
func pdfMetadataExtractor(resultJSON *C.char) *C.char {
	var doc map[string]interface{}
	if json.Unmarshal([]byte(C.GoString(resultJSON)), &doc) != nil {
		return C.CString("{\"error\":\"Failed to parse result JSON\"}")
	}

	// Only PDFs are enriched; a missing or non-string MIME type yields "".
	if kind, _ := doc["mime_type"].(string); kind == "application/pdf" {
		meta, found := doc["metadata"].(map[string]interface{})
		if !found {
			meta = map[string]interface{}{}
		}

		// Mark as processed by this processor and add content statistics.
		meta["pdf_processed"] = true
		if body, isText := doc["content"].(string); isText {
			meta["content_length"] = len(body)
		}

		// The counter is shared with main, so it is updated atomically.
		atomic.AddInt64(&pdfMetadataState.processedCount, 1)
		meta["pdf_processor_version"] = "1.0.0"
		doc["metadata"] = meta
	}

	// Both the enriched and the untouched result are serialized the same way.
	encoded, err := json.Marshal(doc)
	if err != nil {
		return C.CString("{\"error\":\"Failed to serialize result\"}")
	}
	return C.CString(string(encoded))
}
// main registers the PDF metadata post-processor, extracts document.pdf, and
// logs the metadata the processor attached.
//
// The processor is unregistered and the processed-PDF total is logged on
// every exit path after registration. The failure path calls the cleanup
// explicitly because log.Fatalf terminates the process via os.Exit, which
// does not run deferred functions.
func main() {
	// Register the post-processor with priority 80, early stage
	if err := kreuzberg.RegisterPostProcessor("pdf_metadata_extractor", 80,
		(C.PostProcessorCallback)(C.pdfMetadataExtractor)); err != nil {
		log.Fatalf("failed to register post-processor: %v", err)
	}
	cleanup := func() {
		if err := kreuzberg.UnregisterPostProcessor("pdf_metadata_extractor"); err != nil {
			log.Printf("warning: failed to unregister post-processor: %v", err)
		}
		log.Printf("Total PDFs processed: %d", atomic.LoadInt64(&pdfMetadataState.processedCount))
	}

	// Extract PDF document
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		cleanup() // a deferred call would be skipped by log.Fatalf
		log.Fatalf("extraction failed: %v", err)
	}
	defer cleanup()
	log.Printf("PDF MIME type: %s", result.MimeType)

	// Parse and display metadata; anything unparsable is silently skipped.
	metaJSON, ok := result.MetadataJSON.(string)
	if !ok {
		return
	}
	var metadata map[string]interface{}
	if err := json.Unmarshal([]byte(metaJSON), &metadata); err != nil {
		return
	}
	if pdfProcessed, ok := metadata["pdf_processed"].(bool); !ok || !pdfProcessed {
		return
	}
	log.Printf("PDF metadata extracted successfully")
	// encoding/json decodes JSON numbers into float64 for interface{} targets.
	if contentLen, ok := metadata["content_length"].(float64); ok {
		log.Printf("Content length: %.0f bytes", contentLen)
	}
}
```

View File

@@ -0,0 +1,116 @@
```go title="Go"
package main
import (
"encoding/json"
"log"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// pdfOnlyProcessor is the exported post-processor callback that applies
// PDF-specific processing and leaves every other document untouched.
//
// resultJSON is the extraction result serialized as a JSON object. Results
// whose "mime_type" is not "application/pdf" are re-serialized without
// changes. For PDFs the "metadata" object (created when absent) gains
// "pdf_specific_processing", "processor_type" and, when "tables_json" holds a
// parsable JSON array, "table_count".
//
// The return value is a C string holding the resulting JSON, or a JSON object
// with an "error" key when parsing or serialization fails.
//
//export pdfOnlyProcessor
func pdfOnlyProcessor(resultJSON *C.char) *C.char {
	var doc map[string]interface{}
	if json.Unmarshal([]byte(C.GoString(resultJSON)), &doc) != nil {
		return C.CString("{\"error\":\"Failed to parse result JSON\"}")
	}

	// A missing or non-string MIME type yields "" and is treated as non-PDF.
	if kind, _ := doc["mime_type"].(string); kind == "application/pdf" {
		meta, found := doc["metadata"].(map[string]interface{})
		if !found {
			meta = map[string]interface{}{}
		}

		// Example PDF-specific processing:
		// - Extract tables as structured data
		// - Handle PDF-specific formatting
		// - Preserve document hierarchy
		meta["pdf_specific_processing"] = true
		meta["processor_type"] = "pdf_only"

		// Count tables when the result carries them as an embedded JSON array.
		if rawTables, _ := doc["tables_json"].(string); rawTables != "" {
			var tables []interface{}
			if json.Unmarshal([]byte(rawTables), &tables) == nil {
				meta["table_count"] = len(tables)
			}
		}

		doc["metadata"] = meta
	}

	// Both the processed and the untouched result are serialized the same way.
	encoded, err := json.Marshal(doc)
	if err != nil {
		return C.CString("{\"error\":\"Failed to serialize result\"}")
	}
	return C.CString(string(encoded))
}
// main registers the PDF-only post-processor, extracts a mix of documents,
// and logs for each one whether the PDF-specific processing was applied.
// Files that fail to extract are logged and skipped.
func main() {
	// Register the post-processor with priority 70
	if err := kreuzberg.RegisterPostProcessor("pdf_only_processor", 70,
		(C.PostProcessorCallback)(C.pdfOnlyProcessor)); err != nil {
		log.Fatalf("failed to register post-processor: %v", err)
	}
	defer func() {
		if err := kreuzberg.UnregisterPostProcessor("pdf_only_processor"); err != nil {
			log.Printf("warning: failed to unregister post-processor: %v", err)
		}
	}()

	// Process multiple documents - processor will only affect PDFs
	for _, path := range []string{"document.pdf", "image.jpg", "spreadsheet.xlsx"} {
		extracted, err := kreuzberg.ExtractFileSync(path, nil)
		if err != nil {
			log.Printf("Warning: extraction failed for %s: %v", path, err)
			continue
		}

		// Metadata that is absent or unparsable produces no output at all.
		rawMeta, isString := extracted.MetadataJSON.(string)
		if !isString {
			continue
		}
		var meta map[string]interface{}
		if json.Unmarshal([]byte(rawMeta), &meta) != nil {
			continue
		}

		if applied, _ := meta["pdf_specific_processing"].(bool); !applied {
			log.Printf("Skipped PDF processor for: %s (MIME: %s)", path, extracted.MimeType)
			continue
		}
		log.Printf("PDF-specific processing applied to: %s", path)
		if tables, hasCount := meta["table_count"].(float64); hasCount {
			log.Printf(" Tables found: %.0f", tables)
		}
	}
}
```

View File

@@ -0,0 +1,13 @@
<!-- snippet:skip reason="Go bindings do not support custom document extractor plugins" -->
```markdown title="Markdown"
!!! note "Not Supported"
The Go binding is a thin CGO wrapper and does not currently support
custom document extractors. Custom plugins must be implemented in Rust.
See the [Rust plugin documentation](../../rust/plugins/plugin_extractor.md) for details on creating custom document extractors.
Go currently supports:
- **PostProcessor** - Transform extraction results
- **Validator** - Validate extraction results
- **OcrBackend** - Custom OCR implementations
```

View File

@@ -0,0 +1,92 @@
```go title="Go"
package main
import (
"C"
"encoding/json"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// loggingPostProcessor is the exported post-processor callback that only
// observes extraction results: it logs the content size (warning when the
// content is empty) and the MIME type, and never modifies the result.
//
// resultJSON is the extraction result serialized as a JSON object. The
// callback always returns nil, including when the input cannot be parsed.
//
//export loggingPostProcessor
func loggingPostProcessor(resultJSON *C.char) *C.char {
	log.Println("[PostProcessor] Processing extraction result")

	var doc map[string]interface{}
	if err := json.Unmarshal([]byte(C.GoString(resultJSON)), &doc); err != nil {
		log.Printf("[PostProcessor] Error parsing result: %v", err)
		return nil
	}

	if body, isText := doc["content"].(string); isText {
		log.Printf("[PostProcessor] Content length: %d bytes", len(body))
		if body == "" {
			log.Println("[PostProcessor] Warning: Empty content extracted")
		}
	}

	if kind, isText := doc["mime_type"].(string); isText {
		log.Printf("[PostProcessor] Processing %s", kind)
	}

	// Return NULL to indicate success (no modification)
	return nil
}
// loggingValidator is the exported validator callback that logs each
// validation step and rejects results whose content is shorter than 50 bytes.
//
// resultJSON is the extraction result serialized as a JSON object. The return
// value is nil when validation passes (including when "content" is absent or
// not a string), otherwise a C string describing the failure.
// NOTE(review): the rejection message says "characters" while the check uses
// len(), which counts bytes — confirm which unit is intended.
//
//export loggingValidator
func loggingValidator(resultJSON *C.char) *C.char {
	const minBytes = 50

	log.Println("[Validator] Validating extraction result")

	var doc map[string]interface{}
	if err := json.Unmarshal([]byte(C.GoString(resultJSON)), &doc); err != nil {
		log.Printf("[Validator] Error parsing result: %v", err)
		return C.CString("Failed to parse validation input")
	}

	body, isText := doc["content"].(string)
	if !isText {
		// Return NULL to indicate validation passed
		return nil
	}

	log.Printf("[Validator] Content length: %d bytes", len(body))
	if len(body) < minBytes {
		log.Println("[Validator] Error: Content below minimum threshold")
		return C.CString("Content too short (minimum 50 characters)")
	}

	// Return NULL to indicate validation passed
	return nil
}
// main registers the logging post-processor (priority 100) and the logging
// validator (priority 50), then extracts document.pdf so both callbacks run,
// and logs the size of the extracted content. Any failure aborts the program.
func main() {
	// Post processor with logging.
	processorErr := kreuzberg.RegisterPostProcessor(
		"logging-processor",
		100, // priority
		(C.PostProcessorCallback)(C.loggingPostProcessor),
	)
	if processorErr != nil {
		log.Fatalf("register post processor failed: %v", processorErr)
	}
	log.Println("[Main] PostProcessor registered with logging enabled")

	// Validator with logging.
	validatorErr := kreuzberg.RegisterValidator(
		"logging-validator",
		50, // priority
		(C.ValidatorCallback)(C.loggingValidator),
	)
	if validatorErr != nil {
		log.Fatalf("register validator failed: %v", validatorErr)
	}
	log.Println("[Main] Validator registered with logging enabled")

	// Extract with logging.
	extracted, extractErr := kreuzberg.ExtractFileSync("document.pdf", nil)
	if extractErr != nil {
		log.Fatalf("extract failed: %v", extractErr)
	}
	log.Printf("[Main] Extraction complete: %d bytes content", len(extracted.Content))
}
```

View File

@@ -0,0 +1,213 @@
```go title="Go"
package main
import (
"C"
"encoding/json"
"fmt"
"testing"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// TestPostProcessor tests custom post processor behavior: it registers a
// processor that records metrics, invokes it directly on a mock extraction
// result, and checks the recorded metrics.
//
// The expected content length is derived from the mock content itself; the
// sample text "Test extraction content" is 23 bytes long. The processor is
// unregistered through t.Cleanup so the registry is restored even when the
// test aborts via t.Fatalf.
func TestPostProcessor(t *testing.T) {
	// content is the text placed in the mock result and measured by the processor.
	const content = "Test extraction content"

	// Create a post processor that adds metadata
	metricsMap := make(map[string]int64)

	// NOTE(review): cgo's //export applies to package-level functions; confirm
	// that C.testPostProcessor below resolves for this local closure.
	//export testPostProcessor
	testPostProcessor := func(resultJSON *C.char) *C.char {
		jsonStr := C.GoString(resultJSON)
		var result map[string]interface{}
		if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
			return nil
		}

		if content, ok := result["content"].(string); ok {
			metricsMap["content_length"] = int64(len(content))
			metricsMap["processed"] = 1
		}

		return nil
	}

	// Register the processor
	err := kreuzberg.RegisterPostProcessor(
		"test-processor",
		10,
		(C.PostProcessorCallback)(C.testPostProcessor),
	)
	if err != nil {
		t.Fatalf("Failed to register post processor: %v", err)
	}
	// Cleanup runs on every exit path, including t.Fatalf.
	t.Cleanup(func() {
		if err := kreuzberg.UnregisterPostProcessor("test-processor"); err != nil {
			t.Logf("unregister test-processor: %v", err)
		}
	})

	// Simulate a mock result
	mockResult := map[string]interface{}{
		"content":            content,
		"mime_type":          "text/plain",
		"metadata":           map[string]interface{}{},
		"tables":             []interface{}{},
		"detected_languages": []interface{}{},
	}

	resultJSON, err := json.Marshal(mockResult)
	if err != nil {
		t.Fatalf("Failed to marshal mock result: %v", err)
	}

	cResultJSON := C.CString(string(resultJSON))
	defer C.free(unsafe.Pointer(cResultJSON))

	// Call the processor
	testPostProcessor(cResultJSON)

	// Verify metrics were recorded
	if got, want := metricsMap["content_length"], int64(len(content)); got != want {
		t.Errorf("Expected content_length %d, got %d", want, got)
	}
	if metricsMap["processed"] != 1 {
		t.Errorf("Expected processed flag to be 1")
	}
}
// TestValidator tests custom validator behavior: it registers a validator
// that rejects content shorter than 10 bytes, then invokes it directly with
// one valid and one too-short mock result.
//
// Error messages returned by the validator are allocated with C.CString, so
// the test frees each non-nil return value. The validator is unregistered
// through t.Cleanup so the registry is restored even when the test aborts via
// t.Fatalf.
func TestValidator(t *testing.T) {
	validatorCalled := false

	// NOTE(review): cgo's //export applies to package-level functions; confirm
	// that C.testValidator below resolves for this local closure.
	//export testValidator
	testValidator := func(resultJSON *C.char) *C.char {
		validatorCalled = true

		jsonStr := C.GoString(resultJSON)
		var result map[string]interface{}
		if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
			return C.CString("Failed to parse validation input")
		}

		if content, ok := result["content"].(string); ok {
			if len(content) < 10 {
				return C.CString("Content too short")
			}
		}

		return nil // Success
	}

	// Register the validator
	err := kreuzberg.RegisterValidator(
		"test-validator",
		50,
		(C.ValidatorCallback)(C.testValidator),
	)
	if err != nil {
		t.Fatalf("Failed to register validator: %v", err)
	}
	// Cleanup runs on every exit path, including t.Fatalf.
	t.Cleanup(func() {
		if err := kreuzberg.UnregisterValidator("test-validator"); err != nil {
			t.Logf("unregister test-validator: %v", err)
		}
	})

	// Test 1: Valid content
	validContent := map[string]interface{}{
		"content":            "This is valid content",
		"mime_type":          "text/plain",
		"metadata":           map[string]interface{}{},
		"tables":             []interface{}{},
		"detected_languages": []interface{}{},
	}

	validJSON, err := json.Marshal(validContent)
	if err != nil {
		t.Fatalf("Failed to marshal valid content: %v", err)
	}

	cValidJSON := C.CString(string(validJSON))
	defer C.free(unsafe.Pointer(cValidJSON))

	if result := testValidator(cValidJSON); result != nil {
		t.Errorf("Expected nil (success), got error: %s", C.GoString(result))
		C.free(unsafe.Pointer(result)) // the message was allocated by C.CString
	}

	if !validatorCalled {
		t.Errorf("Validator was not called")
	}

	// Test 2: Invalid content (too short)
	invalidContent := map[string]interface{}{
		"content":            "Short",
		"mime_type":          "text/plain",
		"metadata":           map[string]interface{}{},
		"tables":             []interface{}{},
		"detected_languages": []interface{}{},
	}

	invalidJSON, err := json.Marshal(invalidContent)
	if err != nil {
		t.Fatalf("Failed to marshal invalid content: %v", err)
	}

	cInvalidJSON := C.CString(string(invalidJSON))
	defer C.free(unsafe.Pointer(cInvalidJSON))

	result := testValidator(cInvalidJSON)
	if result == nil {
		t.Errorf("Expected error for short content, got nil")
		return
	}
	defer C.free(unsafe.Pointer(result)) // the message was allocated by C.CString

	if errorMsg := C.GoString(result); errorMsg != "Content too short" {
		t.Errorf("Expected 'Content too short', got: %s", errorMsg)
	}
}
// TestValidatorIntegration tests validator registration: it registers a
// validator that requires a "mime_type" field and verifies that the validator
// appears in the list of registered validators.
//
// The validator is unregistered through t.Cleanup so the registry is restored
// even when the test aborts via t.Fatalf.
func TestValidatorIntegration(t *testing.T) {
	// NOTE(review): cgo's //export applies to package-level functions; confirm
	// that C.integrationValidator below resolves for this local closure.
	//export integrationValidator
	integrationValidator := func(resultJSON *C.char) *C.char {
		jsonStr := C.GoString(resultJSON)
		var result map[string]interface{}
		if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
			return C.CString(fmt.Sprintf("Parse error: %v", err))
		}

		// Validate that mime_type is set
		if _, ok := result["mime_type"]; !ok {
			return C.CString("Missing mime_type in result")
		}

		return nil
	}

	// Register validator
	err := kreuzberg.RegisterValidator(
		"integration-validator",
		100,
		(C.ValidatorCallback)(C.integrationValidator),
	)
	if err != nil {
		t.Fatalf("Failed to register validator: %v", err)
	}
	// Cleanup runs on every exit path, including t.Fatalf.
	t.Cleanup(func() {
		if err := kreuzberg.UnregisterValidator("integration-validator"); err != nil {
			t.Logf("unregister integration-validator: %v", err)
		}
	})

	// The validator will be called automatically during extraction
	// This test verifies the registration was successful
	validators, err := kreuzberg.ListValidators()
	if err != nil {
		t.Fatalf("Failed to list validators: %v", err)
	}

	found := false
	for _, v := range validators {
		if v == "integration-validator" {
			found = true
			break
		}
	}

	if !found {
		t.Errorf("Validator not found in registered validators list")
	}
}
```

View File

@@ -0,0 +1,35 @@
```go title="Go"
package main
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
import (
"log"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// customValidator is a stub validator callback exported for registration
// with kreuzberg. A real implementation would inspect resultJSON and return
// an error message, or NULL to accept the result; this stub accepts
// everything.
//
//export customValidator
func customValidator(resultJSON *C.char) *C.char {
	// Inspect resultJSON, return error message or NULL
	return nil
}
func main() {
if err := kreuzberg.RegisterValidator("go-validator", 50, (C.ValidatorCallback)(C.customValidator)); err != nil {
log.Fatalf("register validator failed: %v", err)
}
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Printf("Content length: %d", len(result.Content))
}
```

View File

@@ -0,0 +1,77 @@
```go title="Go"
package main
import (
"encoding/json"
"fmt"
"log"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// qualityThreshold is the minimum acceptable quality score. The quality score
// validator rejects results scoring below it and reports both values
// multiplied by 100, as percentages.
const qualityThreshold = 0.5
// qualityScoreValidator is the exported validator callback that rejects
// extraction results whose quality score is below qualityThreshold.
//
// resultJSON is the extraction result serialized as a JSON object. The return
// value is nil when validation passes, otherwise a C string (allocated with
// C.CString) describing the failure. Results without a "metadata" object skip
// the quality check and pass.
//
//export qualityScoreValidator
func qualityScoreValidator(resultJSON *C.char) *C.char {
	jsonStr := C.GoString(resultJSON)
	var result map[string]interface{}
	if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
		return C.CString("Failed to parse result JSON")
	}

	// No metadata is not an error, just skip the quality check. Only the
	// presence of the object matters, so its value is discarded; binding it
	// to an unused variable would not compile.
	if _, ok := result["metadata"].(map[string]interface{}); !ok {
		return nil
	}

	// Get quality score from result; a missing or non-numeric score counts as 0.
	// NOTE(review): the score is read from the top level of the result, not
	// from the metadata object — confirm this matches the result schema.
	qualityScore, _ := result["quality_score"].(float64)

	// Validate against threshold
	if qualityScore < qualityThreshold {
		errMsg := fmt.Sprintf("Quality score too low: %.0f%% < %.0f%%", qualityScore*100, qualityThreshold*100)
		return C.CString(errMsg)
	}

	// Validation passed
	return nil
}
// main registers the quality score validator, extracts document.pdf (which
// runs the validator), and logs the MIME type of the accepted result.
//
// The validator is unregistered on every exit path after registration. The
// failure path calls the cleanup explicitly because log.Fatalf terminates the
// process via os.Exit, which does not run deferred functions.
func main() {
	// Register the validator with priority 50
	if err := kreuzberg.RegisterValidator("quality_score_validator", 50,
		(C.ValidatorCallback)(C.qualityScoreValidator)); err != nil {
		log.Fatalf("failed to register validator: %v", err)
	}
	unregister := func() {
		if err := kreuzberg.UnregisterValidator("quality_score_validator"); err != nil {
			log.Printf("warning: failed to unregister validator: %v", err)
		}
	}

	// Extract and validate
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		unregister() // a deferred call would be skipped by log.Fatalf
		log.Fatalf("extraction or validation failed: %v", err)
	}
	defer unregister()
	log.Printf("Quality validation passed for: %s", result.MimeType)
}
```

View File

@@ -0,0 +1,165 @@
```go title="Go"
package main
import (
"encoding/json"
"log"
"sync"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// PluginState is the mutable state of the stateful plugin. All fields other
// than mu must only be accessed while mu is held.
type PluginState struct {
	mu sync.Mutex // guards every field below

	callCount    int               // number of plugin invocations recorded
	cache        map[string]string // entries keyed by MIME type
	lastMimeType string            // MIME type recorded most recently
}

// globalState is the single PluginState instance that persists across plugin
// calls. It starts with an empty, non-nil cache.
var globalState = &PluginState{cache: map[string]string{}}
// statefulPlugin is the exported post-processor callback that demonstrates a
// plugin with persistent, mutex-guarded state.
//
// resultJSON is the extraction result serialized as a JSON object. Each call
// increments the call counter, records the result's MIME type (when it is a
// string), and writes the current counters into the result's "metadata"
// object, which is created when absent.
//
// The return value is a C string holding the updated JSON, or a JSON object
// with an "error" key when parsing or serialization fails.
//
//export statefulPlugin
func statefulPlugin(resultJSON *C.char) *C.char {
	var doc map[string]interface{}
	if json.Unmarshal([]byte(C.GoString(resultJSON)), &doc) != nil {
		return C.CString("{\"error\":\"Failed to parse result JSON\"}")
	}

	// The lock is held until return so the counters written into the
	// metadata form one consistent snapshot.
	state := globalState
	state.mu.Lock()
	defer state.mu.Unlock()

	state.callCount++
	if kind, isString := doc["mime_type"].(string); isString {
		state.lastMimeType = kind
		state.cache[kind] = "processed"
	}

	meta, found := doc["metadata"].(map[string]interface{})
	if !found {
		meta = map[string]interface{}{}
	}
	meta["plugin_call_count"] = state.callCount
	meta["last_mime_type"] = state.lastMimeType
	meta["cached_types_count"] = len(state.cache)
	meta["plugin_version"] = "1.0.0"
	doc["metadata"] = meta

	encoded, err := json.Marshal(doc)
	if err != nil {
		return C.CString("{\"error\":\"Failed to serialize result\"}")
	}
	return C.CString(string(encoded))
}
// GetPluginStats returns a snapshot of the plugin state taken under the state
// lock: the call count, the most recently recorded MIME type, and the MIME
// types present in the cache. The returned slice is a fresh copy in map
// iteration order, which is not deterministic.
func GetPluginStats() (int, string, []string) {
	state := globalState
	state.mu.Lock()
	defer state.mu.Unlock()

	seen := make([]string, 0, len(state.cache))
	for kind := range state.cache {
		seen = append(seen, kind)
	}
	return state.callCount, state.lastMimeType, seen
}
// ResetPluginState restores the plugin state to its initial values under the
// state lock: zero calls, no last MIME type, and a fresh empty cache. It is
// useful for testing.
func ResetPluginState() {
	state := globalState
	state.mu.Lock()
	defer state.mu.Unlock()

	state.callCount, state.lastMimeType = 0, ""
	state.cache = map[string]string{}
}
// main registers the stateful post-processor, extracts several documents to
// show state accumulating across calls, and logs the plugin statistics: once
// after the loop and once more from the deferred cleanup that also
// unregisters the processor. Files that fail to extract are logged and
// skipped.
func main() {
	// Register the stateful post-processor with priority 60
	if err := kreuzberg.RegisterPostProcessor("stateful_plugin", 60,
		(C.PostProcessorCallback)(C.statefulPlugin)); err != nil {
		log.Fatalf("failed to register post-processor: %v", err)
	}
	defer func() {
		if err := kreuzberg.UnregisterPostProcessor("stateful_plugin"); err != nil {
			log.Printf("warning: failed to unregister post-processor: %v", err)
		}

		// Print final statistics
		calls, last, kinds := GetPluginStats()
		log.Printf("Plugin Statistics:")
		log.Printf(" Total calls: %d", calls)
		log.Printf(" Last MIME type: %s", last)
		log.Printf(" Unique MIME types processed: %d", len(kinds))
		if len(kinds) > 0 {
			log.Printf(" Processed types: %v", kinds)
		}
	}()

	// Process multiple documents to demonstrate state accumulation
	for _, path := range []string{"document1.pdf", "document2.pdf", "image.png", "document3.txt"} {
		log.Printf("Processing: %s", path)

		extracted, err := kreuzberg.ExtractFileSync(path, nil)
		if err != nil {
			log.Printf(" Warning: extraction failed: %v", err)
			continue
		}

		// Metadata that is absent or unparsable produces no output.
		rawMeta, isString := extracted.MetadataJSON.(string)
		if !isString {
			continue
		}
		var meta map[string]interface{}
		if json.Unmarshal([]byte(rawMeta), &meta) != nil {
			continue
		}

		// encoding/json decodes JSON numbers into float64 for interface{} targets.
		if calls, present := meta["plugin_call_count"].(float64); present {
			log.Printf(" Plugin call count: %.0f", calls)
		}
		if cached, present := meta["cached_types_count"].(float64); present {
			log.Printf(" Cached MIME types: %.0f", cached)
		}
	}

	// Demonstrate thread-safe state access
	calls, last, kinds := GetPluginStats()
	log.Printf("\nFinal Plugin State:")
	log.Printf(" Total calls: %d", calls)
	log.Printf(" Last MIME type: %s", last)
	log.Printf(" Processed MIME types: %v", kinds)
}
```

View File

@@ -0,0 +1,55 @@
```go title="Go"
package main
import (
"errors"
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main unregisters one plugin of each kind by name: a document extractor, a
// post-processor, an OCR backend, and a validator. A failure that is a
// *kreuzberg.ValidationError is logged and tolerated; any other failure
// aborts the program.
func main() {
	// tolerate applies the shared error policy to one unregister result:
	// validation errors are logged, everything else is fatal with fatalFmt.
	tolerate := func(err error, fatalFmt string) {
		if err == nil {
			return
		}
		var validErr *kreuzberg.ValidationError
		if errors.As(err, &validErr) {
			log.Printf("validation error: %v", err)
			return
		}
		log.Fatalf(fatalFmt, err)
	}

	// Unregister a specific document extractor
	tolerate(kreuzberg.UnregisterDocumentExtractor("custom-json-extractor"), "unregister document extractor: %v")

	// Unregister a specific post-processor
	tolerate(kreuzberg.UnregisterPostProcessor("word_count"), "unregister post processor: %v")

	// Unregister a specific OCR backend
	tolerate(kreuzberg.UnregisterOCRBackend("cloud-ocr"), "unregister OCR backend: %v")

	// Unregister a specific validator
	tolerate(kreuzberg.UnregisterValidator("min_length_validator"), "unregister validator: %v")

	fmt.Println("Plugins unregistered successfully")
}
```

View File

@@ -0,0 +1,90 @@
```go title="Go"
package main
import (
"encoding/json"
"log"
"strings"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// wordCountProcessor is the exported post-processor callback that adds a
// "word_count" entry to the metadata of an extraction result.
//
// resultJSON is the extraction result serialized as a JSON object. Words are
// counted by splitting the "content" string on whitespace; the "metadata"
// object is created when absent.
//
// The return value is a C string holding the updated JSON, or a JSON object
// with an "error" key when parsing fails, "content" is not a string, or
// serialization fails.
//
//export wordCountProcessor
func wordCountProcessor(resultJSON *C.char) *C.char {
	var doc map[string]interface{}
	if json.Unmarshal([]byte(C.GoString(resultJSON)), &doc) != nil {
		return C.CString("{\"error\":\"Failed to parse result JSON\"}")
	}

	body, isText := doc["content"].(string)
	if !isText {
		return C.CString("{\"error\":\"Missing content field\"}")
	}

	meta, found := doc["metadata"].(map[string]interface{})
	if !found {
		meta = map[string]interface{}{}
	}

	// strings.Fields splits on runs of whitespace, so repeated spaces do not
	// produce empty words.
	meta["word_count"] = len(strings.Fields(body))
	doc["metadata"] = meta

	encoded, err := json.Marshal(doc)
	if err != nil {
		return C.CString("{\"error\":\"Failed to serialize result\"}")
	}
	return C.CString(string(encoded))
}
// main registers the word count post-processor, extracts document.pdf, and
// logs the word count the processor stored in the result metadata.
//
// The processor is unregistered on every exit path after registration. The
// failure path calls the cleanup explicitly because log.Fatalf terminates the
// process via os.Exit, which does not run deferred functions.
func main() {
	// Register the post-processor with priority 100, early stage
	if err := kreuzberg.RegisterPostProcessor("word_count_processor", 100,
		(C.PostProcessorCallback)(C.wordCountProcessor)); err != nil {
		log.Fatalf("failed to register post-processor: %v", err)
	}
	unregister := func() {
		if err := kreuzberg.UnregisterPostProcessor("word_count_processor"); err != nil {
			log.Printf("warning: failed to unregister post-processor: %v", err)
		}
	}

	// Extract document
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		unregister() // a deferred call would be skipped by log.Fatalf
		log.Fatalf("extraction failed: %v", err)
	}
	defer unregister()

	// Access word count from metadata; anything unparsable is silently skipped.
	metaJSON, ok := result.MetadataJSON.(string)
	if !ok {
		return
	}
	var metadata map[string]interface{}
	if err := json.Unmarshal([]byte(metaJSON), &metadata); err != nil {
		return
	}
	// encoding/json decodes JSON numbers into float64 for interface{} targets.
	if wordCount, ok := metadata["word_count"].(float64); ok {
		log.Printf("Word count: %.0f", wordCount)
	}
}
```