Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main chunks document.pdf and prints a short preview of every chunk together
// with the page (or page range) the chunk was taken from.
func main() {
	maxChars := uint(500)
	overlap := uint(50)
	config := &kreuzberg.ExtractionConfig{
		Chunking: &kreuzberg.ChunkingConfig{
			MaxCharacters: &maxChars,
			Overlap:       &overlap,
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		log.Fatal(err)
	}

	for _, chunk := range result.Chunks {
		first, last := chunk.Metadata.FirstPage, chunk.Metadata.LastPage
		if first == nil {
			// Chunks without page information are not printed.
			continue
		}

		pageRange := fmt.Sprintf("Page %d", *first)
		if last != nil && *first != *last {
			pageRange = fmt.Sprintf("Pages %d-%d", *first, *last)
		}

		// Truncate on rune boundaries: slicing the string by bytes could cut a
		// multi-byte UTF-8 character in half and print invalid text.
		preview := chunk.Content
		if runes := []rune(preview); len(runes) > 50 {
			preview = string(runes[:50])
		}
		fmt.Printf("Chunk: %s... (%s)\n", preview, pageRange)
	}
}
```

View File

@@ -0,0 +1,27 @@
```go title="Go"
package main
import (
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main assembles an extraction config whose chunking step also requests
// embeddings from the "all-minilm-l6-v2" preset. The config is only built
// here; no extraction is run with it.
func main() {
	var (
		chunkSize        = 1000
		chunkOverlap     = 200
		normalizeVectors = true
		embedBatch       = int32(32)
	)

	embedding := &kreuzberg.EmbeddingConfig{
		Model:     kreuzberg.EmbeddingModelType_Preset("all-minilm-l6-v2"),
		Normalize: &normalizeVectors,
		BatchSize: &embedBatch,
	}
	chunking := &kreuzberg.ChunkingConfig{
		MaxChars:   &chunkSize,
		MaxOverlap: &chunkOverlap,
		Embedding:  embedding,
	}
	config := &kreuzberg.ExtractionConfig{Chunking: chunking}

	// Discard the value so the example compiles without using the config.
	_ = config
}
```

View File

@@ -0,0 +1,46 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts research_paper.pdf into embedded chunks and prints the number
// of chunks plus a short preview of (at most) the first three.
func main() {
	maxChars := 500
	maxOverlap := 50
	normalize := true
	batchSize := int32(16)
	config := &kreuzberg.ExtractionConfig{
		Chunking: &kreuzberg.ChunkingConfig{
			MaxChars:   &maxChars,
			MaxOverlap: &maxOverlap,
			Embedding: &kreuzberg.EmbeddingConfig{
				Model:     kreuzberg.EmbeddingModelType_Preset("all-mpnet-base-v2"),
				Normalize: &normalize,
				BatchSize: &batchSize,
			},
		},
	}

	result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
	if err != nil {
		log.Fatalf("RAG extraction failed: %v", err)
	}

	chunks := result.Chunks
	fmt.Printf("Found %d chunks for RAG pipeline\n", len(chunks))
	for i := 0; i < len(chunks) && i < 3; i++ {
		// Truncate on rune boundaries so a multi-byte UTF-8 character is never
		// split in the middle, which byte slicing could do.
		content := chunks[i].Content
		if runes := []rune(content); len(runes) > 80 {
			content = string(runes[:80])
		}
		fmt.Printf("Chunk %d: %s...\n", i, content)
	}
}
```

View File

@@ -0,0 +1,49 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf into chunks with embeddings and prints, for each
// chunk, a generated ID, a short content preview and the embedding length.
func main() {
	maxChars := 512
	maxOverlap := 50
	normalize := true
	batchSize := int32(32)
	showProgress := false
	config := &kreuzberg.ExtractionConfig{
		Chunking: &kreuzberg.ChunkingConfig{
			MaxChars:   &maxChars,
			MaxOverlap: &maxOverlap,
			Embedding: &kreuzberg.EmbeddingConfig{
				Model:                kreuzberg.EmbeddingModelType_Preset("balanced"),
				Normalize:            &normalize,
				BatchSize:            &batchSize,
				ShowDownloadProgress: &showProgress,
			},
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		fmt.Printf("Error: %v\n", err)
		return
	}

	for index, chunk := range result.Chunks {
		chunkID := fmt.Sprintf("doc_chunk_%d", index)

		// Truncate on rune boundaries; byte slicing could split a multi-byte
		// UTF-8 character and print invalid text.
		content := chunk.Content
		if runes := []rune(content); len(runes) > 50 {
			content = string(runes[:50])
		}
		fmt.Printf("Chunk %s: %s\n", chunkID, content)

		// len of a nil slice is 0, so no separate nil check is needed.
		if len(chunk.Embedding) > 0 {
			fmt.Printf(" Embedding dimensions: %d\n", len(chunk.Embedding))
		}
	}
}
```

View File

@@ -0,0 +1,23 @@
```go title="Go"
package main
import (
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds an extraction config that enables YAKE keyword extraction with
// a keyword limit, a minimum score and a language. The config is only
// constructed, not used.
func main() {
	var (
		keywordLimit = int32(10)
		scoreFloor   = 0.3
		lang         = "en"
	)

	keywords := &kreuzberg.KeywordConfig{
		Algorithm:   kreuzberg.KeywordAlgorithm_YAKE,
		MaxKeywords: &keywordLimit,
		MinScore:    &scoreFloor,
		Language:    &lang,
	}
	config := &kreuzberg.ExtractionConfig{Keywords: keywords}

	// Discard the value so the example compiles without using the config.
	_ = config
}
```

View File

@@ -0,0 +1,37 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main runs YAKE keyword extraction on research_paper.pdf and prints every
// keyword found in the result metadata together with its score.
func main() {
	maxKeywords := int32(10)
	minScore := 0.3
	config := &kreuzberg.ExtractionConfig{
		Keywords: &kreuzberg.KeywordConfig{
			Algorithm:   kreuzberg.KeywordAlgorithm_YAKE,
			MaxKeywords: &maxKeywords,
			MinScore:    &minScore,
		},
	}

	result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
	if err != nil {
		log.Fatalf("extraction failed: %v", err)
	}

	keywords, ok := result.Metadata["keywords"]
	if !ok {
		return
	}

	// Use checked type assertions throughout: an unchecked assertion panics
	// when the metadata value does not have the expected dynamic type.
	keywordList, ok := keywords.([]map[string]interface{})
	if !ok {
		log.Fatalf("unexpected type for keywords metadata: %T", keywords)
	}
	for _, kw := range keywordList {
		text, textOK := kw["text"].(string)
		score, scoreOK := kw["score"].(float64)
		if !textOK || !scoreOK {
			// Skip malformed entries instead of crashing the program.
			continue
		}
		fmt.Printf("%s: %.3f\n", text, score)
	}
}
```

View File

@@ -0,0 +1,22 @@
```go title="Go"
package main
import (
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds an extraction config with language detection switched on,
// a 0.8 minimum confidence and multi-language detection switched off.
// The config is only constructed, not used.
func main() {
	var (
		detectionOn     = true
		multiLanguage   = false
		confidenceFloor = 0.8
	)

	detection := &kreuzberg.LanguageDetectionConfig{
		Enabled:        &detectionOn,
		MinConfidence:  &confidenceFloor,
		DetectMultiple: &multiLanguage,
	}
	config := &kreuzberg.ExtractionConfig{LanguageDetection: detection}

	// Discard the value so the example compiles without using the config.
	_ = config
}
```

View File

@@ -0,0 +1,40 @@
```go title="Go"
package main
import (
"fmt"
"log"
"strings"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts multilingual_document.pdf with multi-language detection
// enabled and reports the detected languages, the content length and the
// MIME type of the result.
func main() {
	var (
		detectionOn     = true
		multiLanguage   = true
		confidenceFloor = 0.8
	)
	config := &kreuzberg.ExtractionConfig{
		LanguageDetection: &kreuzberg.LanguageDetectionConfig{
			Enabled:        &detectionOn,
			MinConfidence:  &confidenceFloor,
			DetectMultiple: &multiLanguage,
		},
	}

	result, err := kreuzberg.ExtractFileSync("multilingual_document.pdf", config)
	if err != nil {
		log.Fatalf("Processing failed: %v", err)
	}

	if detected := result.DetectedLanguages; len(detected) == 0 {
		fmt.Println("No languages detected")
	} else {
		joined := strings.Join(detected, ", ")
		fmt.Printf("Detected %d language(s): %s\n", len(detected), joined)
	}

	fmt.Printf("Total content: %d characters\n", len(result.Content))
	fmt.Printf("MIME type: %s\n", result.MimeType)
}
```

View File

@@ -0,0 +1,16 @@
```go title="Go"
package main
import (
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds an extraction config with quality processing enabled.
// The config is only constructed, not used.
func main() {
	qualityOn := true
	config := &kreuzberg.ExtractionConfig{EnableQualityProcessing: &qualityOn}

	// Discard the value so the example compiles without using the config.
	_ = config
}
```

View File

@@ -0,0 +1,35 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts scanned_document.pdf with quality processing enabled and
// prints the quality score, warning when it is below 0.5. A missing score is
// treated as 0.
func main() {
	qualityOn := true
	config := &kreuzberg.ExtractionConfig{EnableQualityProcessing: &qualityOn}

	result, err := kreuzberg.ExtractFileSync("scanned_document.pdf", config)
	if err != nil {
		log.Fatalf("extraction failed: %v", err)
	}

	var score float64 // stays 0 when the result carries no score
	if reported := result.QualityScore; reported != nil {
		score = *reported
	}

	switch {
	case score < 0.5:
		fmt.Printf("Warning: Low quality extraction (%.2f)\n", score)
		fmt.Println("Consider re-scanning with higher DPI or adjusting OCR settings")
	default:
		fmt.Printf("Quality score: %.2f\n", score)
	}
}
```

View File

@@ -0,0 +1,24 @@
```go title="Go"
package main
import (
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds an extraction config with "moderate" token reduction that
// preserves markdown and code and passes "eng" as the language hint.
// The config is only constructed, not used.
func main() {
	var (
		keepMarkdown  = true
		keepCode      = true
		reductionMode = "moderate"
		hint          = "eng"
	)

	reduction := &kreuzberg.TokenReductionConfig{
		Mode:             &reductionMode,
		PreserveMarkdown: &keepMarkdown,
		PreserveCode:     &keepCode,
		LanguageHint:     &hint,
	}
	config := &kreuzberg.ExtractionConfig{TokenReduction: reduction}

	// Discard the value so the example compiles without using the config.
	_ = config
}
```

View File

@@ -0,0 +1,46 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts verbose_document.pdf with "moderate" token reduction and
// prints the original/reduced token counts and the reduction ratio read from
// the result metadata. Missing or non-numeric metadata values count as 0.
func main() {
	preserveMarkdown := true
	mode := "moderate"
	config := &kreuzberg.ExtractionConfig{
		TokenReduction: &kreuzberg.TokenReductionConfig{
			Mode:             &mode,
			PreserveMarkdown: &preserveMarkdown,
		},
	}

	result, err := kreuzberg.ExtractFileSync("verbose_document.pdf", config)
	if err != nil {
		log.Fatalf("extraction failed: %v", err)
	}

	// number reads a numeric metadata value without panicking. An unchecked
	// assertion such as val.(int) crashes when the value is stored with a
	// different numeric type, so the common numeric types are all accepted.
	number := func(key string) float64 {
		switch v := result.Metadata[key].(type) {
		case int:
			return float64(v)
		case int64:
			return float64(v)
		case uint64:
			return float64(v)
		case float64:
			return v
		default:
			return 0
		}
	}

	original := int(number("original_token_count"))
	reduced := int(number("token_count"))
	ratio := number("token_reduction_ratio")

	fmt.Printf("Reduced from %d to %d tokens\n", original, reduced)
	fmt.Printf("Reduction: %.1f%%\n", ratio*100)
}
```

View File

@@ -0,0 +1,67 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// VectorRecord is one document chunk prepared for a vector database: an
// identifier, the chunk text, its embedding and string metadata.
type VectorRecord struct {
// ID identifies the record.
ID string
// Embedding is the embedding vector of the chunk.
Embedding []float32
// Content is the chunk text.
Content string
// Metadata holds additional string key/value attributes of the record.
Metadata map[string]string
}
// extractAndVectorize extracts documentPath into embedded chunks, converts
// every chunk into a VectorRecord whose ID is "<documentID>_chunk_<index>",
// hands the records to storeInVectorDatabase and returns them. An extraction
// error is returned unchanged with a nil slice.
func extractAndVectorize(documentPath string, documentID string) ([]VectorRecord, error) {
	chunkSize, chunkOverlap := 512, 50
	normalizeVectors := true
	embedBatch := int32(32)

	cfg := &kreuzberg.ExtractionConfig{
		Chunking: &kreuzberg.ChunkingConfig{
			MaxChars:   &chunkSize,
			MaxOverlap: &chunkOverlap,
			Embedding: &kreuzberg.EmbeddingConfig{
				Model:     kreuzberg.EmbeddingModelType_Preset("balanced"),
				Normalize: &normalizeVectors,
				BatchSize: &embedBatch,
			},
		},
	}

	extraction, err := kreuzberg.ExtractFileSync(documentPath, cfg)
	if err != nil {
		return nil, err
	}

	var records []VectorRecord
	for i, chunk := range extraction.Chunks {
		meta := map[string]string{
			"document_id":    documentID,
			"chunk_index":    fmt.Sprintf("%d", i),
			"content_length": fmt.Sprintf("%d", len(chunk.Content)),
		}
		records = append(records, VectorRecord{
			ID:        fmt.Sprintf("%s_chunk_%d", documentID, i),
			Content:   chunk.Content,
			Embedding: chunk.Embedding,
			Metadata:  meta,
		})
	}

	storeInVectorDatabase(records)
	return records, nil
}
// storeInVectorDatabase prints one line per record that has a non-empty
// embedding, showing the record ID, content length and embedding length.
// Records without an embedding are skipped.
func storeInVectorDatabase(records []VectorRecord) {
	for i := range records {
		rec := records[i]
		if len(rec.Embedding) == 0 {
			continue
		}
		fmt.Printf("Storing %s: %d chars, %d dims\n",
			rec.ID, len(rec.Content), len(rec.Embedding))
	}
}
```

View File

@@ -0,0 +1,27 @@
```go title="Go"
package main
import (
"log"
"os"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
func main() {
doc1, _ := os.ReadFile("doc1.pdf")
doc2, _ := os.ReadFile("doc2.docx")
items := []kreuzberg.BatchBytesItem{
{Content: doc1, MimeType: "application/pdf"},
{Content: doc2, MimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document"},
}
results, err := kreuzberg.BatchExtractBytesSync(items, kreuzberg.ExtractionConfig{})
if err != nil {
log.Fatalf("batch extraction failed: %v", err)
}
println("Processed", len(results), "documents")
}
```

View File

@@ -0,0 +1,26 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main extracts three files in one batch call and prints the content length
// of every result.
func main() {
	paths := []string{"doc1.pdf", "doc2.docx", "doc3.pptx"}

	var items []kreuzberg.BatchFileItem
	for _, p := range paths {
		items = append(items, kreuzberg.BatchFileItem{Path: p})
	}

	results, err := kreuzberg.BatchExtractFilesSync(items, kreuzberg.ExtractionConfig{})
	if err != nil {
		log.Fatalf("batch extraction failed: %v", err)
	}

	for idx := range results {
		println("Doc", idx, "content length:", len(results[idx].Content))
	}
}
```

View File

@@ -0,0 +1,42 @@
```go title="Go"
package main
import (
"bytes"
"encoding/json"
"io"
"log"
"net/http"
)
// main posts a text-chunking request to the local /chunk endpoint and prints
// the content of every chunk in the JSON response.
func main() {
	client := &http.Client{}
	payload := map[string]interface{}{
		"text":         "Your long text content here...",
		"chunker_type": "text",
		"config": map[string]interface{}{
			"max_characters": 1000,
			"overlap":        50,
			"trim":           true,
		},
	}

	data, err := json.Marshal(payload)
	if err != nil {
		log.Fatalf("encoding request failed: %v", err)
	}

	resp, err := client.Post("http://localhost:8000/chunk", "application/json", bytes.NewBuffer(data))
	if err != nil {
		log.Fatalf("request failed: %v", err)
	}

	// Read the whole body up front so it can be closed before any fatal exit
	// and so a non-200 response can be reported with the server's message.
	body, err := io.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		log.Fatalf("reading response failed: %v", err)
	}
	if resp.StatusCode != http.StatusOK {
		log.Fatalf("server returned %s: %s", resp.Status, body)
	}

	var result map[string]interface{}
	if err := json.Unmarshal(body, &result); err != nil {
		log.Fatalf("decoding response failed: %v", err)
	}

	// Checked assertions: an unexpected response shape must not panic.
	chunks, ok := result["chunks"].([]interface{})
	if !ok {
		log.Fatalf("response does not contain a \"chunks\" array")
	}
	log.Printf("Created %d chunks", len(chunks))

	for _, chunk := range chunks {
		c, ok := chunk.(map[string]interface{})
		if !ok {
			continue
		}
		content, ok := c["content"].(string)
		if !ok {
			continue
		}
		println("Chunk content:", content)
	}
}
```

View File

@@ -0,0 +1,34 @@
```go title="Go"
package main
import (
"bytes"
"io"
"log"
"mime/multipart"
"net/http"
"os"
)
// main uploads document.pdf as a multipart form to the local /extract
// endpoint and copies the raw response body to standard output.
func main() {
	file, err := os.Open("document.pdf")
	if err != nil {
		log.Fatalf("failed to open file: %v", err)
	}
	defer file.Close()

	// Build the multipart body. Every step can fail, and an ignored failure
	// here would send a truncated or malformed upload.
	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)
	part, err := writer.CreateFormFile("files", "document.pdf")
	if err != nil {
		log.Fatalf("failed to create form file: %v", err)
	}
	if _, err := io.Copy(part, file); err != nil {
		log.Fatalf("failed to copy file into form: %v", err)
	}
	// Close writes the terminating boundary; it must happen before sending.
	if err := writer.Close(); err != nil {
		log.Fatalf("failed to finalize form: %v", err)
	}

	resp, err := http.Post("http://localhost:8000/extract", writer.FormDataContentType(), body)
	if err != nil {
		log.Fatalf("request failed: %v", err)
	}
	defer resp.Body.Close()

	if _, err := io.Copy(os.Stdout, resp.Body); err != nil {
		log.Printf("failed to read response: %v", err)
	}
}
```

View File

@@ -0,0 +1,35 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main extracts document.pdf with caching, quality processing, Tesseract OCR
// ("eng") and chunking configured, then prints the content length and the
// number of chunks.
func main() {
	// One shared flag backs both boolean options, as both are enabled.
	enabled := true
	chunkSize, chunkOverlap := uint(1000), uint(200)

	ocr := &kreuzberg.OcrConfig{
		Backend:  "tesseract",
		Language: "eng",
	}
	chunking := &kreuzberg.ChunkingConfig{
		MaxCharacters: &chunkSize,
		Overlap:       &chunkOverlap,
	}
	config := kreuzberg.ExtractionConfig{
		UseCache:                &enabled,
		EnableQualityProcessing: &enabled,
		Ocr:                     ocr,
		Chunking:                chunking,
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
	if err != nil {
		log.Fatalf("extraction failed: %v", err)
	}

	println("Content length:", len(result.Content))
	println("Chunks:", len(result.Chunks))
}
```

View File

@@ -0,0 +1,26 @@
```go title="Go"
package main
import (
"errors"
"log"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main tries to extract missing.pdf and logs a message chosen by the kind of
// error returned; on success it prints the extracted content.
func main() {
	result, err := kreuzberg.ExtractFileSync("missing.pdf", nil, kreuzberg.ExtractionConfig{})
	if err == nil {
		println("Content:", result.Content)
		return
	}

	// Classify the failure against the package's sentinel errors.
	switch {
	case errors.Is(err, kreuzberg.ErrIo):
		log.Printf("file not found: %v", err)
	case errors.Is(err, kreuzberg.ErrUnsupportedFormat):
		log.Printf("unsupported format: %v", err)
	default:
		log.Printf("extraction error: %v", err)
	}
}
```

View File

@@ -0,0 +1,43 @@
```go title="Go"
package main
import (
"bytes"
"encoding/json"
"io"
"log"
"mime/multipart"
"net/http"
"os"
)
// main uploads document.pdf to the local /extract endpoint. A non-200
// response is decoded as an error object and reported; a 200 response is
// decoded as JSON and its "content" field is printed.
func main() {
	file, err := os.Open("document.pdf")
	if err != nil {
		log.Fatalf("failed to open file: %v", err)
	}
	defer file.Close()

	// Build the multipart body, checking every step so a failed copy cannot
	// silently produce a truncated upload.
	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)
	part, err := writer.CreateFormFile("files", "document.pdf")
	if err != nil {
		log.Fatalf("failed to create form file: %v", err)
	}
	if _, err := io.Copy(part, file); err != nil {
		log.Fatalf("failed to copy file into form: %v", err)
	}
	if err := writer.Close(); err != nil {
		log.Fatalf("failed to finalize form: %v", err)
	}

	resp, err := http.Post("http://localhost:8000/extract", writer.FormDataContentType(), body)
	if err != nil {
		log.Fatalf("request failed: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		var errResp map[string]string
		if err := json.NewDecoder(resp.Body).Decode(&errResp); err != nil {
			// The error body itself was unreadable; report the status instead.
			log.Fatalf("error: status %s (could not decode error body: %v)", resp.Status, err)
		}
		log.Fatalf("error: %s: %s", errResp["error_type"], errResp["message"])
	}

	var result map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		log.Fatalf("decoding response failed: %v", err)
	}

	// Checked assertion: a missing or non-string field must not panic.
	content, ok := result["content"].(string)
	if !ok {
		log.Fatalf("response does not contain a string \"content\" field")
	}
	println("Success:", content)
}
```

View File

@@ -0,0 +1,24 @@
```go title="Go"
package main
import (
"log"
"os"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main loads document.pdf into memory, extracts it as "application/pdf" with
// ExtractBytes and prints the extracted content.
func main() {
	pdfBytes, readErr := os.ReadFile("document.pdf")
	if readErr != nil {
		log.Fatalf("failed to read file: %v", readErr)
	}

	var defaults kreuzberg.ExtractionConfig // zero-value configuration
	result, extractErr := kreuzberg.ExtractBytes(pdfBytes, "application/pdf", defaults)
	if extractErr != nil {
		log.Fatalf("extraction failed: %v", extractErr)
	}

	println("Content:", result.Content)
}
```

View File

@@ -0,0 +1,24 @@
```go title="Go"
package main
import (
"log"
"os"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main loads document.pdf into memory, extracts it as "application/pdf" with
// ExtractBytesSync and prints the extracted content.
func main() {
	pdfBytes, readErr := os.ReadFile("document.pdf")
	if readErr != nil {
		log.Fatalf("failed to read file: %v", readErr)
	}

	var defaults kreuzberg.ExtractionConfig // zero-value configuration
	result, extractErr := kreuzberg.ExtractBytesSync(pdfBytes, "application/pdf", defaults)
	if extractErr != nil {
		log.Fatalf("extraction failed: %v", extractErr)
	}

	println("Content:", result.Content)
}
```

View File

@@ -0,0 +1,19 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main extracts document.pdf with ExtractFile using a zero-value
// configuration and prints the content and MIME type of the result.
func main() {
	var defaults kreuzberg.ExtractionConfig // zero-value configuration

	result, err := kreuzberg.ExtractFile("document.pdf", nil, defaults)
	if err != nil {
		log.Fatalf("extraction failed: %v", err)
	}

	println("Content:", result.Content)
	println("MIME type:", result.MimeType)
}
```

View File

@@ -0,0 +1,18 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main extracts document.pdf with ExtractFileSync using a zero-value
// configuration and prints the extracted content.
func main() {
	var defaults kreuzberg.ExtractionConfig // zero-value configuration

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, defaults)
	if err != nil {
		log.Fatalf("extraction failed: %v", err)
	}

	println("Content:", result.Content)
}
```

View File

@@ -0,0 +1,85 @@
```go title="simple_benchmark.go"
package main
import (
"fmt"
"sync"
"time"
"kreuzberg"
)
// main times document extraction in three ways and prints the results:
// sequential runs without cache, the same number of runs in parallel
// goroutines, and two consecutive runs on a cache-enabled client.
// Any extraction or client error aborts the program with a panic.
func main() {
	uncached, err := kreuzberg.New(&kreuzberg.ExtractionConfig{
		UseCache: false,
	})
	if err != nil {
		panic(err)
	}
	defer uncached.Close()

	target := "document.pdf"
	runs := 10

	// Phase 1: sequential extractions.
	fmt.Printf("Sync extraction (%d runs):\n", runs)
	began := time.Now()
	for n := 0; n < runs; n++ {
		if _, err := uncached.ExtractFile(target); err != nil {
			panic(err)
		}
	}
	sequentialSecs := time.Since(began).Seconds()
	fmt.Printf(" - Total time: %.3fs\n", sequentialSecs)
	fmt.Printf(" - Average: %.3fs per extraction\n", sequentialSecs/float64(runs))

	// Phase 2: the same number of extractions, one goroutine each.
	fmt.Printf("\nAsync extraction (%d parallel runs):\n", runs)
	began = time.Now()
	var pending sync.WaitGroup
	pending.Add(runs)
	for n := 0; n < runs; n++ {
		go func() {
			defer pending.Done()
			if _, err := uncached.ExtractFile(target); err != nil {
				panic(err)
			}
		}()
	}
	pending.Wait()
	parallelSecs := time.Since(began).Seconds()
	fmt.Printf(" - Total time: %.3fs\n", parallelSecs)
	fmt.Printf(" - Average: %.3fs per extraction\n", parallelSecs/float64(runs))
	fmt.Printf(" - Speedup: %.1fx\n", sequentialSecs/parallelSecs)

	// Phase 3: two extractions on a client created with caching enabled.
	cached, err := kreuzberg.New(&kreuzberg.ExtractionConfig{
		UseCache: true,
	})
	if err != nil {
		panic(err)
	}
	defer cached.Close()

	fmt.Println("\nFirst extraction (populates cache)...")
	began = time.Now()
	if _, err = cached.ExtractFile(target); err != nil {
		panic(err)
	}
	coldSecs := time.Since(began).Seconds()
	fmt.Printf(" - Time: %.3fs\n", coldSecs)

	fmt.Println("Second extraction (from cache)...")
	began = time.Now()
	if _, err = cached.ExtractFile(target); err != nil {
		panic(err)
	}
	warmSecs := time.Since(began).Seconds()
	fmt.Printf(" - Time: %.3fs\n", warmSecs)
	fmt.Printf(" - Cache speedup: %.1fx\n", coldSecs/warmSecs)
}
```

38
docs/snippets/go/cache/disk_cache.go vendored Normal file
View File

@@ -0,0 +1,38 @@
```go title="disk_cache.go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf twice with caching configured (namespace
// "documents", TTL of seven days in seconds) and reports both content
// lengths and whether the two contents are equal.
func main() {
	cacheOn := true
	cacheNamespace := "documents"
	sevenDaysSecs := uint64(7 * 86400)

	config := kreuzberg.ExtractionConfig{
		UseCache:       &cacheOn,
		CacheNamespace: &cacheNamespace,
		CacheTTLSecs:   &sevenDaysSecs,
	}

	fmt.Println("First extraction (will be cached)...")
	first, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	fmt.Printf(" - Content length: %d\n", len(first.Content))

	fmt.Println("\nSecond extraction (from cache)...")
	second, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	fmt.Printf(" - Content length: %d\n", len(second.Content))

	identical := first.Content == second.Content
	fmt.Printf("\nResults are identical: %v\n", identical)
}
```

View File

@@ -0,0 +1,35 @@
```go title="basic_cli.go"
package main
import (
"fmt"
"os/exec"
)
// extractWithCli runs `kreuzberg extract <filePath> --format <outputFormat>`
// and returns what the command wrote to standard output.
//
// Only stdout is returned: mixing stderr into the result would let warnings
// or log lines from the CLI corrupt the extracted text. When the command
// fails, the returned error wraps the underlying error and includes whatever
// the command wrote to standard error.
func extractWithCli(filePath string, outputFormat string) (string, error) {
	cmd := exec.Command("kreuzberg", "extract", filePath, "--format", outputFormat)

	// Output captures stdout and, because cmd.Stderr is unset, stores the
	// command's stderr in the returned *exec.ExitError.
	stdout, err := cmd.Output()
	if err != nil {
		var stderr []byte
		if exitErr, ok := err.(*exec.ExitError); ok {
			stderr = exitErr.Stderr
		}
		return "", fmt.Errorf("CLI error: %w, output: %s", err, stderr)
	}
	return string(stdout), nil
}
// main extracts document.pdf through the CLI twice, once as text and once as
// JSON, and prints the size of each output. Any CLI error causes a panic.
func main() {
	const document = "document.pdf"

	asText, err := extractWithCli(document, "text")
	if err != nil {
		panic(err)
	}
	fmt.Printf("Extracted: %d characters\n", len(asText))

	asJSON, err := extractWithCli(document, "json")
	if err != nil {
		panic(err)
	}
	fmt.Printf("JSON output received: %d bytes\n", len(asJSON))
}
```

View File

@@ -0,0 +1,54 @@
```go title="cli_with_config.go"
package main
import (
"encoding/json"
"fmt"
"os/exec"
)
// ExtractionResult holds the fields read from the CLI's JSON output.
type ExtractionResult struct {
	Content   string   `json:"content"`
	Format    string   `json:"format"`
	Languages []string `json:"languages"`
}

// extractWithConfig runs
// `kreuzberg extract <filePath> --config <configPath> --format json`
// and decodes the command's standard output into an ExtractionResult.
//
// Only stdout is parsed: if stderr were mixed in, any warning or log line
// printed by the CLI would make the JSON invalid. When the command fails, the
// returned error wraps the underlying error and includes the command's
// standard error; a decoding failure is returned as a "JSON parse error".
func extractWithConfig(filePath string, configPath string) (*ExtractionResult, error) {
	cmd := exec.Command(
		"kreuzberg",
		"extract",
		filePath,
		"--config",
		configPath,
		"--format",
		"json",
	)

	// Output captures stdout and, because cmd.Stderr is unset, stores the
	// command's stderr in the returned *exec.ExitError.
	stdout, err := cmd.Output()
	if err != nil {
		var stderr []byte
		if exitErr, ok := err.(*exec.ExitError); ok {
			stderr = exitErr.Stderr
		}
		return nil, fmt.Errorf("CLI error: %w, output: %s", err, stderr)
	}

	var result ExtractionResult
	if err := json.Unmarshal(stdout, &result); err != nil {
		return nil, fmt.Errorf("JSON parse error: %w", err)
	}
	return &result, nil
}
// main extracts document.pdf through the CLI using kreuzberg.toml and prints
// the content length, format and languages of the decoded result. Any error
// causes a panic.
func main() {
	const (
		configFile = "kreuzberg.toml"
		document   = "document.pdf"
	)
	fmt.Printf("Extracting %s with config %s\n", document, configFile)

	extracted, err := extractWithConfig(document, configFile)
	if err != nil {
		panic(err)
	}

	fmt.Printf("Content length: %d\n", len(extracted.Content))
	fmt.Printf("Format: %s\n", extracted.Format)
	fmt.Printf("Languages: %v\n", extracted.Languages)
}
```

View File

@@ -0,0 +1,54 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with OCR, chunking, language detection, caching
// and quality processing configured, then prints a preview of the first
// chunk and the detected languages when present.
func main() {
	maxChars := uint(1000)
	maxOverlap := uint(100)
	useCache := true
	enableQuality := true
	languageDetectionEnabled := true
	config := kreuzberg.ExtractionConfig{
		Ocr: &kreuzberg.OcrConfig{
			Backend:  "tesseract",
			Language: "eng+deu",
		},
		Chunking: &kreuzberg.ChunkingConfig{
			MaxCharacters: &maxChars,
			Overlap:       &maxOverlap,
		},
		LanguageDetection: &kreuzberg.LanguageDetectionConfig{
			Enabled: &languageDetectionEnabled,
			// NOTE(review): Enabled is passed as a pointer while this field
			// is a plain bool — confirm the field type in the installed
			// package version.
			DetectMultiple: true,
		},
		UseCache:                &useCache,
		EnableQualityProcessing: &enableQuality,
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	// Access chunks
	if len(result.Chunks) > 0 {
		// Truncate on rune boundaries; byte slicing could split a multi-byte
		// UTF-8 character and print invalid text.
		snippet := result.Chunks[0].Content
		if runes := []rune(snippet); len(runes) > 100 {
			snippet = string(runes[:100])
		}
		fmt.Printf("First chunk: %s...\n", snippet)
	}

	// Access detected languages
	if len(result.DetectedLanguages) > 0 {
		fmt.Printf("Languages: %v\n", result.DetectedLanguages)
	}
}
```

View File

@@ -0,0 +1,103 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds a chunking configuration (1000 characters, 200 overlap) and
// prints the two values back from the config.
func main() {
	chunkSize, chunkOverlap := uint(1000), uint(200)

	chunking := &kreuzberg.ChunkingConfig{
		MaxCharacters: &chunkSize,
		Overlap:       &chunkOverlap,
	}
	config := kreuzberg.ExtractionConfig{Chunking: chunking}

	fmt.Printf("Config: MaxCharacters=%d, Overlap=%d\n",
		*config.Chunking.MaxCharacters, *config.Chunking.Overlap)
}
```
```go title="Go - Markdown with Heading Context"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.md with the markdown chunker and tokenizer-based
// sizing, then prints each chunk's heading context (when present) followed
// by up to the first 100 characters of its content.
func main() {
	chunkSize, chunkOverlap := uint(500), uint(50)
	tokenizerModel := "Xenova/gpt-4o"
	markdownChunker := kreuzberg.ChunkerTypeMarkdown

	chunking := &kreuzberg.ChunkingConfig{
		MaxCharacters: &chunkSize,
		Overlap:       &chunkOverlap,
		ChunkerType:   &markdownChunker,
		Sizing: kreuzberg.ChunkSizing{
			Type:  "tokenizer",
			Model: &tokenizerModel,
		},
	}
	config := kreuzberg.ExtractionConfig{Chunking: chunking}

	result, err := kreuzberg.ExtractFile("document.md", nil, config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	for _, chunk := range result.Chunks {
		if ctx := chunk.Metadata.HeadingContext; ctx != nil {
			for _, h := range ctx.Headings {
				fmt.Printf("Heading L%d: %s\n", h.Level, h.Text)
			}
		}
		fmt.Printf("Content: %.100s...\n", chunk.Content)
	}
}
```
```go title="Go - Prepend Heading Context"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.md with the markdown chunker and
// PrependHeadingContext enabled, then prints up to the first 100 characters
// of every chunk.
func main() {
	chunkSize, chunkOverlap := uint(500), uint(50)
	markdownChunker := kreuzberg.ChunkerTypeMarkdown

	chunking := &kreuzberg.ChunkingConfig{
		MaxCharacters:         &chunkSize,
		Overlap:               &chunkOverlap,
		ChunkerType:           &markdownChunker,
		PrependHeadingContext: true,
	}
	config := kreuzberg.ExtractionConfig{Chunking: chunking}

	result, err := kreuzberg.ExtractFile("document.md", nil, config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	// Each chunk's content is prefixed with its heading breadcrumb
	for idx := range result.Chunks {
		fmt.Printf("Content: %.100s...\n", result.Chunks[idx].Content)
	}
}
```

View File

@@ -0,0 +1,24 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with caching and quality processing enabled and
// logs the length of the extracted content.
func main() {
	cacheOn, qualityOn := true, true

	config := &kreuzberg.ExtractionConfig{
		UseCache:                &cacheOn,
		EnableQualityProcessing: &qualityOn,
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	log.Println("content length:", len(result.Content))
}
```

View File

@@ -0,0 +1,23 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main loads an extraction config by passing an empty path to
// LoadExtractionConfigFromFile, extracts document.pdf with it and logs the
// content length.
func main() {
	config, loadErr := kreuzberg.LoadExtractionConfigFromFile("")
	if loadErr != nil {
		log.Fatalf("discover config failed: %v", loadErr)
	}

	result, extractErr := kreuzberg.ExtractFileSync("document.pdf", config)
	if extractErr != nil {
		log.Fatalf("extract failed: %v", extractErr)
	}

	log.Printf("Content length: %d", len(result.Content))
}
```

View File

@@ -0,0 +1,23 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main loads the extraction config from kreuzberg.toml, extracts
// document.pdf with it and logs the MIME type of the result.
func main() {
	config, loadErr := kreuzberg.LoadExtractionConfigFromFile("kreuzberg.toml")
	if loadErr != nil {
		log.Fatalf("load config failed: %v", loadErr)
	}

	result, extractErr := kreuzberg.ExtractFileSync("document.pdf", config)
	if extractErr != nil {
		log.Fatalf("extract failed: %v", extractErr)
	}

	log.Printf("Detected MIME: %s", result.MimeType)
}
```

View File

@@ -0,0 +1,19 @@
```go title="Go"
package main
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
// main builds an extraction config for Tesseract OCR with languages
// "eng+fra" and page segmentation mode 3. The config is only constructed,
// not used.
func main() {
	pageSegMode := int32(3)

	tesseract := &kreuzberg.TesseractConfig{Psm: &pageSegMode}
	ocr := &kreuzberg.OcrConfig{
		Backend:         "tesseract",
		Language:        "eng+fra",
		TesseractConfig: tesseract,
	}

	// Discard the value so the example compiles without using the config.
	_ = kreuzberg.ExtractionConfig{Ocr: ocr}
}
```

View File

@@ -0,0 +1,37 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with caching, Tesseract OCR (page segmentation
// mode 6) and chunking configured, then logs the content length.
func main() {
	var (
		pageSegMode  = int32(6)
		chunkSize    = uint(1000)
		chunkOverlap = uint(200)
		cacheOn      = true
	)

	ocr := &kreuzberg.OcrConfig{
		Backend:         "tesseract",
		TesseractConfig: &kreuzberg.TesseractConfig{Psm: &pageSegMode},
	}
	chunking := &kreuzberg.ChunkingConfig{
		MaxCharacters: &chunkSize,
		Overlap:       &chunkOverlap,
	}
	config := kreuzberg.ExtractionConfig{
		UseCache: &cacheOn,
		Ocr:      ocr,
		Chunking: chunking,
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	log.Printf("Content length: %d", len(result.Content))
}
```

View File

@@ -0,0 +1,25 @@
```go title="Document Structure Config (Go)"
package main
import (
"fmt"
kreuzberg "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with document structure included and prints the
// node type of every node in the resulting document, if one is present.
// An extraction error causes a panic.
func main() {
	withStructure := kreuzberg.WithIncludeDocumentStructure(true)
	config := kreuzberg.NewExtractionConfig(withStructure)

	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		panic(err)
	}

	doc := result.Document
	if doc == nil {
		// No structured document was returned; nothing to print.
		return
	}
	for _, n := range doc.Nodes {
		fmt.Printf("[%s]\n", n.Content.NodeType)
	}
}
```

View File

@@ -0,0 +1,60 @@
```go title="Element-Based Output (Go)"
package main
import (
"fmt"
"kreuzberg"
)
// main extracts document.pdf in element-based output format, prints every
// element (type, text preview, page and coordinates when available) and then
// lists all elements of type "title" with their "level" metadata.
// An extraction error causes a panic.
func main() {
	// Configure element-based output
	config := &kreuzberg.ExtractionConfig{
		OutputFormat: "element_based",
	}

	// Extract document
	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		panic(err)
	}

	// Access elements
	for _, element := range result.Elements {
		fmt.Printf("Type: %s\n", element.ElementType)

		// Truncate on rune boundaries; byte slicing could split a multi-byte
		// UTF-8 character and print invalid text.
		text := element.Text
		if runes := []rune(text); len(runes) > 100 {
			text = string(runes[:100])
		}
		fmt.Printf("Text: %s\n", text)

		if element.Metadata.PageNumber != nil {
			fmt.Printf("Page: %d\n", *element.Metadata.PageNumber)
		}
		if element.Metadata.Coordinates != nil {
			coords := element.Metadata.Coordinates
			fmt.Printf("Coords: (%f, %f) - (%f, %f)\n",
				coords.Left, coords.Top, coords.Right, coords.Bottom)
		}
		fmt.Println("---")
	}

	// Filter by element type
	var titles []kreuzberg.Element
	for _, element := range result.Elements {
		if element.ElementType == "title" {
			titles = append(titles, element)
		}
	}

	for _, title := range titles {
		// A missing or non-string level is reported as "unknown".
		level, ok := title.Metadata.Additional["level"].(string)
		if !ok {
			level = "unknown"
		}
		fmt.Printf("[%s] %s\n", level, title.Text)
	}
}
```

View File

@@ -0,0 +1,100 @@
package main
import (
"kreuzberg"
)
// main shows three ways to describe an embedding model (a preset and two
// custom ONNX models) and how an embedding config plugs into a chunking
// config. All values are only constructed; nothing is extracted.
func main() {
	// Example 1: Preset model (recommended)
	// Presets are ready-made configurations for common use cases.
	//
	// Available presets:
	// - "fast" (384 dims): Quick prototyping, development, resource-constrained
	// - "balanced" (768 dims): Production, general-purpose RAG, English documents
	// - "quality" (1024 dims): Complex documents, maximum accuracy
	// - "multilingual" (768 dims): International documents, 100+ languages
	presetEmbedding := kreuzberg.EmbeddingConfig{
		Model: kreuzberg.EmbeddingModelType{
			Type: "preset",
			Name: "balanced",
		},
		BatchSize:            32,
		Normalize:            true,
		ShowDownloadProgress: true,
		CacheDir:             "~/.cache/kreuzberg/embeddings",
	}

	// Example 2: Custom ONNX model (requires embeddings feature)
	// Selects a specific ONNX embedding model from HuggingFace by ID and
	// states its dimensions explicitly.
	//
	// Popular ONNX-compatible models:
	// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
	// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
	// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
	// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
	customEmbedding := kreuzberg.EmbeddingConfig{
		Model: kreuzberg.EmbeddingModelType{
			Type:       "custom",
			ModelID:    "BAAI/bge-small-en-v1.5",
			Dimensions: 384,
		},
		BatchSize:            32,
		Normalize:            true,
		ShowDownloadProgress: true,
		CacheDir:             "", // Uses default: .kreuzberg/embeddings/
	}

	// Example 3: Alternative Custom ONNX Model
	// For advanced users wanting different ONNX embedding models.
	alternativeEmbedding := kreuzberg.EmbeddingConfig{
		Model: kreuzberg.EmbeddingModelType{
			Type:       "custom",
			ModelID:    "sentence-transformers/all-mpnet-base-v2",
			Dimensions: 768,
		},
		BatchSize:            16, // Larger model requires smaller batch size
		Normalize:            true,
		ShowDownloadProgress: true,
		CacheDir:             "/var/cache/embeddings",
	}

	// Integration with ChunkingConfig
	// The embedding config is attached to the chunking configuration, which
	// in turn is attached to the extraction configuration.
	chunkEmbedding := &kreuzberg.EmbeddingConfig{
		Model: kreuzberg.EmbeddingModelType{
			Type: "preset",
			Name: "balanced",
		},
		BatchSize: 32,
		Normalize: true,
	}
	chunkingConfig := kreuzberg.ChunkingConfig{
		MaxChars:   1024,
		MaxOverlap: 100,
		Preset:     "balanced",
		Embedding:  chunkEmbedding,
	}
	extractionConfig := kreuzberg.ExtractionConfig{
		Chunking: &chunkingConfig,
	}

	// Discard the example values so the file compiles without using them.
	_ = presetEmbedding
	_ = customEmbedding
	_ = alternativeEmbedding
	_ = extractionConfig
}
// Key parameter explanations:
//
// BatchSize: Number of texts to embed at once (32-128 typical)
// - Larger batches are faster but use more memory
// - Smaller batches for resource-constrained environments
//
// Normalize: Whether to normalize vectors (L2 norm)
// - true (recommended): Enables cosine similarity in vector DBs
// - false: Raw embedding values
//
// CacheDir: Where to store downloaded models
// - Empty string: Uses .kreuzberg/embeddings/ in current directory
// - Non-empty: Custom directory for model storage
//
// ShowDownloadProgress: Display download progress bar
// - Useful for monitoring large model downloads

View File

@@ -0,0 +1,37 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with chunking and embeddings from the
// "all-mpnet-base-v2" preset configured, then logs the content length.
func main() {
	var (
		chunkSize        = uint(1000)
		embedBatch       = uint(16)
		normalizeVectors = true
		presetName       = "all-mpnet-base-v2"
	)

	embedding := &kreuzberg.EmbeddingConfig{
		Model: kreuzberg.EmbeddingModelType{
			Type: "preset",
			Name: &presetName,
		},
		BatchSize: &embedBatch,
		Normalize: &normalizeVectors,
		// NOTE(review): the sibling options are passed as pointers while this
		// one is a plain bool — confirm the field type in the installed
		// package version.
		ShowDownloadProgress: true,
	}
	cfg := kreuzberg.ExtractionConfig{
		Chunking: &kreuzberg.ChunkingConfig{
			MaxCharacters: &chunkSize,
			Embedding:     embedding,
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	log.Println("content length:", len(result.Content))
}
```

View File

@@ -0,0 +1,94 @@
package main
import (
"kreuzberg"
)
// main builds three PDF hierarchy-extraction configurations: a basic one
// with 6 clusters, a minimal one with 3 clusters, and one with an OCR
// coverage threshold. All values are only constructed; nothing is extracted.
func main() {
	// Small helpers that return a pointer to a literal value, replacing the
	// immediately-invoked closures that were repeated for every field.
	boolPtr := func(b bool) *bool { return &b }
	intPtr := func(i int) *int { return &i }

	// Example 1: Basic hierarchy extraction
	// Enabled with KClusters=6 for standard H1-H6 heading hierarchy.
	// Extract bounding box information for spatial layout awareness.
	hierarchyConfigBasic := kreuzberg.HierarchyConfig{
		Enabled:              boolPtr(true),
		KClusters:            intPtr(6),     // Default: creates 6 font size clusters (H1-H6 structure)
		IncludeBbox:          boolPtr(true), // Include bounding box coordinates
		OcrCoverageThreshold: nil,           // No OCR coverage threshold
	}
	pdfConfigBasic := kreuzberg.PdfConfig{
		Hierarchy: &hierarchyConfigBasic,
	}
	extractionConfigBasic := kreuzberg.ExtractionConfig{
		PdfOptions: &pdfConfigBasic,
	}
	// Use with ExtractFileSync or ExtractBytesSync
	// result, err := kreuzberg.ExtractFileSync("document.pdf", extractionConfigBasic)
	//
	// The call above is only a comment, so the variable must be discarded
	// explicitly: Go rejects a local that is declared and never used.
	_ = extractionConfigBasic

	// Example 2: Custom KClusters for minimal structure
	// Use 3 clusters for simpler hierarchy with minimal structure.
	// Useful when you only need major section divisions (Main, Subsection, Detail).
	hierarchyConfigMinimal := kreuzberg.HierarchyConfig{
		Enabled:              boolPtr(true),
		KClusters:            intPtr(3), // Minimal clustering: just 3 levels
		IncludeBbox:          boolPtr(true),
		OcrCoverageThreshold: nil,
	}
	pdfConfigMinimal := kreuzberg.PdfConfig{
		Hierarchy: &hierarchyConfigMinimal,
	}
	extractionConfigMinimal := kreuzberg.ExtractionConfig{
		PdfOptions: &pdfConfigMinimal,
	}
	_ = extractionConfigMinimal

	// Example 3: With OCR coverage threshold
	// Trigger OCR if less than 50% of text has font data.
	// Useful for documents with mixed digital and scanned content.
	ocrThreshold := 0.5
	hierarchyConfigOcr := kreuzberg.HierarchyConfig{
		Enabled:              boolPtr(true),
		KClusters:            intPtr(6),
		IncludeBbox:          boolPtr(true),
		OcrCoverageThreshold: &ocrThreshold, // Trigger OCR if text coverage < 50%
	}
	pdfConfigOcr := kreuzberg.PdfConfig{
		Hierarchy: &hierarchyConfigOcr,
	}
	extractionConfigOcr := kreuzberg.ExtractionConfig{
		PdfOptions: &pdfConfigOcr,
	}
	_ = extractionConfigOcr
}
// Field descriptions:
//
// Enabled: *bool (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// KClusters: *int (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// IncludeBbox: *bool (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// OcrCoverageThreshold: *float64 (default: nil)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
// - nil means no OCR coverage-based triggering

View File

@@ -0,0 +1,31 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with the output format set to HTML, the GitHub
// theme constant selected and EmbedCSS set to true, then prints the content.
func main() {
	format := kreuzberg.OutputFormatHTML
	githubTheme := kreuzberg.HTMLThemeGitHub
	inlineCSS := true

	htmlOptions := &kreuzberg.HTMLOutputConfig{
		Theme:    &githubTheme,
		EmbedCSS: &inlineCSS,
	}
	config := &kreuzberg.ExtractionConfig{
		OutputFormat: &format,
		HTMLOutput:   htmlOptions,
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	fmt.Println(result.Content) // HTML with kb-* classes
}
```

View File

@@ -0,0 +1,89 @@
package main
import (
"fmt"
"kreuzberg"
)
// basicYake is example 1: a basic YAKE configuration.
//
// It extracts document.pdf with the "yake" algorithm, language "en" and no
// algorithm-specific parameter overrides, prints the keywords found, and
// returns any extraction error to the caller.
func basicYake() error {
	keywords := &kreuzberg.KeywordConfig{
		Algorithm:   "yake",
		MaxKeywords: 10,
		MinScore:    0.0,
		NgramRange:  [2]int{1, 3},
		Language:    "en",
		// YakeParams and RakeParams stay at their nil zero values.
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{Keywords: keywords})
	if err != nil {
		return err
	}
	fmt.Printf("Keywords: %v\n", result.Keywords)
	return nil
}
// advancedYake is example 2: YAKE with custom parameters.
//
// It extracts document.pdf with the "yake" algorithm and an explicit
// YakeParams value (WindowSize 1), prints the keywords found, and returns any
// extraction error to the caller.
func advancedYake() error {
	keywords := &kreuzberg.KeywordConfig{
		Algorithm:   "yake",
		MaxKeywords: 15,
		MinScore:    0.1,
		NgramRange:  [2]int{1, 2},
		Language:    "en",
		YakeParams:  &kreuzberg.YakeParams{WindowSize: 1},
		// RakeParams stays at its nil zero value.
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{Keywords: keywords})
	if err != nil {
		return err
	}
	fmt.Printf("Keywords: %v\n", result.Keywords)
	return nil
}
// rakeConfig is example 3: a RAKE configuration.
//
// It extracts document.pdf with the "rake" algorithm and an explicit
// RakeParams value (MinWordLength 1, MaxWordsPerPhrase 3), prints the keywords
// found, and returns any extraction error to the caller.
func rakeConfig() error {
	rake := &kreuzberg.RakeParams{
		MinWordLength:     1,
		MaxWordsPerPhrase: 3,
	}
	keywords := &kreuzberg.KeywordConfig{
		Algorithm:   "rake",
		MaxKeywords: 10,
		MinScore:    5.0,
		NgramRange:  [2]int{1, 3},
		Language:    "en",
		RakeParams:  rake,
		// YakeParams stays at its nil zero value.
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{Keywords: keywords})
	if err != nil {
		return err
	}
	fmt.Printf("Keywords: %v\n", result.Keywords)
	return nil
}
// main runs only the basic YAKE example and reports a failure on stdout.
func main() {
	err := basicYake()
	if err == nil {
		return
	}
	fmt.Println("Error:", err)
}

View File

@@ -0,0 +1,26 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds a keyword-extraction config and prints three of its fields; no
// document is extracted.
func main() {
	config := &kreuzberg.ExtractionConfig{
		Keywords: &kreuzberg.KeywordConfig{
			Algorithm:   "YAKE",
			MaxKeywords: 10,
			MinScore:    0.3,
			NgramRange:  "1,3",
			Language:    "en",
		},
	}

	kw := config.Keywords
	fmt.Printf("Keywords config: Algorithm=%s, MaxKeywords=%d, MinScore=%f\n",
		kw.Algorithm, kw.MaxKeywords, kw.MinScore)
}
```

View File

@@ -0,0 +1,23 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds a language-detection config and prints its Enabled flag and
// minimum confidence; no document is extracted.
func main() {
	threshold := 0.8
	detection := &kreuzberg.LanguageDetectionConfig{
		Enabled:        true,
		MinConfidence:  &threshold,
		DetectMultiple: false,
	}
	config := &kreuzberg.ExtractionConfig{LanguageDetection: detection}

	fmt.Printf("Language detection enabled: %v\n", config.LanguageDetection.Enabled)
	fmt.Printf("Min confidence: %f\n", *config.LanguageDetection.MinConfidence)
}
```

View File

@@ -0,0 +1,28 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts scanned.pdf through the "tesseract" backend with a 300 DPI
// preprocessing target and logs the length of the extracted content.
func main() {
	dpi := int32(300)
	preprocessing := &kreuzberg.ImagePreprocessingConfig{TargetDpi: &dpi}
	cfg := kreuzberg.ExtractionConfig{
		Ocr: &kreuzberg.OcrConfig{
			Backend: "tesseract",
			TesseractConfig: &kreuzberg.TesseractConfig{
				Preprocessing: preprocessing,
			},
		},
	}

	result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println("content length:", len(result.Content))
}
```

View File

@@ -0,0 +1,26 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with PDF-specific options (image extraction,
// metadata extraction, two candidate passwords and an empty hierarchy config)
// and logs the length of the extracted content.
func main() {
	withMetadata := true
	pdfOptions := &kreuzberg.PdfConfig{
		ExtractImages:   true,
		ExtractMetadata: &withMetadata,
		Passwords:       []string{"password1", "password2"},
		Hierarchy:       &kreuzberg.HierarchyConfig{},
	}
	cfg := kreuzberg.ExtractionConfig{PdfOptions: pdfOptions}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println("content length:", len(result.Content))
}
```

View File

@@ -0,0 +1,42 @@
```go title="Go"
package main
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
// main builds two hierarchy-extraction configs that differ only in their
// cluster count (6 and 12) and discards both; no document is extracted.
//
// NOTE(review): another snippet in this documentation set describes KClusters
// as valid in the range 1-7 - confirm that 12 is accepted.
func main() {
	enabled := true
	includeBbox := true
	threshold := float32(0.8)
	basicClusters := uint(6)
	advancedClusters := uint(12)

	// hierarchyConfig returns the shared configuration with the given cluster count.
	hierarchyConfig := func(kClusters *uint) kreuzberg.ExtractionConfig {
		return kreuzberg.ExtractionConfig{
			PdfOptions: &kreuzberg.PdfConfig{
				ExtractImages: true,
				Hierarchy: &kreuzberg.HierarchyConfig{
					Enabled:              &enabled,
					KClusters:            kClusters,
					IncludeBbox:          &includeBbox,
					OcrCoverageThreshold: &threshold,
				},
			},
		}
	}

	// Basic hierarchy configuration
	config := hierarchyConfig(&basicClusters)
	// Advanced hierarchy configuration with more clusters
	advancedConfig := hierarchyConfig(&advancedClusters)

	_ = config
	_ = advancedConfig
}
```

View File

@@ -0,0 +1,18 @@
```go title="Go"
package main
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
// main builds a post-processor config that names two processors to enable and
// one to disable, then discards it; no document is extracted.
func main() {
	on := true
	postprocessing := &kreuzberg.PostProcessorConfig{
		Enabled:            &on,
		EnabledProcessors:  []string{"deduplication", "whitespace_normalization"},
		DisabledProcessors: []string{"mojibake_fix"},
	}
	cfg := &kreuzberg.ExtractionConfig{Postprocessor: postprocessing}
	_ = cfg
}
```

View File

@@ -0,0 +1,17 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds a config with quality processing switched on and prints the
// flag; no document is extracted.
func main() {
	config := new(kreuzberg.ExtractionConfig)
	config.EnableQualityProcessing = true // Default
	fmt.Printf("Quality processing enabled: %v\n", config.EnableQualityProcessing)
}
```

View File

@@ -0,0 +1,37 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf through the "tesseract" backend with explicit
// Tesseract settings (PSM 6, OEM 1, a character whitelist, table detection,
// three languages) and logs the length of the extracted content.
func main() {
	pageSegMode := int32(6)
	engineMode := int32(1)
	detectTables := true
	allowedChars := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?"

	tesseract := &kreuzberg.TesseractConfig{
		Psm:                   &pageSegMode,
		Oem:                   &engineMode,
		MinConfidence:         0.8,
		EnableTableDetection:  &detectTables,
		TesseditCharWhitelist: allowedChars,
	}
	config := kreuzberg.ExtractionConfig{
		Ocr: &kreuzberg.OcrConfig{
			Backend:         "tesseract",
			Language:        "eng+fra+deu",
			TesseractConfig: tesseract,
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println("content length:", len(result.Content))
}
```

View File

@@ -0,0 +1,23 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds a token-reduction config in "moderate" mode and prints its two
// fields; no document is extracted.
func main() {
	keepImportant := true
	reduction := &kreuzberg.TokenReductionOptions{
		Mode:                   "moderate",
		PreserveImportantWords: &keepImportant,
	}
	config := kreuzberg.ExtractionConfig{TokenReduction: reduction}

	fmt.Printf("Mode: %s, Preserve Important Words: %v\n",
		config.TokenReduction.Mode,
		*config.TokenReduction.PreserveImportantWords)
}
```

View File

@@ -0,0 +1,118 @@
```go title="usage.go"
package main
import (
"bytes"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"os"
"os/exec"
"path/filepath"
"time"
)
// extractTimeout bounds one upload-and-extract round trip so that a hung
// container cannot block the caller forever.
const extractTimeout = 5 * time.Minute

// DockerKreuzbergClient drives a Kreuzberg API server running in a local
// Docker container: it starts the container, uploads files to its HTTP
// extraction endpoint, and stops and removes the container again.
type DockerKreuzbergClient struct {
	containerName  string // passed to `docker run --name`; also used to stop and remove
	containerImage string // image reference handed to `docker run`
	apiPort        int    // host port published to the container's port 8000
}

// NewDockerKreuzbergClient returns a client for the given container name,
// image and host port. It does not start anything.
func NewDockerKreuzbergClient(containerName, image string, port int) *DockerKreuzbergClient {
	return &DockerKreuzbergClient{
		containerName:  containerName,
		containerImage: image,
		apiPort:        port,
	}
}

// StartContainer runs the image detached, publishing apiPort to the
// container's port 8000. On failure the returned error wraps the exec error
// and includes whatever docker printed, so the cause is visible to the caller.
func (c *DockerKreuzbergClient) StartContainer() error {
	fmt.Println("Starting Kreuzberg Docker container...")
	cmd := exec.Command("docker", "run", "-d",
		"--name", c.containerName,
		"-p", fmt.Sprintf("%d:8000", c.apiPort),
		c.containerImage)
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("failed to start container: %w: %s", err, bytes.TrimSpace(out))
	}
	fmt.Printf("Container started on http://localhost:%d\n", c.apiPort)
	return nil
}

// ExtractFile uploads filePath as the multipart form field "file" to
// /api/extract on localhost:apiPort and returns the "content" member of the
// JSON response.
//
// It returns an error if the file cannot be read, the request fails or times
// out, the server answers with a non-2xx status, or the body is not valid
// JSON. Response members other than "content" are ignored, whatever their type.
func (c *DockerKreuzbergClient) ExtractFile(filePath string) (string, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return "", err
	}
	defer file.Close()

	var buf bytes.Buffer
	writer := multipart.NewWriter(&buf)
	part, err := writer.CreateFormFile("file", filepath.Base(filePath))
	if err != nil {
		return "", err
	}
	if _, err := io.Copy(part, file); err != nil {
		return "", err
	}
	// Close writes the trailing multipart boundary; it must happen before sending.
	if err := writer.Close(); err != nil {
		return "", err
	}

	endpoint := fmt.Sprintf("http://localhost:%d/api/extract", c.apiPort)
	httpClient := &http.Client{Timeout: extractTimeout}
	resp, err := httpClient.Post(endpoint, writer.FormDataContentType(), &buf)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		// Best effort: the body is only used to enrich the error message, so a
		// failure to read it is deliberately ignored.
		detail, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return "", fmt.Errorf("extract %s: unexpected status %s: %s",
			filepath.Base(filePath), resp.Status, bytes.TrimSpace(detail))
	}

	// Decode only the member we need; a map[string]string target would reject
	// any response that also carries objects, arrays or numbers.
	var result struct {
		Content string `json:"content"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return "", err
	}
	return result.Content, nil
}

// StopContainer stops the container and then removes it. If stopping fails the
// container is not removed and the stop error is returned.
func (c *DockerKreuzbergClient) StopContainer() error {
	fmt.Println("Stopping Kreuzberg Docker container...")
	if err := exec.Command("docker", "stop", c.containerName).Run(); err != nil {
		return fmt.Errorf("stop container %q: %w", c.containerName, err)
	}
	if err := exec.Command("docker", "rm", c.containerName).Run(); err != nil {
		return fmt.Errorf("remove container %q: %w", c.containerName, err)
	}
	fmt.Println("Container stopped and removed")
	return nil
}
// main runs the Docker round trip and exits with status 1 on any failure.
func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, "error:", err)
		os.Exit(1)
	}
}

// run starts the Kreuzberg container, extracts document.pdf through it and
// prints the content. Once the container has started it is always stopped and
// removed on the way out, including when extraction fails, so a failed run
// does not leave a container behind that still holds the name.
func run() (err error) {
	client := NewDockerKreuzbergClient("kreuzberg-api", "kreuzberg:latest", 8000)
	if err := client.StartContainer(); err != nil {
		return err
	}
	defer func() {
		// Report a teardown failure only when nothing else went wrong first.
		if stopErr := client.StopContainer(); stopErr != nil && err == nil {
			err = stopErr
		}
	}()

	// Crude readiness wait: give the server inside the container time to start listening.
	time.Sleep(2 * time.Second)

	content, err := client.ExtractFile("document.pdf")
	if err != nil {
		return err
	}
	fmt.Printf("Extracted content:\n%s\n", content)
	return nil
}
```

View File

@@ -0,0 +1,29 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with a nil config and prints the content, the
// "title" and "author" metadata entries, and the table and image counts.
func main() {
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	fmt.Println("Content:")
	fmt.Println(result.Content)

	fmt.Println("\nMetadata:")
	if meta := result.Metadata; meta != nil {
		fmt.Printf("Title: %v\n", meta["title"])
		fmt.Printf("Author: %v\n", meta["author"])
	}

	tableCount, imageCount := len(result.Tables), len(result.Images)
	fmt.Printf("\nTables found: %d\n", tableCount)
	fmt.Printf("Images found: %d\n", imageCount)
}
```

View File

@@ -0,0 +1,31 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts contract.pdf with caching and quality processing enabled and
// prints the content length, the quality score when present, and the
// processing time.
func main() {
	cacheOn, qualityOn := true, true
	config := &kreuzberg.ExtractionConfig{
		UseCache:                &cacheOn,
		EnableQualityProcessing: &qualityOn,
	}

	result, err := kreuzberg.ExtractFileSync("contract.pdf", config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	fmt.Printf("Extracted %d characters\n", len(result.Content))
	if score := result.QualityScore; score != nil {
		fmt.Printf("Quality score: %.2f\n", *score)
	}
	fmt.Printf("Processing time: %v\n", result.ProcessingTime)
}
```

View File

@@ -0,0 +1,30 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts scanned.pdf through the "tesseract" backend with language
// "eng" and prints the extracted text.
func main() {
	config := kreuzberg.ExtractionConfig{
		Ocr: &kreuzberg.OcrConfig{
			Backend:  "tesseract",
			Language: "eng",
		},
	}

	result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	fmt.Println("Extracted text from scanned document:")
	fmt.Println(result.Content)
	fmt.Println("Used OCR backend: tesseract")
}
```

View File

@@ -0,0 +1,24 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with a nil config and prints at most the first
// 200 characters of the content.
func main() {
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	fmt.Println("Extracted content:")

	// Truncate on a character boundary. Ranging over a string yields the byte
	// offset at which each character starts, so cutting at the offset of the
	// 201st character never splits a multi-byte UTF-8 sequence the way a plain
	// byte slice [:200] can.
	preview := result.Content
	seen := 0
	for offset := range preview {
		if seen == 200 {
			preview = preview[:offset]
			break
		}
		seen++
	}
	fmt.Println(preview)
}
```

View File

@@ -0,0 +1,22 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main is an installation smoke test: it extracts sample.pdf with a nil
// config and reports the number of characters extracted.
func main() {
	fmt.Println("Kreuzberg CGO bindings loaded successfully")

	result, err := kreuzberg.ExtractFileSync("sample.pdf", nil)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	charCount := len(result.Content)
	fmt.Println("Installation verified!")
	fmt.Printf("Extracted %d characters\n", charCount)
}
```

View File

@@ -0,0 +1,34 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with a nil config and prints the sizes of the
// content, tables and images, followed by the metadata keys when metadata is
// present.
func main() {
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	fmt.Printf("Content: %d characters\n", len(result.Content))
	fmt.Printf("Tables: %d\n", len(result.Tables))
	fmt.Printf("Images: %d\n", len(result.Images))

	if result.Metadata == nil {
		return
	}
	fmt.Print("Metadata keys: ")
	for key := range result.Metadata {
		fmt.Print(key + " ")
	}
	fmt.Println()
}
```

View File

@@ -0,0 +1,47 @@
```go title="Go"
package main
import (
"encoding/json"
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main runs structured extraction on paper.pdf: it marshals a JSON schema
// requiring title, authors and date, passes it with an LLM model name in the
// config, and prints the structured output when one is returned.
func main() {
	stringType := map[string]string{"type": "string"}
	paperSchema := map[string]any{
		"type": "object",
		"properties": map[string]any{
			"title":   stringType,
			"authors": map[string]any{"type": "array", "items": stringType},
			"date":    stringType,
		},
		"required":             []string{"title", "authors", "date"},
		"additionalProperties": false,
	}
	schema, err := json.Marshal(paperSchema)
	if err != nil {
		log.Fatalf("marshal schema: %v", err)
	}

	structured := &kreuzberg.StructuredExtractionConfig{
		Schema:     schema,
		SchemaName: "PaperMetadata",
		Strict:     true,
		Llm: kreuzberg.LlmConfig{
			Model: "openai/gpt-4o-mini",
		},
	}
	config := kreuzberg.ExtractionConfig{StructuredExtraction: structured}

	result, err := kreuzberg.ExtractFile("paper.pdf", nil, config)
	if err != nil {
		log.Fatalf("extract: %v", err)
	}
	if output := result.StructuredOutput; output != nil {
		fmt.Println(string(*output))
	}
}
```

View File

@@ -0,0 +1,63 @@
```go title="Go"
package main
import (
"bufio"
"encoding/json"
"fmt"
"log"
"os/exec"
)
// MCPParams is the "params" member of an MCPRequest: the tool to invoke and
// the arguments to pass to it.
type MCPParams struct {
	Name      string         `json:"name"`
	Arguments map[string]any `json:"arguments"`
}

// MCPRequest is the JSON message this example writes to the server's stdin.
// It serialises as {"method": ..., "params": {...}}.
type MCPRequest struct {
	Method string    `json:"method"`
	Params MCPParams `json:"params"`
}
// main starts `kreuzberg mcp` as a child process, writes one tools/call
// request for extract_file to its stdin, prints the first line it answers on
// stdout, and then shuts the child down.
//
// NOTE(review): MCP messages are JSON-RPC 2.0, which normally carries
// "jsonrpc" and "id" members and an initialize handshake - confirm the server
// accepts this reduced envelope.
func main() {
	cmd := exec.Command("kreuzberg", "mcp")
	stdin, err := cmd.StdinPipe()
	if err != nil {
		log.Fatalf("create stdin pipe: %v", err)
	}
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		log.Fatalf("create stdout pipe: %v", err)
	}
	if err := cmd.Start(); err != nil {
		log.Fatalf("start command: %v", err)
	}

	request := MCPRequest{
		Method: "tools/call",
		Params: MCPParams{
			Name: "extract_file",
			Arguments: map[string]interface{}{
				"path":  "document.pdf",
				"async": true,
			},
		},
	}
	data, err := json.Marshal(request)
	if err != nil {
		log.Fatalf("marshal request: %v", err)
	}
	if _, err := fmt.Fprintf(stdin, "%s\n", data); err != nil {
		log.Fatalf("write request: %v", err)
	}

	scanner := bufio.NewScanner(stdout)
	// An extraction result can be far longer than the scanner's default
	// 64 KiB line limit, so allow lines of up to 16 MiB.
	scanner.Buffer(make([]byte, 0, 64*1024), 16*1024*1024)
	if scanner.Scan() {
		fmt.Println(scanner.Text())
	} else if err := scanner.Err(); err != nil {
		log.Fatalf("read response: %v", err)
	}

	// The server keeps reading until stdin reaches EOF. Wait only closes the
	// pipe after the child has exited, so it must be closed here or Wait
	// would block forever.
	if err := stdin.Close(); err != nil {
		log.Fatalf("close stdin: %v", err)
	}
	if err := cmd.Wait(); err != nil {
		log.Fatalf("wait for command: %v", err)
	}
}
```

View File

@@ -0,0 +1,19 @@
```go title="Go"
package main
import (
"fmt"
"os"
"os/exec"
)
// main runs `kreuzberg mcp` in the foreground with this process's standard
// streams attached, and exits with status 1 if the server cannot be started
// or exits with an error.
func main() {
	cmd := exec.Command("kreuzberg", "mcp")
	// Without an explicit Stdin the child reads from the null device and sees
	// EOF immediately, so no client could ever talk to it over stdio.
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		fmt.Fprintf(os.Stderr, "Failed to start MCP server: %v\n", err)
		os.Exit(1)
	}
}
```

View File

@@ -0,0 +1,26 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with language detection enabled (minimum
// confidence 0.9, multiple languages) and logs the content length.
func main() {
	detectionOn := true
	threshold := 0.9
	cfg := kreuzberg.ExtractionConfig{
		LanguageDetection: &kreuzberg.LanguageDetectionConfig{
			Enabled:        &detectionOn,
			MinConfidence:  &threshold,
			DetectMultiple: true,
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println("content length:", len(result.Content))
}
```

View File

@@ -0,0 +1,29 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts multilingual_document.pdf with multi-language detection
// enabled (minimum confidence 0.8) and prints the detected languages.
func main() {
	threshold := 0.8
	detection := &kreuzberg.LanguageDetectionConfig{
		Enabled:        true,
		MinConfidence:  &threshold,
		DetectMultiple: true,
	}
	config := &kreuzberg.ExtractionConfig{LanguageDetection: detection}

	result, err := kreuzberg.ExtractFileSync("multilingual_document.pdf", config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	fmt.Printf("Detected languages: %v\n", result.DetectedLanguages)
	// Output: [eng fra deu]
}
```

View File

@@ -0,0 +1,115 @@
```go title="Go"
package main
import (
"fmt"
"log"
"strings"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf and page.html with nil configs and prints
// whichever PDF and HTML metadata fields are populated.
func main() {
	pdfResult, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatalf("extract pdf: %v", err)
	}

	// Access PDF metadata
	if pdf, ok := pdfResult.Metadata.PdfMetadata(); ok {
		if pages := pdf.PageCount; pages != nil {
			fmt.Printf("Pages: %d\n", *pages)
		}
		if author := pdf.Author; author != nil {
			fmt.Printf("Author: %s\n", *author)
		}
		if title := pdf.Title; title != nil {
			fmt.Printf("Title: %s\n", *title)
		}
	}

	// Access HTML metadata
	htmlResult, err := kreuzberg.ExtractFileSync("page.html", nil)
	if err != nil {
		log.Fatalf("extract html: %v", err)
	}
	html, ok := htmlResult.Metadata.HTMLMetadata()
	if !ok {
		return
	}

	if html.Title != nil {
		fmt.Printf("Title: %s\n", *html.Title)
	}
	if html.Description != nil {
		fmt.Printf("Description: %s\n", *html.Description)
	}

	// Access keywords as array
	if len(html.Keywords) > 0 {
		fmt.Printf("Keywords: %s\n", strings.Join(html.Keywords, ", "))
	}

	// Access canonical URL (renamed from canonical)
	if html.CanonicalURL != nil {
		fmt.Printf("Canonical URL: %s\n", *html.CanonicalURL)
	}

	// Access Open Graph fields from map (a lookup on an empty map simply misses)
	if image, ok := html.OpenGraph["image"]; ok {
		fmt.Printf("Open Graph Image: %s\n", image)
	}
	if ogTitle, ok := html.OpenGraph["title"]; ok {
		fmt.Printf("Open Graph Title: %s\n", ogTitle)
	}
	if ogType, ok := html.OpenGraph["type"]; ok {
		fmt.Printf("Open Graph Type: %s\n", ogType)
	}

	// Access Twitter Card fields from map
	if card, ok := html.TwitterCard["card"]; ok {
		fmt.Printf("Twitter Card Type: %s\n", card)
	}
	if creator, ok := html.TwitterCard["creator"]; ok {
		fmt.Printf("Twitter Creator: %s\n", creator)
	}

	// Access new fields
	if html.Language != nil {
		fmt.Printf("Language: %s\n", *html.Language)
	}
	if html.TextDirection != nil {
		fmt.Printf("Text Direction: %s\n", *html.TextDirection)
	}

	// Access headers
	if n := len(html.Headers); n > 0 {
		texts := make([]string, 0, n)
		for _, h := range html.Headers {
			texts = append(texts, h.Text)
		}
		fmt.Printf("Headers: %s\n", strings.Join(texts, ", "))
	}

	// Access links
	for _, link := range html.Links {
		fmt.Printf("Link: %s (%s)\n", link.Href, link.Text)
	}

	// Access images
	for _, image := range html.Images {
		fmt.Printf("Image: %s\n", image.Src)
	}

	// Access structured data
	if count := len(html.StructuredData); count > 0 {
		fmt.Printf("Structured data items: %d\n", count)
	}
}
```

View File

@@ -0,0 +1,37 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main extracts document.pdf with a nil config and, for at most the first
// three page boundaries, prints the page number, its byte range in the
// content, and a preview of at most 100 characters of the page text.
func main() {
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatal(err)
	}
	if result.Metadata.Pages == nil || result.Metadata.Pages.Boundaries == nil {
		return
	}

	// Boundaries are used as byte offsets, so index the content as bytes.
	contentBytes := []byte(result.Content)
	for i, boundary := range result.Metadata.Pages.Boundaries {
		if i >= 3 {
			break
		}
		pageText := string(contentBytes[boundary.ByteStart:boundary.ByteEnd])

		// Truncate on a character boundary. Ranging over a string yields the
		// byte offset at which each character starts, so cutting at the offset
		// of the 101st character never splits a multi-byte UTF-8 sequence the
		// way a plain byte slice [:100] can.
		preview := pageText
		seen := 0
		for offset := range pageText {
			if seen == 100 {
				preview = pageText[:offset]
				break
			}
			seen++
		}

		fmt.Printf("Page %d:\n", boundary.PageNumber)
		fmt.Printf("  Byte range: %d-%d\n", boundary.ByteStart, boundary.ByteEnd)
		fmt.Printf("  Preview: %s...\n", preview)
	}
}
```

View File

@@ -0,0 +1,29 @@
Package main
Import (
"fmt"
"Kreuzberg"
)
Func main() {
config := &kreuzberg.ExtractionConfig{
Pages: &kreuzberg.PageConfig{
ExtractPages: true,
},
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
panic(err)
}
if result.Pages != nil {
for _, page := range result.Pages {
fmt.Printf("Page %d:\n", page.PageNumber)
fmt.Printf(" Content: %d chars\n", len(page.Content))
fmt.Printf(" Tables: %d\n", len(page.Tables))
fmt.Printf(" Images: %d\n", len(page.Images))
}
}
}

View File

@@ -0,0 +1,28 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with a nil config and prints, for every table
// found, its row count, its Markdown field and each row of cells.
func main() {
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	// Iterate over tables
	for _, table := range result.Tables {
		rows := table.Cells
		fmt.Printf("Table with %d rows\n", len(rows))
		fmt.Println(table.Markdown) // Markdown representation

		// Access cells
		for i := range rows {
			fmt.Println(rows[i])
		}
	}
}
```

View File

@@ -0,0 +1,39 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with chunking (512 characters, 50 overlap) and
// the "balanced" embedding model, then prints the vector length of every
// chunk that carries an embedding.
func main() {
	chunkSize, chunkOverlap := 512, 50
	embedding := &kreuzberg.EmbeddingConfig{
		Model:     "balanced",
		Normalize: true,
	}
	config := &kreuzberg.ExtractionConfig{
		Chunking: &kreuzberg.ChunkingConfig{
			MaxChars:   &chunkSize,
			MaxOverlap: &chunkOverlap,
			Embedding:  embedding,
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	// Ranging over nil chunks performs no iterations.
	for i, chunk := range result.Chunks {
		if chunk.Embedding == nil {
			continue
		}
		fmt.Printf("Chunk %d: %d dimensions\n", i, len(chunk.Embedding))
		// Store in vector database
	}
}
```

View File

@@ -0,0 +1,25 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// The Go binding does not currently expose plugin OCR backend registration.
// Use one of the built-in backends ("tesseract", "paddle-ocr", or VLM via "vlm").
//
// main extracts scanned.pdf with the built-in "tesseract" backend and logs
// the length of the extracted content.
func main() {
	ocr := &kreuzberg.OcrConfig{
		Backend:  "tesseract",
		Language: "eng",
	}
	cfg := kreuzberg.ExtractionConfig{Ocr: ocr}

	result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println("content length:", len(result.Content))
}
```

View File

@@ -0,0 +1,32 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with image extraction configured (200 DPI
// target, 2048 maximum dimension, placeholders injected, automatic DPI
// adjustment) and logs the length of the extracted content.
func main() {
	withImages := true
	withPlaceholders := true
	adjustDpi := true
	dpi := int32(200)
	maxSide := int32(2048)

	images := &kreuzberg.ImageExtractionConfig{
		ExtractImages:      &withImages,
		TargetDpi:          &dpi,
		MaxImageDimension:  &maxSide,
		InjectPlaceholders: &withPlaceholders, // set to false to extract images without markdown references
		AutoAdjustDpi:      &adjustDpi,
	}
	cfg := kreuzberg.ExtractionConfig{Images: images}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println("content length:", len(result.Content))
}
```

View File

@@ -0,0 +1,36 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with Tesseract image preprocessing configured
// (300 DPI target, denoise, deskew, contrast enhancement, "otsu"
// binarization) and logs the length of the extracted content.
func main() {
	dpi := int32(300)
	straighten := true
	method := "otsu"

	preprocessing := &kreuzberg.ImagePreprocessingConfig{
		TargetDpi:          &dpi,
		Denoise:            true,
		Deskew:             &straighten,
		ContrastEnhance:    true,
		BinarizationMethod: &method,
	}
	config := kreuzberg.ExtractionConfig{
		Ocr: &kreuzberg.OcrConfig{
			TesseractConfig: &kreuzberg.TesseractConfig{
				Preprocessing: preprocessing,
			},
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println("content length:", len(result.Content))
}
```

View File

@@ -0,0 +1 @@
EasyOCR is only available in Python.

View File

@@ -0,0 +1,34 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts scanned.pdf through the "paddle-ocr" backend and prints the
// text, recognition confidence, geometry and (when present) rotation of every
// OCR element returned.
func main() {
	ocr := &kreuzberg.OcrConfig{
		Backend:  "paddle-ocr",
		Language: "en",
	}
	cfg := kreuzberg.ExtractionConfig{Ocr: ocr}

	result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	for _, element := range result.OcrElements {
		fmt.Printf("Text: %s\n", element.Text)
		fmt.Printf("Confidence: %.2f\n", element.Confidence.Recognition)
		fmt.Printf("Geometry: %+v\n", element.Geometry)
		if rotation := element.Rotation; rotation != nil {
			fmt.Printf("Rotation: %.1f°\n", rotation.AngleDegrees)
		}
		fmt.Println()
	}
}
```

View File

@@ -0,0 +1,24 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts scanned.pdf through the "tesseract" backend with language
// "eng" and logs the length of the extracted content.
func main() {
	ocr := &kreuzberg.OcrConfig{
		Backend:  "tesseract",
		Language: "eng",
	}

	result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, kreuzberg.ExtractionConfig{Ocr: ocr})
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println(len(result.Content))
}
```

View File

@@ -0,0 +1,24 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with ForceOcr set and the "tesseract" backend
// selected, then prints the extracted content.
func main() {
	cfg := kreuzberg.ExtractionConfig{
		Ocr:      &kreuzberg.OcrConfig{Backend: "tesseract"},
		ForceOcr: true,
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	fmt.Println(result.Content)
}
```

View File

@@ -0,0 +1,23 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts multilingual.pdf through the "tesseract" backend with the
// language string "eng+deu+fra" and logs the extracted content.
func main() {
	ocr := &kreuzberg.OcrConfig{
		Backend:  "tesseract",
		Language: "eng+deu+fra",
	}
	cfg := kreuzberg.ExtractionConfig{Ocr: ocr}

	result, err := kreuzberg.ExtractFileSync("multilingual.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println(result.Content)
}
```

View File

@@ -0,0 +1,24 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts scanned.pdf through the "paddle-ocr" backend with language
// "en" and logs the length of the extracted content.
func main() {
	ocr := &kreuzberg.OcrConfig{
		Backend:  "paddle-ocr",
		Language: "en",
	}

	result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, kreuzberg.ExtractionConfig{Ocr: ocr})
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println(len(result.Content))
}
```

View File

@@ -0,0 +1,32 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main clears every plugin registry in turn (post processors, validators, OCR
// backends, document extractors), logging each success and aborting on the
// first failure.
func main() {
	// Clear all plugins of a specific type
	steps := []struct {
		action string       // prefix of the fatal message on failure
		done   string       // message logged on success
		clear  func() error // registry-clearing call
	}{
		{"clear post processors", "Post processors cleared", kreuzberg.ClearPostProcessors},
		{"clear validators", "Validators cleared", kreuzberg.ClearValidators},
		{"clear OCR backends", "OCR backends cleared", kreuzberg.ClearOCRBackends},
		{"clear document extractors", "Document extractors cleared", kreuzberg.ClearDocumentExtractors},
	}

	for _, step := range steps {
		if err := step.clear(); err != nil {
			log.Fatalf("%s: %v", step.action, err)
		}
		log.Println(step.done)
	}
}
```

View File

@@ -0,0 +1,64 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// myEmbedderDimensions is the length of every vector MyEmbedder produces. It
// is shared by Dimensions and Embed so the advertised size and the actual
// size cannot drift apart.
const myEmbedderDimensions = 768

// MyEmbedder wraps an already-loaded embedder so kreuzberg can call back into
// it during chunking and standalone embed requests. Implement the
// kreuzberg.EmbeddingBackend interface.
type MyEmbedder struct{}

// Name returns the identifier this backend is registered under.
func (e *MyEmbedder) Name() string { return "my-embedder" }

// Version returns the backend's version string.
func (e *MyEmbedder) Version() string { return "1.0.0" }

// Initialize is a no-op in this example.
func (e *MyEmbedder) Initialize() error {
	// Optional warm-up; runs once at registration before Dimensions() is cached.
	return nil
}

// Shutdown is a no-op in this example.
func (e *MyEmbedder) Shutdown() error { return nil }

// Dimensions returns the length of each vector produced by Embed.
// Captured once at registration; the dispatcher uses this for shape validation.
func (e *MyEmbedder) Dimensions() uint { return myEmbedderDimensions }

// Embed returns one vector per input text, in input order. This placeholder
// returns zero-filled vectors of length Dimensions(); a real implementation
// would call the host model here. It never returns an error.
func (e *MyEmbedder) Embed(texts []string) ([][]float32, error) {
	// Delegate to the already-loaded host model.
	out := make([][]float32, len(texts))
	for i := range out {
		out[i] = make([]float32, myEmbedderDimensions)
	}
	return out, nil
}
// main registers MyEmbedder as an embedding backend, embeds two texts through
// it via a "plugin" model selection, logs how many vectors came back, and
// unregisters the backend on the way out.
func main() {
	// Register once at startup.
	if err := kreuzberg.RegisterEmbeddingBackend(&MyEmbedder{}); err != nil {
		log.Fatalf("failed to register embedding backend: %v", err)
	}
	unregister := func() {
		if err := kreuzberg.UnregisterEmbeddingBackend("my-embedder"); err != nil {
			log.Printf("warning: failed to unregister embedding backend: %v", err)
		}
	}
	defer unregister()

	embedderName := "my-embedder"
	maxDuration := uint64(30)
	model := kreuzberg.EmbeddingModelType{
		Variant: "plugin",
		Type:    "plugin",
		Name:    &embedderName,
	}
	config := kreuzberg.EmbeddingConfig{
		Model: model,
		// Optional: bound the wait on a hung backend (default 60s; nil disables).
		MaxEmbedDurationSecs: &maxDuration,
	}

	texts := []string{"Hello, world!", "Second text"}
	vectors, err := kreuzberg.EmbedTexts(texts, config)
	if err != nil {
		log.Fatalf("embed failed: %v", err)
	}
	log.Printf("Generated %d vectors", len(vectors))
}
```

View File

@@ -0,0 +1,22 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main registers a document extractor named "custom-json-extractor" with
// priority 50, extracts document.json with a nil config, and logs the length
// of the extracted content.
func main() {
	// Register custom extractor with priority 50
	err := kreuzberg.RegisterDocumentExtractor("custom-json-extractor", 50)
	if err != nil {
		log.Fatalf("register extractor failed: %v", err)
	}

	result, err := kreuzberg.ExtractFileSync("document.json", nil)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	contentLength := len(result.Content)
	log.Printf("Extracted content length: %d", contentLength)
}
```

View File

@@ -0,0 +1,52 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main lists the four plugin registries in order (document extractors, post
// processors, OCR backends, validators) and prints each entry as a bullet,
// aborting on the first listing error.
func main() {
	// List all registered document extractors
	extractorNames, err := kreuzberg.ListDocumentExtractors()
	if err != nil {
		log.Fatalf("list document extractors: %v", err)
	}
	fmt.Print("Document Extractors:\n")
	for _, name := range extractorNames {
		fmt.Printf("  - %s\n", name)
	}

	// List all registered post-processors
	processorNames, err := kreuzberg.ListPostProcessors()
	if err != nil {
		log.Fatalf("list post processors: %v", err)
	}
	fmt.Print("\nPost-Processors:\n")
	for _, name := range processorNames {
		fmt.Printf("  - %s\n", name)
	}

	// List all registered OCR backends
	backendNames, err := kreuzberg.ListOCRBackends()
	if err != nil {
		log.Fatalf("list OCR backends: %v", err)
	}
	fmt.Print("\nOCR Backends:\n")
	for _, name := range backendNames {
		fmt.Printf("  - %s\n", name)
	}

	// List all registered validators
	validatorNames, err := kreuzberg.ListValidators()
	if err != nil {
		log.Fatalf("list validators: %v", err)
	}
	fmt.Print("\nValidators:\n")
	for _, name := range validatorNames {
		fmt.Printf("  - %s\n", name)
	}
}
```

View File

@@ -0,0 +1,72 @@
```go title="Go"
package main
import (
"encoding/json"
"fmt"
"log"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// minLengthConfig stores the settings read by the minimum length validator:
// minLength is the smallest accepted content length.
var minLengthConfig = struct{ minLength int }{minLength: 100}
// minLengthValidator rejects extraction results whose "content" field is
// shorter than minLengthConfig.minLength bytes.
//
// It returns nil (NULL) when validation passes, otherwise a C string holding
// the error message.
//
//export minLengthValidator
func minLengthValidator(resultJSON *C.char) *C.char {
	var parsed map[string]interface{}
	raw := []byte(C.GoString(resultJSON))
	if json.Unmarshal(raw, &parsed) != nil {
		return C.CString("Failed to parse result JSON")
	}

	text, isString := parsed["content"].(string)
	switch {
	case !isString:
		return C.CString("Missing content field in result")
	case len(text) >= minLengthConfig.minLength:
		// Validation passed
		return nil
	}

	return C.CString(fmt.Sprintf("Content too short: %d < %d", len(text), minLengthConfig.minLength))
}
func main() {
// Register the validator with priority 100 (runs early)
if err := kreuzberg.RegisterValidator("min_length_validator", 100,
(C.ValidatorCallback)(C.minLengthValidator)); err != nil {
log.Fatalf("failed to register validator: %v", err)
}
defer func() {
if err := kreuzberg.UnregisterValidator("min_length_validator"); err != nil {
log.Printf("warning: failed to unregister validator: %v", err)
}
}()
// Extract and validate
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extraction failed: %v", err)
}
log.Printf("Validation passed. Content length: %d", len(result.Content))
}
```

View File

@@ -0,0 +1,114 @@
```go title="Go"
package main
import (
"encoding/json"
"log"
"sync/atomic"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// pdfMetadataState holds counters for PDF processing. processedCount starts
// at its zero value and is updated with sync/atomic operations.
var pdfMetadataState struct {
	processedCount int64
}
// pdfMetadataExtractor adds bookkeeping metadata to PDF extraction results.
//
// The incoming JSON is decoded, enriched when its "mime_type" is
// "application/pdf", and re-encoded. Non-PDF results are re-encoded without
// changes. On a decode or encode failure a small JSON error object is
// returned instead.
//
//export pdfMetadataExtractor
func pdfMetadataExtractor(resultJSON *C.char) *C.char {
	var doc map[string]interface{}
	if err := json.Unmarshal([]byte(C.GoString(resultJSON)), &doc); err != nil {
		return C.CString("{\"error\":\"Failed to parse result JSON\"}")
	}

	// Only PDFs are enriched; everything else falls through to serialization.
	if mime, isString := doc["mime_type"].(string); isString && mime == "application/pdf" {
		meta, hasMeta := doc["metadata"].(map[string]interface{})
		if !hasMeta {
			meta = make(map[string]interface{})
		}

		// Mark as processed and record content statistics.
		meta["pdf_processed"] = true
		if text, hasText := doc["content"].(string); hasText {
			meta["content_length"] = len(text)
		}

		// The counter is shared across calls, so it is bumped atomically.
		atomic.AddInt64(&pdfMetadataState.processedCount, 1)
		meta["pdf_processor_version"] = "1.0.0"
		doc["metadata"] = meta
	}

	encoded, err := json.Marshal(doc)
	if err != nil {
		return C.CString("{\"error\":\"Failed to serialize result\"}")
	}
	return C.CString(string(encoded))
}
func main() {
// Register the post-processor with priority 80, early stage
if err := kreuzberg.RegisterPostProcessor("pdf_metadata_extractor", 80,
(C.PostProcessorCallback)(C.pdfMetadataExtractor)); err != nil {
log.Fatalf("failed to register post-processor: %v", err)
}
defer func() {
if err := kreuzberg.UnregisterPostProcessor("pdf_metadata_extractor"); err != nil {
log.Printf("warning: failed to unregister post-processor: %v", err)
}
log.Printf("Total PDFs processed: %d", atomic.LoadInt64(&pdfMetadataState.processedCount))
}()
// Extract PDF document
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extraction failed: %v", err)
}
log.Printf("PDF MIME type: %s", result.MimeType)
// Parse and display metadata
var metadata map[string]interface{}
if metaJSON, ok := result.MetadataJSON.(string); ok {
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
if pdfProcessed, ok := metadata["pdf_processed"].(bool); ok && pdfProcessed {
log.Printf("PDF metadata extracted successfully")
if contentLen, ok := metadata["content_length"].(float64); ok {
log.Printf("Content length: %.0f bytes", contentLen)
}
}
}
}
}
```

View File

@@ -0,0 +1,116 @@
```go title="Go"
package main
import (
"encoding/json"
"log"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// pdfOnlyProcessor tags PDF extraction results with PDF-specific metadata and
// leaves every other MIME type untouched.
//
// The result JSON is decoded, optionally enriched, and re-encoded; a small
// JSON error object is returned when decoding or encoding fails.
//
//export pdfOnlyProcessor
func pdfOnlyProcessor(resultJSON *C.char) *C.char {
	var doc map[string]interface{}
	if err := json.Unmarshal([]byte(C.GoString(resultJSON)), &doc); err != nil {
		return C.CString("{\"error\":\"Failed to parse result JSON\"}")
	}

	// Check MIME type - only PDFs are enriched; others are re-serialized as-is.
	if mime, isString := doc["mime_type"].(string); isString && mime == "application/pdf" {
		meta, hasMeta := doc["metadata"].(map[string]interface{})
		if !hasMeta {
			meta = make(map[string]interface{})
		}

		// Example PDF-specific processing:
		// - Extract tables as structured data
		// - Handle PDF-specific formatting
		// - Preserve document hierarchy
		meta["pdf_specific_processing"] = true
		meta["processor_type"] = "pdf_only"

		// Record how many tables the PDF contains when that data is present
		// and decodes as a JSON array.
		if rawTables, hasTables := doc["tables_json"].(string); hasTables && rawTables != "" {
			var tables []interface{}
			if json.Unmarshal([]byte(rawTables), &tables) == nil {
				meta["table_count"] = len(tables)
			}
		}

		doc["metadata"] = meta
	}

	encoded, err := json.Marshal(doc)
	if err != nil {
		return C.CString("{\"error\":\"Failed to serialize result\"}")
	}
	return C.CString(string(encoded))
}
func main() {
// Register the post-processor with priority 70
if err := kreuzberg.RegisterPostProcessor("pdf_only_processor", 70,
(C.PostProcessorCallback)(C.pdfOnlyProcessor)); err != nil {
log.Fatalf("failed to register post-processor: %v", err)
}
defer func() {
if err := kreuzberg.UnregisterPostProcessor("pdf_only_processor"); err != nil {
log.Printf("warning: failed to unregister post-processor: %v", err)
}
}()
// Process multiple documents - processor will only affect PDFs
files := []string{
"document.pdf",
"image.jpg",
"spreadsheet.xlsx",
}
for _, file := range files {
result, err := kreuzberg.ExtractFileSync(file, nil)
if err != nil {
log.Printf("Warning: extraction failed for %s: %v", file, err)
continue
}
// Parse metadata to check if PDF processing occurred
var metadata map[string]interface{}
if metaJSON, ok := result.MetadataJSON.(string); ok {
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
if pdfProcessing, ok := metadata["pdf_specific_processing"].(bool); ok && pdfProcessing {
log.Printf("PDF-specific processing applied to: %s", file)
if tableCount, ok := metadata["table_count"].(float64); ok {
log.Printf(" Tables found: %.0f", tableCount)
}
} else {
log.Printf("Skipped PDF processor for: %s (MIME: %s)", file, result.MimeType)
}
}
}
}
}
```

View File

@@ -0,0 +1,13 @@
<!-- snippet:skip reason="Go bindings do not support custom document extractor plugins" -->
```markdown title="Markdown"
!!! note "Not Supported"
The Go binding is a thin CGO wrapper and does not currently support
custom document extractors. Custom plugins must be implemented in Rust.
See the [Rust plugin documentation](../../rust/plugins/plugin_extractor.md) for details on creating custom document extractors.
Go currently supports:
- **PostProcessor** - Transform extraction results
- **Validator** - Validate extraction results
- **OcrBackend** - Custom OCR implementations
```

View File

@@ -0,0 +1,92 @@
```go title="Go"
package main
import (
"C"
"encoding/json"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// loggingPostProcessor logs details of each extraction result without
// modifying it. It always returns nil (NULL), including when the JSON cannot
// be parsed.
//
//export loggingPostProcessor
func loggingPostProcessor(resultJSON *C.char) *C.char {
	log.Println("[PostProcessor] Processing extraction result")

	var payload map[string]interface{}
	err := json.Unmarshal([]byte(C.GoString(resultJSON)), &payload)
	if err != nil {
		log.Printf("[PostProcessor] Error parsing result: %v", err)
		return nil
	}

	text, hasText := payload["content"].(string)
	if hasText {
		log.Printf("[PostProcessor] Content length: %d bytes", len(text))
	}
	if hasText && text == "" {
		log.Println("[PostProcessor] Warning: Empty content extracted")
	}

	if mime, hasMime := payload["mime_type"].(string); hasMime {
		log.Printf("[PostProcessor] Processing %s", mime)
	}

	// NULL signals success with no modification.
	return nil
}
// loggingValidator logs each validation step and rejects results whose
// content is shorter than 50 bytes. It returns nil (NULL) on success and a C
// string with the error message on failure.
//
//export loggingValidator
func loggingValidator(resultJSON *C.char) *C.char {
	log.Println("[Validator] Validating extraction result")

	var payload map[string]interface{}
	if err := json.Unmarshal([]byte(C.GoString(resultJSON)), &payload); err != nil {
		log.Printf("[Validator] Error parsing result: %v", err)
		return C.CString("Failed to parse validation input")
	}

	// A missing or non-string content field is not treated as a failure.
	text, hasText := payload["content"].(string)
	if !hasText {
		return nil
	}

	log.Printf("[Validator] Content length: %d bytes", len(text))
	if len(text) >= 50 {
		// NULL signals that validation passed.
		return nil
	}

	log.Println("[Validator] Error: Content below minimum threshold")
	return C.CString("Content too short (minimum 50 characters)")
}
func main() {
// Register post processor with logging
if err := kreuzberg.RegisterPostProcessor(
"logging-processor",
100, // priority
(C.PostProcessorCallback)(C.loggingPostProcessor),
); err != nil {
log.Fatalf("register post processor failed: %v", err)
}
log.Println("[Main] PostProcessor registered with logging enabled")
// Register validator with logging
if err := kreuzberg.RegisterValidator(
"logging-validator",
50, // priority
(C.ValidatorCallback)(C.loggingValidator),
); err != nil {
log.Fatalf("register validator failed: %v", err)
}
log.Println("[Main] Validator registered with logging enabled")
// Extract with logging
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Printf("[Main] Extraction complete: %d bytes content", len(result.Content))
}
```

View File

@@ -0,0 +1,213 @@
```go title="Go"
package main
import (
"C"
"encoding/json"
"fmt"
"testing"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// TestPostProcessor tests custom post processor behavior.
//
// The processor is registered with the library and then invoked directly with
// a mock result so the metrics it records can be asserted.
func TestPostProcessor(t *testing.T) {
	// The expected length is derived from the fixture so the two cannot drift
	// apart ("Test extraction content" is 23 bytes).
	const mockContent = "Test extraction content"

	// Create a post processor that records metrics about the content it sees.
	metricsMap := make(map[string]int64)

	// NOTE(review): //export is documented for package-level functions; confirm
	// that C.testPostProcessor resolves for this closure.
	//export testPostProcessor
	testPostProcessor := func(resultJSON *C.char) *C.char {
		jsonStr := C.GoString(resultJSON)
		var result map[string]interface{}
		if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
			return nil
		}
		if content, ok := result["content"].(string); ok {
			metricsMap["content_length"] = int64(len(content))
			metricsMap["processed"] = 1
		}
		return nil
	}

	// Register the processor
	err := kreuzberg.RegisterPostProcessor(
		"test-processor",
		10,
		(C.PostProcessorCallback)(C.testPostProcessor),
	)
	if err != nil {
		t.Fatalf("Failed to register post processor: %v", err)
	}
	// Cleanup is deferred so it also runs when a later t.Fatalf aborts the
	// test. The unregister error is deliberately ignored: it is best-effort.
	defer func() {
		_ = kreuzberg.UnregisterPostProcessor("test-processor")
	}()

	// Simulate a mock result
	mockResult := map[string]interface{}{
		"content":            mockContent,
		"mime_type":          "text/plain",
		"metadata":           map[string]interface{}{},
		"tables":             []interface{}{},
		"detected_languages": []interface{}{},
	}
	resultJSON, err := json.Marshal(mockResult)
	if err != nil {
		t.Fatalf("Failed to marshal mock result: %v", err)
	}
	cResultJSON := C.CString(string(resultJSON))
	defer C.free(unsafe.Pointer(cResultJSON))

	// Call the processor
	testPostProcessor(cResultJSON)

	// Verify metrics were recorded
	wantLength := int64(len(mockContent))
	if metricsMap["content_length"] != wantLength {
		t.Errorf("Expected content_length %d, got %d", wantLength, metricsMap["content_length"])
	}
	if metricsMap["processed"] != 1 {
		t.Errorf("Expected processed flag to be 1")
	}
}
// TestValidator tests custom validator behavior.
//
// The validator is registered with the library and then invoked directly with
// one passing and one failing payload.
func TestValidator(t *testing.T) {
	validatorCalled := false

	// NOTE(review): //export is documented for package-level functions; confirm
	// that C.testValidator resolves for this closure.
	//export testValidator
	testValidator := func(resultJSON *C.char) *C.char {
		validatorCalled = true
		jsonStr := C.GoString(resultJSON)
		var result map[string]interface{}
		if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
			return C.CString("Failed to parse validation input")
		}
		if content, ok := result["content"].(string); ok {
			if len(content) < 10 {
				return C.CString("Content too short")
			}
		}
		return nil // Success
	}

	// Register the validator
	err := kreuzberg.RegisterValidator(
		"test-validator",
		50,
		(C.ValidatorCallback)(C.testValidator),
	)
	if err != nil {
		t.Fatalf("Failed to register validator: %v", err)
	}
	// Cleanup is deferred so it also runs when a later t.Fatalf aborts the
	// test. The unregister error is deliberately ignored: it is best-effort.
	defer func() {
		_ = kreuzberg.UnregisterValidator("test-validator")
	}()

	// Test 1: Valid content
	validContent := map[string]interface{}{
		"content":            "This is valid content",
		"mime_type":          "text/plain",
		"metadata":           map[string]interface{}{},
		"tables":             []interface{}{},
		"detected_languages": []interface{}{},
	}
	validJSON, err := json.Marshal(validContent)
	if err != nil {
		t.Fatalf("Failed to marshal valid content: %v", err)
	}
	cValidJSON := C.CString(string(validJSON))
	defer C.free(unsafe.Pointer(cValidJSON))

	result := testValidator(cValidJSON)
	if result != nil {
		t.Errorf("Expected nil (success), got error: %s", C.GoString(result))
		// Error strings come from C.CString and must be released by the caller.
		C.free(unsafe.Pointer(result))
	}
	if !validatorCalled {
		t.Errorf("Validator was not called")
	}

	// Test 2: Invalid content (too short)
	invalidContent := map[string]interface{}{
		"content":            "Short",
		"mime_type":          "text/plain",
		"metadata":           map[string]interface{}{},
		"tables":             []interface{}{},
		"detected_languages": []interface{}{},
	}
	invalidJSON, err := json.Marshal(invalidContent)
	if err != nil {
		t.Fatalf("Failed to marshal invalid content: %v", err)
	}
	cInvalidJSON := C.CString(string(invalidJSON))
	defer C.free(unsafe.Pointer(cInvalidJSON))

	result = testValidator(cInvalidJSON)
	if result == nil {
		t.Errorf("Expected error for short content, got nil")
	} else {
		// Copy the message into Go memory before releasing the C allocation.
		errorMsg := C.GoString(result)
		C.free(unsafe.Pointer(result))
		if errorMsg != "Content too short" {
			t.Errorf("Expected 'Content too short', got: %s", errorMsg)
		}
	}
}
// TestValidatorIntegration tests that a validator can be registered and then
// shows up in the list of registered validators.
func TestValidatorIntegration(t *testing.T) {
	// NOTE(review): //export is documented for package-level functions; confirm
	// that C.integrationValidator resolves for this closure.
	//export integrationValidator
	integrationValidator := func(resultJSON *C.char) *C.char {
		jsonStr := C.GoString(resultJSON)
		var result map[string]interface{}
		if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
			return C.CString(fmt.Sprintf("Parse error: %v", err))
		}
		// Validate that mime_type is set
		if _, ok := result["mime_type"]; !ok {
			return C.CString("Missing mime_type in result")
		}
		return nil
	}

	// Register validator
	err := kreuzberg.RegisterValidator(
		"integration-validator",
		100,
		(C.ValidatorCallback)(C.integrationValidator),
	)
	if err != nil {
		t.Fatalf("Failed to register validator: %v", err)
	}
	// Cleanup is deferred so it also runs when a later t.Fatalf aborts the
	// test. The unregister error is deliberately ignored: it is best-effort.
	defer func() {
		_ = kreuzberg.UnregisterValidator("integration-validator")
	}()

	// Exercise the closure directly. Without a Go-side use the local variable
	// is "declared and not used" and the file does not compile.
	sample := C.CString(`{"mime_type":"text/plain"}`)
	defer C.free(unsafe.Pointer(sample))
	if msg := integrationValidator(sample); msg != nil {
		t.Errorf("Expected nil for result with mime_type, got: %s", C.GoString(msg))
		C.free(unsafe.Pointer(msg))
	}

	// The validator will be called automatically during extraction.
	// This part verifies the registration was successful.
	validators, err := kreuzberg.ListValidators()
	if err != nil {
		t.Fatalf("Failed to list validators: %v", err)
	}
	found := false
	for _, v := range validators {
		if v == "integration-validator" {
			found = true
			break
		}
	}
	if !found {
		t.Errorf("Validator not found in registered validators list")
	}
}
```

View File

@@ -0,0 +1,35 @@
```go title="Go"
package main
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
import (
"log"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// customValidator is the validator callback exported to C. This stub accepts
// every result: it ignores resultJSON and always returns nil (NULL).
//
//export customValidator
func customValidator(resultJSON *C.char) *C.char {
// Inspect resultJSON, return error message or NULL
return nil
}
func main() {
if err := kreuzberg.RegisterValidator("go-validator", 50, (C.ValidatorCallback)(C.customValidator)); err != nil {
log.Fatalf("register validator failed: %v", err)
}
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Printf("Content length: %d", len(result.Content))
}
```

View File

@@ -0,0 +1,77 @@
```go title="Go"
package main
import (
"encoding/json"
"fmt"
"log"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// qualityThreshold is the minimum acceptable quality score.
// qualityScoreValidator rejects results whose score is below this value and
// reports both numbers as percentages.
const qualityThreshold = 0.5
// qualityScoreValidator validates that extraction quality meets the minimum
// threshold (qualityThreshold).
//
// It returns nil (NULL) when validation passes or when the result carries no
// metadata object, and a C string with the error message otherwise. A missing
// or non-numeric "quality_score" is treated as 0.
//
//export qualityScoreValidator
func qualityScoreValidator(resultJSON *C.char) *C.char {
	jsonStr := C.GoString(resultJSON)
	var result map[string]interface{}
	if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
		return C.CString("Failed to parse result JSON")
	}

	// No metadata is not an error, just skip the quality check. Only the
	// presence of the object matters, so the value is discarded; binding it
	// to an unused variable would not compile.
	if _, ok := result["metadata"].(map[string]interface{}); !ok {
		return nil
	}

	// Get quality score from the top level of the result.
	qualityScore := 0.0
	if score, ok := result["quality_score"].(float64); ok {
		qualityScore = score
	}

	// Validate against threshold
	if qualityScore < qualityThreshold {
		errMsg := fmt.Sprintf("Quality score too low: %.0f%% < %.0f%%", qualityScore*100, qualityThreshold*100)
		return C.CString(errMsg)
	}

	// Validation passed
	return nil
}
func main() {
// Register the validator with priority 50
if err := kreuzberg.RegisterValidator("quality_score_validator", 50,
(C.ValidatorCallback)(C.qualityScoreValidator)); err != nil {
log.Fatalf("failed to register validator: %v", err)
}
defer func() {
if err := kreuzberg.UnregisterValidator("quality_score_validator"); err != nil {
log.Printf("warning: failed to unregister validator: %v", err)
}
}()
// Extract and validate
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extraction or validation failed: %v", err)
}
log.Printf("Quality validation passed for: %s", result.MimeType)
}
```

View File

@@ -0,0 +1,165 @@
```go title="Go"
package main
import (
"encoding/json"
"log"
"sync"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// PluginState is the mutex-guarded state shared by every call into the
// stateful plugin.
type PluginState struct {
	mu           sync.Mutex        // held while the fields below are read or written
	callCount    int               // number of plugin invocations
	cache        map[string]string // MIME types seen so far, mapped to a status string
	lastMimeType string            // MIME type of the most recent result
}

// globalState is the single PluginState instance that persists across plugin
// calls. Its cache map is allocated up front so it can be written to directly.
var globalState = &PluginState{cache: map[string]string{}}
// statefulPlugin is a post-processor that keeps state across calls: it counts
// invocations, remembers the MIME types it has seen, and writes those
// statistics into each result's metadata.
//
// All state access happens while globalState.mu is held. A small JSON error
// object is returned when the input cannot be decoded or the output cannot be
// encoded.
//
//export statefulPlugin
func statefulPlugin(resultJSON *C.char) *C.char {
	var doc map[string]interface{}
	if json.Unmarshal([]byte(C.GoString(resultJSON)), &doc) != nil {
		return C.CString("{\"error\":\"Failed to parse result JSON\"}")
	}

	// The lock is taken only after parsing and held until the function returns.
	state := globalState
	state.mu.Lock()
	defer state.mu.Unlock()

	state.callCount++

	// Remember the MIME type of this result.
	if mime, isString := doc["mime_type"].(string); isString {
		state.lastMimeType = mime
		state.cache[mime] = "processed"
	}

	// Reuse the result's metadata object when present, otherwise start fresh.
	meta, hasMeta := doc["metadata"].(map[string]interface{})
	if !hasMeta {
		meta = map[string]interface{}{}
	}
	meta["plugin_call_count"] = state.callCount
	meta["last_mime_type"] = state.lastMimeType
	meta["cached_types_count"] = len(state.cache)
	meta["plugin_version"] = "1.0.0"
	doc["metadata"] = meta

	encoded, err := json.Marshal(doc)
	if err != nil {
		return C.CString("{\"error\":\"Failed to serialize result\"}")
	}
	return C.CString(string(encoded))
}
// GetPluginStats returns a snapshot of the plugin state taken under the lock:
// the call count, the last MIME type seen, and the cached MIME types (in map
// iteration order, which is unspecified).
func GetPluginStats() (int, string, []string) {
	globalState.mu.Lock()
	defer globalState.mu.Unlock()

	seen := make([]string, 0, len(globalState.cache))
	for key := range globalState.cache {
		seen = append(seen, key)
	}
	return globalState.callCount, globalState.lastMimeType, seen
}
// ResetPluginState returns the plugin state to its initial values under the
// lock: zero calls, no last MIME type, and a freshly allocated empty cache.
// Useful for testing.
func ResetPluginState() {
	globalState.mu.Lock()
	defer globalState.mu.Unlock()

	globalState.cache = map[string]string{}
	globalState.lastMimeType = ""
	globalState.callCount = 0
}
func main() {
// Register the stateful post-processor with priority 60
if err := kreuzberg.RegisterPostProcessor("stateful_plugin", 60,
(C.PostProcessorCallback)(C.statefulPlugin)); err != nil {
log.Fatalf("failed to register post-processor: %v", err)
}
defer func() {
if err := kreuzberg.UnregisterPostProcessor("stateful_plugin"); err != nil {
log.Printf("warning: failed to unregister post-processor: %v", err)
}
// Print final statistics
callCount, lastMime, mimeTypes := GetPluginStats()
log.Printf("Plugin Statistics:")
log.Printf(" Total calls: %d", callCount)
log.Printf(" Last MIME type: %s", lastMime)
log.Printf(" Unique MIME types processed: %d", len(mimeTypes))
if len(mimeTypes) > 0 {
log.Printf(" Processed types: %v", mimeTypes)
}
}()
// Process multiple documents to demonstrate state accumulation
files := []string{
"document1.pdf",
"document2.pdf",
"image.png",
"document3.txt",
}
for _, file := range files {
log.Printf("Processing: %s", file)
result, err := kreuzberg.ExtractFileSync(file, nil)
if err != nil {
log.Printf(" Warning: extraction failed: %v", err)
continue
}
// Parse and display metadata
var metadata map[string]interface{}
if metaJSON, ok := result.MetadataJSON.(string); ok {
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
if callCount, ok := metadata["plugin_call_count"].(float64); ok {
log.Printf(" Plugin call count: %.0f", callCount)
}
if cachedCount, ok := metadata["cached_types_count"].(float64); ok {
log.Printf(" Cached MIME types: %.0f", cachedCount)
}
}
}
}
// Demonstrate thread-safe state access
callCount, lastMime, mimeTypes := GetPluginStats()
log.Printf("\nFinal Plugin State:")
log.Printf(" Total calls: %d", callCount)
log.Printf(" Last MIME type: %s", lastMime)
log.Printf(" Processed MIME types: %v", mimeTypes)
}
```

View File

@@ -0,0 +1,55 @@
```go title="Go"
package main
import (
"errors"
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main unregisters one plugin of each kind. A *kreuzberg.ValidationError is
// logged and tolerated; any other error aborts the program.
func main() {
	// check applies the shared error policy to the result of one unregister
	// call. action is the prefix used in the fatal log message.
	check := func(action string, err error) {
		if err == nil {
			return
		}
		var validErr *kreuzberg.ValidationError
		if errors.As(err, &validErr) {
			log.Printf("validation error: %v", err)
			return
		}
		log.Fatalf("%s: %v", action, err)
	}

	// Each call completes (and is checked) before the next one starts.
	check("unregister document extractor", kreuzberg.UnregisterDocumentExtractor("custom-json-extractor"))
	check("unregister post processor", kreuzberg.UnregisterPostProcessor("word_count"))
	check("unregister OCR backend", kreuzberg.UnregisterOCRBackend("cloud-ocr"))
	check("unregister validator", kreuzberg.UnregisterValidator("min_length_validator"))

	fmt.Println("Plugins unregistered successfully")
}
```

View File

@@ -0,0 +1,90 @@
```go title="Go"
package main
import (
"encoding/json"
"log"
"strings"
"unsafe"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"
// wordCountProcessor stores the number of whitespace-separated words of the
// result's content under metadata["word_count"].
//
// A small JSON error object is returned when the input cannot be decoded, the
// content field is missing or not a string, or the output cannot be encoded.
//
//export wordCountProcessor
func wordCountProcessor(resultJSON *C.char) *C.char {
	var doc map[string]interface{}
	if json.Unmarshal([]byte(C.GoString(resultJSON)), &doc) != nil {
		return C.CString("{\"error\":\"Failed to parse result JSON\"}")
	}

	text, isString := doc["content"].(string)
	if !isString {
		return C.CString("{\"error\":\"Missing content field\"}")
	}

	// Reuse the result's metadata object when present, otherwise start fresh.
	meta, hasMeta := doc["metadata"].(map[string]interface{})
	if !hasMeta {
		meta = map[string]interface{}{}
	}

	// strings.Fields splits on runs of whitespace, so its length is the word count.
	meta["word_count"] = len(strings.Fields(text))
	doc["metadata"] = meta

	encoded, err := json.Marshal(doc)
	if err != nil {
		return C.CString("{\"error\":\"Failed to serialize result\"}")
	}
	return C.CString(string(encoded))
}
func main() {
// Register the post-processor with priority 100, early stage
if err := kreuzberg.RegisterPostProcessor("word_count_processor", 100,
(C.PostProcessorCallback)(C.wordCountProcessor)); err != nil {
log.Fatalf("failed to register post-processor: %v", err)
}
defer func() {
if err := kreuzberg.UnregisterPostProcessor("word_count_processor"); err != nil {
log.Printf("warning: failed to unregister post-processor: %v", err)
}
}()
// Extract document
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extraction failed: %v", err)
}
// Access word count from metadata
var metadata map[string]interface{}
if metaJSON, ok := result.MetadataJSON.(string); ok {
if err := json.Unmarshal([]byte(metaJSON), &metadata); err == nil {
if wordCount, ok := metadata["word_count"].(float64); ok {
log.Printf("Word count: %.0f", wordCount)
}
}
}
}
```

View File

@@ -0,0 +1,38 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts a PDF with chunking enabled and prints, for every chunk, its
// position and a preview of at most 100 bytes of its content.
func main() {
	chunkSize := 1000
	chunkOverlap := 200
	chunking := &kreuzberg.ChunkingConfig{
		MaxChars:   &chunkSize,
		MaxOverlap: &chunkOverlap,
	}

	res, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{Chunking: chunking})
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	for idx, c := range res.Chunks {
		fmt.Printf("Chunk %d/%d (%d-%d)\n", idx+1, c.Metadata.TotalChunks, c.Metadata.CharStart, c.Metadata.CharEnd)

		// Cap the preview so short chunks are not sliced out of range.
		cut := min(len(c.Content), 100)
		fmt.Printf("%s...\n", c.Content[:cut])
	}
}
// min returns the smaller of a and b; when they are equal it returns that value.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
```

View File

@@ -0,0 +1,45 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts a PDF with chunking and embeddings enabled and prints, for
// every chunk, its position, a content preview of at most 100 bytes, and the
// embedding dimension when an embedding is present.
func main() {
	maxChars := 500
	maxOverlap := 50

	// The embedding model is given as an EmbeddingModelType value and
	// Normalize as a *bool, matching the other EmbeddingConfig examples in
	// these docs (a bare string / bool does not fit those field types).
	preset := "balanced"
	normalize := true

	config := &kreuzberg.ExtractionConfig{
		Chunking: &kreuzberg.ChunkingConfig{
			MaxChars:   &maxChars,
			MaxOverlap: &maxOverlap,
			Embedding: &kreuzberg.EmbeddingConfig{
				Model: kreuzberg.EmbeddingModelType{
					Type: "preset",
					Name: &preset,
				},
				Normalize: &normalize,
			},
		},
	}

	result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	for i, chunk := range result.Chunks {
		fmt.Printf("Chunk %d/%d (%d-%d)\n", i+1, chunk.Metadata.TotalChunks, chunk.Metadata.CharStart, chunk.Metadata.CharEnd)
		// Cap the preview so short chunks are not sliced out of range.
		fmt.Printf("Content: %s...\n", chunk.Content[:min(len(chunk.Content), 100)])
		if chunk.Embedding != nil {
			fmt.Printf("Embedding: %d dimensions\n", len(chunk.Embedding))
		}
	}
}
// min returns the smaller of its two arguments.
func min(a, b int) int {
	smaller := a
	if b < smaller {
		smaller = b
	}
	return smaller
}
```

View File

@@ -0,0 +1,32 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds an extraction config with chunking and embeddings enabled and
// prints the chunk limits and the embedding preset name. No extraction is run.
func main() {
	maxChars := 1024
	maxOverlap := 100
	batchSize := int32(32)

	// The embedding model is given as an EmbeddingModelType value and
	// Normalize as a *bool, matching the other EmbeddingConfig examples in
	// these docs (a bare string / bool does not fit those field types).
	preset := "balanced"
	normalize := true

	config := &kreuzberg.ExtractionConfig{
		Chunking: &kreuzberg.ChunkingConfig{
			MaxChars:   &maxChars,
			MaxOverlap: &maxOverlap,
			Embedding: &kreuzberg.EmbeddingConfig{
				Model: kreuzberg.EmbeddingModelType{
					Type: "preset",
					Name: &preset,
				},
				Normalize: &normalize,
				BatchSize: &batchSize,
				// NOTE(review): confirm the field type of ShowDownloadProgress;
				// if it is a pointer like Normalize, pass the address of a bool.
				ShowDownloadProgress: false,
			},
		},
	}

	// Model is a struct, so the preset name is printed through its Name field.
	fmt.Printf("Config: MaxChars=%d, MaxOverlap=%d, Model=%s\n",
		*config.Chunking.MaxChars,
		*config.Chunking.MaxOverlap,
		*config.Chunking.Embedding.Model.Name)
}
```

View File

@@ -0,0 +1,29 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts a PDF with keyword extraction enabled and prints the
// "keywords" entry of the result's additional metadata when it is present.
func main() {
	keywordCfg := &kreuzberg.KeywordConfig{
		Algorithm:   "YAKE",
		MaxKeywords: 10,
		MinScore:    0.3,
	}
	cfg := &kreuzberg.ExtractionConfig{Keywords: keywordCfg}

	res, err := kreuzberg.ExtractFileSync("research_paper.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	found, present := res.Metadata.Additional["keywords"]
	if !present {
		return
	}
	fmt.Printf("Keywords: %v\n", found)
}
```

View File

@@ -0,0 +1,33 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts a scanned PDF with quality processing enabled and reports the
// quality score, warning when it is below 0.5. A nil score is treated as 0.
func main() {
	cfg := &kreuzberg.ExtractionConfig{EnableQualityProcessing: true}

	res, err := kreuzberg.ExtractFileSync("scanned_document.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	// QualityScore is a pointer; fall back to the zero value when it is unset.
	var score float64
	if res.QualityScore != nil {
		score = *res.QualityScore
	}

	if score < 0.5 {
		fmt.Printf("Warning: Low quality extraction (%.2f)\n", score)
		fmt.Println("Consider re-scanning with higher DPI or adjusting OCR settings")
		return
	}
	fmt.Printf("Quality score: %.2f\n", score)
}
```

View File

@@ -0,0 +1,36 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main embeds texts with the "balanced" preset, first through EmbedTexts and
// then through EmbedTextsAsync, printing the number of vectors and their
// dimension. Any error panics.
func main() {
	modelName := "balanced"
	normalized := true
	cfg := kreuzberg.EmbeddingConfig{
		Model:     kreuzberg.EmbeddingModelType{Type: "preset", Name: &modelName},
		Normalize: &normalized,
	}

	// Synchronous
	vectors, err := kreuzberg.EmbedTexts([]string{"Hello, world!", "Kreuzberg is fast"}, cfg)
	if err != nil {
		panic(err)
	}
	count, dims := len(vectors), len(vectors[0])
	fmt.Println(count) // 2
	fmt.Println(dims)  // 768

	// Asynchronous
	vectors, err = kreuzberg.EmbedTextsAsync([]string{"Hello, world!"}, cfg)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(vectors[0])) // 768
}
```

View File

@@ -0,0 +1,24 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts a PDF with "moderate" token reduction that preserves
// important words, and logs the length of the resulting content.
func main() {
	keepImportant := true
	cfg := &kreuzberg.ExtractionConfig{
		TokenReduction: &kreuzberg.TokenReductionConfig{
			Mode:                   "moderate",
			PreserveImportantWords: &keepImportant,
		},
	}

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	log.Println("content length:", len(res.Content))
}
```

View File

@@ -0,0 +1,28 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts a PDF with "moderate" token reduction and prints the token
// statistics found in the result's additional metadata.
func main() {
	reduction := &kreuzberg.TokenReductionConfig{
		Mode:             "moderate",
		PreserveMarkdown: true,
	}

	res, err := kreuzberg.ExtractFileSync("verbose_document.pdf", &kreuzberg.ExtractionConfig{TokenReduction: reduction})
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	// Missing keys print as the zero value of the map's element type.
	stats := res.Metadata.Additional
	fmt.Printf("Original tokens: %v\n", stats["original_token_count"])
	fmt.Printf("Reduced tokens: %v\n", stats["token_count"])
	fmt.Printf("Reduction ratio: %v\n", stats["token_reduction_ratio"])
}
```