Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,54 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with OCR, chunking, language detection, caching
// and quality processing configured, then prints a preview of the first chunk
// and the detected languages when either is present.
func main() {
	maxChars := uint(1000)
	maxOverlap := uint(100)
	useCache := true
	enableQuality := true
	languageDetectionEnabled := true

	config := kreuzberg.ExtractionConfig{
		Ocr: &kreuzberg.OcrConfig{
			Backend:  "tesseract",
			Language: "eng+deu",
		},
		Chunking: &kreuzberg.ChunkingConfig{
			MaxCharacters: &maxChars,
			Overlap:       &maxOverlap,
		},
		LanguageDetection: &kreuzberg.LanguageDetectionConfig{
			Enabled:        &languageDetectionEnabled,
			DetectMultiple: true,
		},
		UseCache:                &useCache,
		EnableQualityProcessing: &enableQuality,
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	// Access chunks
	if len(result.Chunks) > 0 {
		// The %.100s precision truncates by runes. Slicing the string with
		// [:100] counts bytes and can cut a multi-byte character (for example
		// a German umlaut) in half, producing invalid UTF-8 in the output.
		fmt.Printf("First chunk: %.100s...\n", result.Chunks[0].Content)
	}

	// Access detected languages
	if len(result.DetectedLanguages) > 0 {
		fmt.Printf("Languages: %v\n", result.DetectedLanguages)
	}
}
```

View File

@@ -0,0 +1,103 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds an ExtractionConfig whose chunking section sets MaxCharacters to
// 1000 and Overlap to 200, then prints both values back.
func main() {
	var limit uint = 1000
	var shared uint = 200

	chunking := &kreuzberg.ChunkingConfig{
		Overlap:       &shared,
		MaxCharacters: &limit,
	}
	cfg := kreuzberg.ExtractionConfig{Chunking: chunking}

	// Both fields are pointers, so dereference them for printing.
	fmt.Printf("Config: MaxCharacters=%d, Overlap=%d\n",
		*cfg.Chunking.MaxCharacters, *cfg.Chunking.Overlap)
}
```
```go title="Go - Markdown with Heading Context"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.md using the markdown chunker with tokenizer-based
// sizing, then prints each chunk's heading context (when present) followed by
// a preview of the chunk content.
func main() {
	limit := uint(500)
	shared := uint(50)
	tokenizerModel := "Xenova/gpt-4o"
	kind := kreuzberg.ChunkerTypeMarkdown

	chunking := &kreuzberg.ChunkingConfig{
		ChunkerType:   &kind,
		MaxCharacters: &limit,
		Overlap:       &shared,
		Sizing: kreuzberg.ChunkSizing{
			Type:  "tokenizer",
			Model: &tokenizerModel,
		},
	}
	cfg := kreuzberg.ExtractionConfig{Chunking: chunking}

	res, err := kreuzberg.ExtractFile("document.md", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	for _, c := range res.Chunks {
		// HeadingContext may be nil, so guard before walking its headings.
		if hc := c.Metadata.HeadingContext; hc != nil {
			for _, h := range hc.Headings {
				fmt.Printf("Heading L%d: %s\n", h.Level, h.Text)
			}
		}
		fmt.Printf("Content: %.100s...\n", c.Content)
	}
}
```
```go title="Go - Prepend Heading Context"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.md using the markdown chunker with
// PrependHeadingContext enabled and prints a preview of every chunk.
func main() {
	limit := uint(500)
	shared := uint(50)
	kind := kreuzberg.ChunkerTypeMarkdown

	chunking := &kreuzberg.ChunkingConfig{
		PrependHeadingContext: true,
		ChunkerType:           &kind,
		MaxCharacters:         &limit,
		Overlap:               &shared,
	}
	cfg := kreuzberg.ExtractionConfig{Chunking: chunking}

	res, err := kreuzberg.ExtractFile("document.md", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	// Each chunk's content is prefixed with its heading breadcrumb
	for i := range res.Chunks {
		fmt.Printf("Content: %.100s...\n", res.Chunks[i].Content)
	}
}
```

View File

@@ -0,0 +1,24 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with UseCache and EnableQualityProcessing both
// set to true and logs the length of the extracted content.
func main() {
	cacheOn, qualityOn := true, true

	cfg := &kreuzberg.ExtractionConfig{
		EnableQualityProcessing: &qualityOn,
		UseCache:                &cacheOn,
	}

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	size := len(res.Content)
	log.Println("content length:", size)
}
```

View File

@@ -0,0 +1,23 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main loads an extraction config by calling LoadExtractionConfigFromFile with
// an empty path, extracts document.pdf with it, and logs the content length.
func main() {
	// NOTE(review): the empty path presumably triggers config discovery, as
	// the error message below suggests; confirm against the library.
	cfg, err := kreuzberg.LoadExtractionConfigFromFile("")
	if err != nil {
		log.Fatalf("discover config failed: %v", err)
	}

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	size := len(res.Content)
	log.Printf("Content length: %d", size)
}
```

View File

@@ -0,0 +1,23 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main loads the extraction config from kreuzberg.toml, extracts document.pdf
// with it, and logs the MIME type reported on the result.
func main() {
	const configPath = "kreuzberg.toml"

	cfg, err := kreuzberg.LoadExtractionConfigFromFile(configPath)
	if err != nil {
		log.Fatalf("load config failed: %v", err)
	}

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	mime := res.MimeType
	log.Printf("Detected MIME: %s", mime)
}
```

View File

@@ -0,0 +1,19 @@
```go title="Go"
package main
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
// main constructs (and discards) an ExtractionConfig that selects the
// tesseract OCR backend for "eng+fra" with Psm set to 3.
func main() {
	psmValue := int32(3)

	tess := &kreuzberg.TesseractConfig{Psm: &psmValue}
	ocr := &kreuzberg.OcrConfig{
		TesseractConfig: tess,
		Language:        "eng+fra",
		Backend:         "tesseract",
	}

	// The config is only built for illustration; nothing is extracted.
	_ = kreuzberg.ExtractionConfig{Ocr: ocr}
}
```

View File

@@ -0,0 +1,37 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with caching enabled, tesseract OCR (Psm 6) and
// chunking (MaxCharacters 1000, Overlap 200), then logs the content length.
func main() {
	psmValue := int32(6)
	limit := uint(1000)
	shared := uint(200)
	cacheOn := true

	ocr := &kreuzberg.OcrConfig{
		Backend:         "tesseract",
		TesseractConfig: &kreuzberg.TesseractConfig{Psm: &psmValue},
	}
	chunking := &kreuzberg.ChunkingConfig{
		Overlap:       &shared,
		MaxCharacters: &limit,
	}
	cfg := kreuzberg.ExtractionConfig{
		Chunking: chunking,
		Ocr:      ocr,
		UseCache: &cacheOn,
	}

	res, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	size := len(res.Content)
	log.Printf("Content length: %d", size)
}
```

View File

@@ -0,0 +1,25 @@
```go title="Document Structure Config (Go)"
package main
import (
"fmt"
kreuzberg "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with document-structure output requested and
// prints the node type of every node in result.Document, if one is returned.
func main() {
	includeStructure := kreuzberg.WithIncludeDocumentStructure(true)
	cfg := kreuzberg.NewExtractionConfig(includeStructure)

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		panic(err)
	}

	// Document may be nil; there is nothing to print in that case.
	doc := res.Document
	if doc == nil {
		return
	}
	for _, n := range doc.Nodes {
		fmt.Printf("[%s]\n", n.Content.NodeType)
	}
}
```

View File

@@ -0,0 +1,60 @@
```go title="Element-Based Output (Go)"
package main
import (
"fmt"
"kreuzberg"
)
// main extracts document.pdf with the "element_based" output format, prints
// each element's type, a text preview, and its page number and coordinates
// when present, then lists the elements whose type is "title".
func main() {
	// Configure element-based output
	config := &kreuzberg.ExtractionConfig{
		OutputFormat: "element_based",
	}

	// Extract document
	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		panic(err)
	}

	// Access elements
	for _, element := range result.Elements {
		fmt.Printf("Type: %s\n", element.ElementType)

		// The %.100s precision truncates by runes. Slicing with [:100] counts
		// bytes and can cut a multi-byte character in half, producing invalid
		// UTF-8 in the output.
		fmt.Printf("Text: %.100s\n", element.Text)

		if element.Metadata.PageNumber != nil {
			fmt.Printf("Page: %d\n", *element.Metadata.PageNumber)
		}
		if element.Metadata.Coordinates != nil {
			coords := element.Metadata.Coordinates
			fmt.Printf("Coords: (%f, %f) - (%f, %f)\n",
				coords.Left, coords.Top, coords.Right, coords.Bottom)
		}
		fmt.Println("---")
	}

	// Filter by element type
	var titles []kreuzberg.Element
	for _, element := range result.Elements {
		if element.ElementType == "title" {
			titles = append(titles, element)
		}
	}

	for _, title := range titles {
		// Fall back to "unknown" when "level" is missing or not a string.
		level, ok := title.Metadata.Additional["level"].(string)
		if !ok {
			level = "unknown"
		}
		fmt.Printf("[%s] %s\n", level, title.Text)
	}
}
```

View File

@@ -0,0 +1,100 @@
package main
import (
"kreuzberg"
)
// main constructs three EmbeddingConfig variants in turn and then shows how an
// EmbeddingConfig is attached to ChunkingConfig and ExtractionConfig. The
// values are only built; no extraction is performed.
func main() {
	// Example 1: Preset model (recommended)
	// Presets are ready-made fast, balanced, or quality configurations tuned
	// for common use cases.
	embedding := kreuzberg.EmbeddingConfig{
		CacheDir:             "~/.cache/kreuzberg/embeddings",
		ShowDownloadProgress: true,
		Normalize:            true,
		BatchSize:            32,
		Model: kreuzberg.EmbeddingModelType{
			Type: "preset",
			Name: "balanced",
		},
	}
	// Available presets:
	// - "fast" (384 dims): Quick prototyping, development, resource-constrained
	// - "balanced" (768 dims): Production, general-purpose RAG, English documents
	// - "quality" (1024 dims): Complex documents, maximum accuracy
	// - "multilingual" (768 dims): International documents, 100+ languages

	// Example 2: Custom ONNX model (requires embeddings feature)
	// Selects a specific ONNX embedding model from HuggingFace and states its
	// dimensions explicitly.
	embedding = kreuzberg.EmbeddingConfig{
		CacheDir:             "", // Uses default: .kreuzberg/embeddings/
		ShowDownloadProgress: true,
		Normalize:            true,
		BatchSize:            32,
		Model: kreuzberg.EmbeddingModelType{
			Type:       "custom",
			ModelID:    "BAAI/bge-small-en-v1.5",
			Dimensions: 384,
		},
	}
	// Popular ONNX-compatible models:
	// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
	// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
	// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
	// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support

	// Example 3: Alternative Custom ONNX Model
	// For advanced users who want a different ONNX embedding model.
	embedding = kreuzberg.EmbeddingConfig{
		CacheDir:             "/var/cache/embeddings",
		ShowDownloadProgress: true,
		Normalize:            true,
		BatchSize:            16, // Larger model requires smaller batch size
		Model: kreuzberg.EmbeddingModelType{
			Type:       "custom",
			ModelID:    "sentence-transformers/all-mpnet-base-v2",
			Dimensions: 768,
		},
	}

	// Integration with ChunkingConfig
	// Embeddings are added through the Embedding field of the chunking
	// configuration:
	chunking := kreuzberg.ChunkingConfig{
		Preset:     "balanced",
		MaxChars:   1024,
		MaxOverlap: 100,
		Embedding: &kreuzberg.EmbeddingConfig{
			Normalize: true,
			BatchSize: 32,
			Model: kreuzberg.EmbeddingModelType{
				Type: "preset",
				Name: "balanced",
			},
		},
	}
	extraction := kreuzberg.ExtractionConfig{Chunking: &chunking}

	// Mark both values as used so the example compiles.
	_ = embedding
	_ = extraction
}
// Key parameter explanations:
//
// BatchSize: Number of texts to embed at once (32-128 typical)
// - Larger batches are faster but use more memory
// - Smaller batches for resource-constrained environments
//
// Normalize: Whether to normalize vectors (L2 norm)
// - true (recommended): Enables cosine similarity in vector DBs
// - false: Raw embedding values
//
// CacheDir: Where to store downloaded models
// - Empty string: Uses .kreuzberg/embeddings/ in current directory
// - Non-empty: Custom directory for model storage
//
// ShowDownloadProgress: Display download progress bar
// - Useful for monitoring large model downloads

View File

@@ -0,0 +1,37 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with chunking (MaxCharacters 1000) and a preset
// embedding model configured, then logs the length of the extracted content.
func main() {
	maxChars := uint(1000)
	batchSize := uint(16)
	normalize := true
	// With Type "preset" the Name should be one of the preset names these docs
	// list ("fast", "balanced", "quality", "multilingual"). The previous value
	// "all-mpnet-base-v2" is a model ID, not a preset name.
	// NOTE(review): confirm the preset list against the kreuzberg release in use.
	modelName := "balanced"

	cfg := kreuzberg.ExtractionConfig{
		Chunking: &kreuzberg.ChunkingConfig{
			MaxCharacters: &maxChars,
			Embedding: &kreuzberg.EmbeddingConfig{
				Model: kreuzberg.EmbeddingModelType{
					Type: "preset",
					Name: &modelName,
				},
				BatchSize:            &batchSize,
				Normalize:            &normalize,
				ShowDownloadProgress: true,
			},
		},
	}

	result, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println("content length:", len(result.Content))
}
```

View File

@@ -0,0 +1,94 @@
package main
import (
"kreuzberg"
)
// main constructs three HierarchyConfig variants (basic, minimal clustering,
// and OCR coverage threshold) and wraps each in PdfConfig and
// ExtractionConfig. The values are only built; no extraction is performed.
func main() {
	// Helpers for taking the address of a literal value.
	boolPtr := func(b bool) *bool { return &b }
	intPtr := func(i int) *int { return &i }

	// Example 1: Basic hierarchy extraction
	// Enabled with default KClusters=6 for standard H1-H6 heading hierarchy.
	// Extract bounding box information for spatial layout awareness.
	hierarchyConfigBasic := kreuzberg.HierarchyConfig{
		Enabled:              boolPtr(true),
		KClusters:            intPtr(6),     // Default: creates 6 font size clusters (H1-H6 structure)
		IncludeBbox:          boolPtr(true), // Include bounding box coordinates
		OcrCoverageThreshold: nil,           // No OCR coverage threshold
	}
	pdfConfigBasic := kreuzberg.PdfConfig{
		Hierarchy: &hierarchyConfigBasic,
	}
	extractionConfigBasic := kreuzberg.ExtractionConfig{
		PdfOptions: &pdfConfigBasic,
	}
	// Use with ExtractFileSync or ExtractBytesSync
	// result, err := kreuzberg.ExtractFileSync("document.pdf", extractionConfigBasic)
	//
	// The call above is commented out, so mark the variable as used; Go
	// rejects a local that is declared and never used.
	_ = extractionConfigBasic

	// Example 2: Custom KClusters for minimal structure
	// Use 3 clusters for simpler hierarchy with minimal structure.
	// Useful when you only need major section divisions (Main, Subsection, Detail).
	hierarchyConfigMinimal := kreuzberg.HierarchyConfig{
		Enabled:              boolPtr(true),
		KClusters:            intPtr(3), // Minimal clustering: just 3 levels
		IncludeBbox:          boolPtr(true),
		OcrCoverageThreshold: nil,
	}
	pdfConfigMinimal := kreuzberg.PdfConfig{
		Hierarchy: &hierarchyConfigMinimal,
	}
	extractionConfigMinimal := kreuzberg.ExtractionConfig{
		PdfOptions: &pdfConfigMinimal,
	}
	_ = extractionConfigMinimal

	// Example 3: With OCR coverage threshold
	// Trigger OCR if less than 50% of text has font data.
	// Useful for documents with mixed digital and scanned content.
	ocrThreshold := 0.5
	hierarchyConfigOcr := kreuzberg.HierarchyConfig{
		Enabled:              boolPtr(true),
		KClusters:            intPtr(6),
		IncludeBbox:          boolPtr(true),
		OcrCoverageThreshold: &ocrThreshold, // Trigger OCR if text coverage < 50%
	}
	pdfConfigOcr := kreuzberg.PdfConfig{
		Hierarchy: &hierarchyConfigOcr,
	}
	extractionConfigOcr := kreuzberg.ExtractionConfig{
		PdfOptions: &pdfConfigOcr,
	}
	_ = extractionConfigOcr
}
// Field descriptions:
//
// Enabled: *bool (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// KClusters: *int (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// IncludeBbox: *bool (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// OcrCoverageThreshold: *float64 (default: nil)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
// - nil means no OCR coverage-based triggering

View File

@@ -0,0 +1,31 @@
```go title="Go"
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with HTML output, the GitHub theme and embedded
// CSS configured, and prints the resulting content.
func main() {
	format := kreuzberg.OutputFormatHTML
	htmlTheme := kreuzberg.HTMLThemeGitHub
	inlineCSS := true

	htmlOptions := &kreuzberg.HTMLOutputConfig{
		EmbedCSS: &inlineCSS,
		Theme:    &htmlTheme,
	}
	cfg := &kreuzberg.ExtractionConfig{
		HTMLOutput:   htmlOptions,
		OutputFormat: &format,
	}

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	fmt.Println(res.Content) // HTML with kb-* classes
}
```

View File

@@ -0,0 +1,89 @@
package main
import (
"fmt"
"kreuzberg"
)
// Example 1: Basic YAKE configuration
// Uses YAKE algorithm with default parameters and English stopword filtering
// basicYake extracts document.pdf with a "yake" keyword configuration
// (10 keywords, minimum score 0.0, n-gram range 1-3, language "en") and
// prints result.Keywords. It returns the extraction error, if any.
func basicYake() error {
	// YakeParams and RakeParams are left at their zero value (nil).
	keywords := &kreuzberg.KeywordConfig{
		Language:    "en",
		NgramRange:  [2]int{1, 3},
		MinScore:    0.0,
		MaxKeywords: 10,
		Algorithm:   "yake",
	}
	cfg := &kreuzberg.ExtractionConfig{Keywords: keywords}

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		return err
	}

	fmt.Printf("Keywords: %v\n", res.Keywords)
	return nil
}
// Example 2: Advanced YAKE with custom parameters
// Fine-tunes YAKE with custom window size for co-occurrence analysis
// advancedYake extracts document.pdf with a "yake" keyword configuration that
// also sets YakeParams.WindowSize (15 keywords, minimum score 0.1, n-gram
// range 1-2) and prints result.Keywords. It returns the extraction error, if
// any.
func advancedYake() error {
	// RakeParams is left at its zero value (nil).
	keywords := &kreuzberg.KeywordConfig{
		YakeParams:  &kreuzberg.YakeParams{WindowSize: 1},
		Language:    "en",
		NgramRange:  [2]int{1, 2},
		MinScore:    0.1,
		MaxKeywords: 15,
		Algorithm:   "yake",
	}
	cfg := &kreuzberg.ExtractionConfig{Keywords: keywords}

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		return err
	}

	fmt.Printf("Keywords: %v\n", res.Keywords)
	return nil
}
// Example 3: RAKE configuration
// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
// rakeConfig extracts document.pdf with a "rake" keyword configuration that
// sets RakeParams (minimum word length 1, at most 3 words per phrase) and
// prints result.Keywords. It returns the extraction error, if any.
func rakeConfig() error {
	phraseLimits := &kreuzberg.RakeParams{
		MaxWordsPerPhrase: 3,
		MinWordLength:     1,
	}
	// YakeParams is left at its zero value (nil).
	keywords := &kreuzberg.KeywordConfig{
		RakeParams:  phraseLimits,
		Language:    "en",
		NgramRange:  [2]int{1, 3},
		MinScore:    5.0,
		MaxKeywords: 10,
		Algorithm:   "rake",
	}
	cfg := &kreuzberg.ExtractionConfig{Keywords: keywords}

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		return err
	}

	fmt.Printf("Keywords: %v\n", res.Keywords)
	return nil
}
// main runs the basic YAKE example and prints the error if it fails.
func main() {
	err := basicYake()
	if err == nil {
		return
	}
	fmt.Println("Error:", err)
}

View File

@@ -0,0 +1,26 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds an ExtractionConfig with a keyword section and prints the
// configured algorithm, keyword limit and minimum score.
func main() {
	keywords := &kreuzberg.KeywordConfig{
		Language:    "en",
		NgramRange:  "1,3",
		MinScore:    0.3,
		MaxKeywords: 10,
		Algorithm:   "YAKE",
	}
	cfg := &kreuzberg.ExtractionConfig{Keywords: keywords}

	kw := cfg.Keywords
	fmt.Printf("Keywords config: Algorithm=%s, MaxKeywords=%d, MinScore=%f\n",
		kw.Algorithm, kw.MaxKeywords, kw.MinScore)
}
```

View File

@@ -0,0 +1,23 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds an ExtractionConfig with language detection enabled, a minimum
// confidence of 0.8 and DetectMultiple off, then prints the Enabled and
// MinConfidence values.
func main() {
	threshold := 0.8

	detection := &kreuzberg.LanguageDetectionConfig{
		DetectMultiple: false,
		MinConfidence:  &threshold,
		Enabled:        true,
	}
	cfg := &kreuzberg.ExtractionConfig{LanguageDetection: detection}

	fmt.Printf("Language detection enabled: %v\n", cfg.LanguageDetection.Enabled)
	// MinConfidence is a pointer, so dereference it for printing.
	fmt.Printf("Min confidence: %f\n", *cfg.LanguageDetection.MinConfidence)
}
```

View File

@@ -0,0 +1,28 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts scanned.pdf using the tesseract backend with image
// preprocessing set to a TargetDpi of 300 and logs the content length.
func main() {
	dpi := int32(300)

	preprocessing := &kreuzberg.ImagePreprocessingConfig{TargetDpi: &dpi}
	ocr := &kreuzberg.OcrConfig{
		TesseractConfig: &kreuzberg.TesseractConfig{Preprocessing: preprocessing},
		Backend:         "tesseract",
	}
	cfg := kreuzberg.ExtractionConfig{Ocr: ocr}

	res, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	size := len(res.Content)
	log.Println("content length:", size)
}
```

View File

@@ -0,0 +1,26 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with PDF options set (image extraction, metadata
// extraction, two candidate passwords and an empty HierarchyConfig) and logs
// the content length.
func main() {
	withMetadata := true

	pdf := &kreuzberg.PdfConfig{
		Hierarchy:       &kreuzberg.HierarchyConfig{},
		Passwords:       []string{"password1", "password2"},
		ExtractMetadata: &withMetadata,
		ExtractImages:   true,
	}
	cfg := kreuzberg.ExtractionConfig{PdfOptions: pdf}

	res, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	size := len(res.Content)
	log.Println("content length:", size)
}
```

View File

@@ -0,0 +1,42 @@
```go title="Go"
package main
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
// main constructs two ExtractionConfig values whose PDF options differ only in
// the hierarchy KClusters value (6 for the basic one, 7 for the advanced one).
// The values are only built; no extraction is performed.
func main() {
	enabled := true
	includeBbox := true
	kClusters := uint(6)
	// The hierarchy field notes in these docs give 1-7 as the valid KClusters
	// range, so the advanced example uses 7; the previous value of 12 was
	// outside that range.
	// NOTE(review): confirm the range against the kreuzberg release in use.
	kClustersAdvanced := uint(7)
	threshold := float32(0.8)

	// Basic hierarchy configuration
	config := kreuzberg.ExtractionConfig{
		PdfOptions: &kreuzberg.PdfConfig{
			ExtractImages: true,
			Hierarchy: &kreuzberg.HierarchyConfig{
				Enabled:              &enabled,
				KClusters:            &kClusters,
				IncludeBbox:          &includeBbox,
				OcrCoverageThreshold: &threshold,
			},
		},
	}

	// Advanced hierarchy configuration with more clusters
	advancedConfig := kreuzberg.ExtractionConfig{
		PdfOptions: &kreuzberg.PdfConfig{
			ExtractImages: true,
			Hierarchy: &kreuzberg.HierarchyConfig{
				Enabled:              &enabled,
				KClusters:            &kClustersAdvanced,
				IncludeBbox:          &includeBbox,
				OcrCoverageThreshold: &threshold,
			},
		},
	}

	// Mark both values as used so the example compiles.
	_ = config
	_ = advancedConfig
}
```

View File

@@ -0,0 +1,18 @@
```go title="Go"
package main
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
// main constructs (and discards) an ExtractionConfig whose post-processor
// section is enabled with explicit lists of enabled and disabled processors.
func main() {
	on := true

	post := &kreuzberg.PostProcessorConfig{
		DisabledProcessors: []string{"mojibake_fix"},
		EnabledProcessors:  []string{"deduplication", "whitespace_normalization"},
		Enabled:            &on,
	}

	// The config is only built for illustration; nothing is extracted.
	_ = &kreuzberg.ExtractionConfig{Postprocessor: post}
}
```

View File

@@ -0,0 +1,17 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds an ExtractionConfig with EnableQualityProcessing set to true and
// prints that field.
func main() {
	cfg := new(kreuzberg.ExtractionConfig)
	cfg.EnableQualityProcessing = true // Default

	fmt.Printf("Quality processing enabled: %v\n", cfg.EnableQualityProcessing)
}
```

View File

@@ -0,0 +1,37 @@
```go title="Go"
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf using the tesseract backend for "eng+fra+deu"
// with Psm 6, Oem 1, a minimum confidence of 0.8, table detection enabled and
// a character whitelist, then logs the content length.
func main() {
	psmValue := int32(6)
	oemValue := int32(1)
	detectTables := true
	allowedChars := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?"

	tess := &kreuzberg.TesseractConfig{
		TesseditCharWhitelist: allowedChars,
		EnableTableDetection:  &detectTables,
		MinConfidence:         0.8,
		Oem:                   &oemValue,
		Psm:                   &psmValue,
	}
	cfg := kreuzberg.ExtractionConfig{
		Ocr: &kreuzberg.OcrConfig{
			TesseractConfig: tess,
			Language:        "eng+fra+deu",
			Backend:         "tesseract",
		},
	}

	res, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	size := len(res.Content)
	log.Println("content length:", size)
}
```

View File

@@ -0,0 +1,23 @@
```go title="Go"
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main builds an ExtractionConfig with token reduction in "moderate" mode and
// PreserveImportantWords set to true, then prints both settings.
func main() {
	keepImportant := true

	reduction := &kreuzberg.TokenReductionOptions{
		PreserveImportantWords: &keepImportant,
		Mode:                   "moderate",
	}
	cfg := kreuzberg.ExtractionConfig{TokenReduction: reduction}

	// PreserveImportantWords is a pointer, so dereference it for printing.
	fmt.Printf("Mode: %s, Preserve Important Words: %v\n",
		cfg.TokenReduction.Mode,
		*cfg.TokenReduction.PreserveImportantWords)
}
```