This commit is contained in:
54
docs/snippets/go/config/advanced_config.md
Normal file
54
docs/snippets/go/config/advanced_config.md
Normal file
@@ -0,0 +1,54 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(1000)
|
||||
maxOverlap := uint(100)
|
||||
useCache := true
|
||||
enableQuality := true
|
||||
languageDetectionEnabled := true
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng+deu",
|
||||
},
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &maxOverlap,
|
||||
},
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: &languageDetectionEnabled,
|
||||
DetectMultiple: true,
|
||||
},
|
||||
UseCache: &useCache,
|
||||
EnableQualityProcessing: &enableQuality,
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
// Access chunks
|
||||
if len(result.Chunks) > 0 {
|
||||
snippet := result.Chunks[0].Content
|
||||
if len(snippet) > 100 {
|
||||
snippet = snippet[:100]
|
||||
}
|
||||
fmt.Printf("First chunk: %s...\n", snippet)
|
||||
}
|
||||
|
||||
// Access detected languages
|
||||
if len(result.DetectedLanguages) > 0 {
|
||||
fmt.Printf("Languages: %v\n", result.DetectedLanguages)
|
||||
}
|
||||
}
|
||||
```
|
||||
103
docs/snippets/go/config/chunking_config.md
Normal file
103
docs/snippets/go/config/chunking_config.md
Normal file
@@ -0,0 +1,103 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(1000)
|
||||
overlap := uint(200)
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Printf("Config: MaxCharacters=%d, Overlap=%d\n",
|
||||
*config.Chunking.MaxCharacters, *config.Chunking.Overlap)
|
||||
}
|
||||
```
|
||||
|
||||
```go title="Go - Markdown with Heading Context"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(500)
|
||||
overlap := uint(50)
|
||||
model := "Xenova/gpt-4o"
|
||||
chunkerType := kreuzberg.ChunkerTypeMarkdown
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
ChunkerType: &chunkerType,
|
||||
Sizing: kreuzberg.ChunkSizing{
|
||||
Type: "tokenizer",
|
||||
Model: &model,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFile("document.md", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
for _, chunk := range result.Chunks {
|
||||
if chunk.Metadata.HeadingContext != nil {
|
||||
for _, heading := range chunk.Metadata.HeadingContext.Headings {
|
||||
fmt.Printf("Heading L%d: %s\n", heading.Level, heading.Text)
|
||||
}
|
||||
}
|
||||
fmt.Printf("Content: %.100s...\n", chunk.Content)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```go title="Go - Prepend Heading Context"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(500)
|
||||
overlap := uint(50)
|
||||
chunkerType := kreuzberg.ChunkerTypeMarkdown
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
ChunkerType: &chunkerType,
|
||||
PrependHeadingContext: true,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFile("document.md", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
for _, chunk := range result.Chunks {
|
||||
// Each chunk's content is prefixed with its heading breadcrumb
|
||||
fmt.Printf("Content: %.100s...\n", chunk.Content)
|
||||
}
|
||||
}
|
||||
```
|
||||
24
docs/snippets/go/config/config_basic.md
Normal file
24
docs/snippets/go/config/config_basic.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
useCache := true
|
||||
enableQP := true
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
|
||||
UseCache: &useCache,
|
||||
EnableQualityProcessing: &enableQP,
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/config/config_discover.md
Normal file
23
docs/snippets/go/config/config_discover.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config, err := kreuzberg.LoadExtractionConfigFromFile("")
|
||||
if err != nil {
|
||||
log.Fatalf("discover config failed: %v", err)
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Content length: %d", len(result.Content))
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/config/config_file.md
Normal file
23
docs/snippets/go/config/config_file.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config, err := kreuzberg.LoadExtractionConfigFromFile("kreuzberg.toml")
|
||||
if err != nil {
|
||||
log.Fatalf("load config failed: %v", err)
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Detected MIME: %s", result.MimeType)
|
||||
}
|
||||
```
|
||||
19
docs/snippets/go/config/config_ocr.md
Normal file
19
docs/snippets/go/config/config_ocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
|
||||
func main() {
|
||||
psm := int32(3)
|
||||
|
||||
_ = kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng+fra",
|
||||
TesseractConfig: &kreuzberg.TesseractConfig{
|
||||
Psm: &psm,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
```
|
||||
37
docs/snippets/go/config/config_programmatic.md
Normal file
37
docs/snippets/go/config/config_programmatic.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
psm := int32(6)
|
||||
maxChars := uint(1000)
|
||||
overlap := uint(200)
|
||||
useCache := true
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
UseCache: &useCache,
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
TesseractConfig: &kreuzberg.TesseractConfig{
|
||||
Psm: &psm,
|
||||
},
|
||||
},
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Overlap: &overlap,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Printf("Content length: %d", len(result.Content))
|
||||
}
|
||||
```
|
||||
25
docs/snippets/go/config/document_structure_config.md
Normal file
25
docs/snippets/go/config/document_structure_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```go title="Document Structure Config (Go)"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
kreuzberg "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := kreuzberg.NewExtractionConfig(
|
||||
kreuzberg.WithIncludeDocumentStructure(true),
|
||||
)
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if result.Document != nil {
|
||||
for _, node := range result.Document.Nodes {
|
||||
fmt.Printf("[%s]\n", node.Content.NodeType)
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
60
docs/snippets/go/config/element_based_output.md
Normal file
60
docs/snippets/go/config/element_based_output.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```go title="Element-Based Output (Go)"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"kreuzberg"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Configure element-based output
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
OutputFormat: "element_based",
|
||||
}
|
||||
|
||||
// Extract document
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// Access elements
|
||||
for _, element := range result.Elements {
|
||||
fmt.Printf("Type: %s\n", element.ElementType)
|
||||
|
||||
text := element.Text
|
||||
if len(text) > 100 {
|
||||
text = text[:100]
|
||||
}
|
||||
fmt.Printf("Text: %s\n", text)
|
||||
|
||||
if element.Metadata.PageNumber != nil {
|
||||
fmt.Printf("Page: %d\n", *element.Metadata.PageNumber)
|
||||
}
|
||||
|
||||
if element.Metadata.Coordinates != nil {
|
||||
coords := element.Metadata.Coordinates
|
||||
fmt.Printf("Coords: (%f, %f) - (%f, %f)\n",
|
||||
coords.Left, coords.Top, coords.Right, coords.Bottom)
|
||||
}
|
||||
|
||||
fmt.Println("---")
|
||||
}
|
||||
|
||||
// Filter by element type
|
||||
var titles []kreuzberg.Element
|
||||
for _, element := range result.Elements {
|
||||
if element.ElementType == "title" {
|
||||
titles = append(titles, element)
|
||||
}
|
||||
}
|
||||
|
||||
for _, title := range titles {
|
||||
level, ok := title.Metadata.Additional["level"].(string)
|
||||
if !ok {
|
||||
level = "unknown"
|
||||
}
|
||||
fmt.Printf("[%s] %s\n", level, title.Text)
|
||||
}
|
||||
}
|
||||
```
|
||||
100
docs/snippets/go/config/embedding_config.go
Normal file
100
docs/snippets/go/config/embedding_config.go
Normal file
@@ -0,0 +1,100 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"kreuzberg"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Example 1: Preset model (recommended)
|
||||
// Fast, balanced, or quality preset configurations optimized for common use cases.
|
||||
embeddingConfig := kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "preset",
|
||||
Name: "balanced",
|
||||
},
|
||||
BatchSize: 32,
|
||||
Normalize: true,
|
||||
ShowDownloadProgress: true,
|
||||
CacheDir: "~/.cache/kreuzberg/embeddings",
|
||||
}
|
||||
|
||||
// Available presets:
|
||||
// - "fast" (384 dims): Quick prototyping, development, resource-constrained
|
||||
// - "balanced" (768 dims): Production, general-purpose RAG, English documents
|
||||
// - "quality" (1024 dims): Complex documents, maximum accuracy
|
||||
// - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
// Example 2: Custom ONNX model (requires embeddings feature)
|
||||
// Direct access to specific ONNX embedding models from HuggingFace with custom dimensions.
|
||||
embeddingConfig = kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "custom",
|
||||
ModelID: "BAAI/bge-small-en-v1.5",
|
||||
Dimensions: 384,
|
||||
},
|
||||
BatchSize: 32,
|
||||
Normalize: true,
|
||||
ShowDownloadProgress: true,
|
||||
CacheDir: "", // Uses default: .kreuzberg/embeddings/
|
||||
}
|
||||
|
||||
// Popular ONNX-compatible models:
|
||||
// - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
|
||||
// - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
|
||||
// - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
|
||||
// - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
// Example 3: Alternative Custom ONNX Model
|
||||
// For advanced users wanting different ONNX embedding models.
|
||||
embeddingConfig = kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "custom",
|
||||
ModelID: "sentence-transformers/all-mpnet-base-v2",
|
||||
Dimensions: 768,
|
||||
},
|
||||
BatchSize: 16, // Larger model requires smaller batch size
|
||||
Normalize: true,
|
||||
ShowDownloadProgress: true,
|
||||
CacheDir: "/var/cache/embeddings",
|
||||
}
|
||||
|
||||
// Integration with ChunkingConfig
|
||||
// Add embeddings to your chunking configuration:
|
||||
chunkingConfig := kreuzberg.ChunkingConfig{
|
||||
MaxChars: 1024,
|
||||
MaxOverlap: 100,
|
||||
Preset: "balanced",
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "preset",
|
||||
Name: "balanced",
|
||||
},
|
||||
BatchSize: 32,
|
||||
Normalize: true,
|
||||
},
|
||||
}
|
||||
|
||||
extractionConfig := kreuzberg.ExtractionConfig{
|
||||
Chunking: &chunkingConfig,
|
||||
}
|
||||
|
||||
_ = embeddingConfig
|
||||
_ = extractionConfig
|
||||
}
|
||||
|
||||
// Key parameter explanations:
|
||||
//
|
||||
// BatchSize: Number of texts to embed at once (32-128 typical)
|
||||
// - Larger batches are faster but use more memory
|
||||
// - Smaller batches for resource-constrained environments
|
||||
//
|
||||
// Normalize: Whether to normalize vectors (L2 norm)
|
||||
// - true (recommended): Enables cosine similarity in vector DBs
|
||||
// - false: Raw embedding values
|
||||
//
|
||||
// CacheDir: Where to store downloaded models
|
||||
// - Empty string: Uses .kreuzberg/embeddings/ in current directory
|
||||
// - Non-empty: Custom directory for model storage
|
||||
//
|
||||
// ShowDownloadProgress: Display download progress bar
|
||||
// - Useful for monitoring large model downloads
|
||||
37
docs/snippets/go/config/embedding_config.md
Normal file
37
docs/snippets/go/config/embedding_config.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
maxChars := uint(1000)
|
||||
batchSize := uint(16)
|
||||
normalize := true
|
||||
modelName := "all-mpnet-base-v2"
|
||||
|
||||
cfg := kreuzberg.ExtractionConfig{
|
||||
Chunking: &kreuzberg.ChunkingConfig{
|
||||
MaxCharacters: &maxChars,
|
||||
Embedding: &kreuzberg.EmbeddingConfig{
|
||||
Model: kreuzberg.EmbeddingModelType{
|
||||
Type: "preset",
|
||||
Name: &modelName,
|
||||
},
|
||||
BatchSize: &batchSize,
|
||||
Normalize: &normalize,
|
||||
ShowDownloadProgress: true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
94
docs/snippets/go/config/hierarchy_config.go
Normal file
94
docs/snippets/go/config/hierarchy_config.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"kreuzberg"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Example 1: Basic hierarchy extraction
|
||||
// Enabled with default KClusters=6 for standard H1-H6 heading hierarchy.
|
||||
// Extract bounding box information for spatial layout awareness.
|
||||
hierarchyConfigBasic := kreuzberg.HierarchyConfig{
|
||||
Enabled: func(b bool) *bool { return &b }(true),
|
||||
KClusters: func(i int) *int { return &i }(6), // Default: creates 6 font size clusters (H1-H6 structure)
|
||||
IncludeBbox: func(b bool) *bool { return &b }(true), // Include bounding box coordinates
|
||||
OcrCoverageThreshold: nil, // No OCR coverage threshold
|
||||
}
|
||||
|
||||
pdfConfigBasic := kreuzberg.PdfConfig{
|
||||
Hierarchy: &hierarchyConfigBasic,
|
||||
}
|
||||
|
||||
extractionConfigBasic := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &pdfConfigBasic,
|
||||
}
|
||||
|
||||
// Use with ExtractFileSync or ExtractBytesSync
|
||||
// result, err := kreuzberg.ExtractFileSync("document.pdf", extractionConfigBasic)
|
||||
|
||||
|
||||
// Example 2: Custom KClusters for minimal structure
|
||||
// Use 3 clusters for simpler hierarchy with minimal structure.
|
||||
// Useful when you only need major section divisions (Main, Subsection, Detail).
|
||||
hierarchyConfigMinimal := kreuzberg.HierarchyConfig{
|
||||
Enabled: func(b bool) *bool { return &b }(true),
|
||||
KClusters: func(i int) *int { return &i }(3), // Minimal clustering: just 3 levels
|
||||
IncludeBbox: func(b bool) *bool { return &b }(true),
|
||||
OcrCoverageThreshold: nil,
|
||||
}
|
||||
|
||||
pdfConfigMinimal := kreuzberg.PdfConfig{
|
||||
Hierarchy: &hierarchyConfigMinimal,
|
||||
}
|
||||
|
||||
extractionConfigMinimal := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &pdfConfigMinimal,
|
||||
}
|
||||
|
||||
_ = extractionConfigMinimal
|
||||
|
||||
|
||||
// Example 3: With OCR coverage threshold
|
||||
// Trigger OCR if less than 50% of text has font data.
|
||||
// Useful for documents with mixed digital and scanned content.
|
||||
ocrThreshold := 0.5
|
||||
hierarchyConfigOcr := kreuzberg.HierarchyConfig{
|
||||
Enabled: func(b bool) *bool { return &b }(true),
|
||||
KClusters: func(i int) *int { return &i }(6),
|
||||
IncludeBbox: func(b bool) *bool { return &b }(true),
|
||||
OcrCoverageThreshold: &ocrThreshold, // Trigger OCR if text coverage < 50%
|
||||
}
|
||||
|
||||
pdfConfigOcr := kreuzberg.PdfConfig{
|
||||
Hierarchy: &hierarchyConfigOcr,
|
||||
}
|
||||
|
||||
extractionConfigOcr := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &pdfConfigOcr,
|
||||
}
|
||||
|
||||
_ = extractionConfigOcr
|
||||
}
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// Enabled: *bool (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// KClusters: *int (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// IncludeBbox: *bool (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// OcrCoverageThreshold: *float64 (default: nil)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
|
||||
// - nil means no OCR coverage-based triggering
|
||||
31
docs/snippets/go/config/html_output.md
Normal file
31
docs/snippets/go/config/html_output.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
outputFormat := kreuzberg.OutputFormatHTML
|
||||
theme := kreuzberg.HTMLThemeGitHub
|
||||
embedCSS := true
|
||||
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
OutputFormat: &outputFormat,
|
||||
HTMLOutput: &kreuzberg.HTMLOutputConfig{
|
||||
Theme: &theme,
|
||||
EmbedCSS: &embedCSS,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
fmt.Println(result.Content) // HTML with kb-* classes
|
||||
}
|
||||
```
|
||||
89
docs/snippets/go/config/keyword_config.go
Normal file
89
docs/snippets/go/config/keyword_config.go
Normal file
@@ -0,0 +1,89 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"kreuzberg"
|
||||
)
|
||||
|
||||
// Example 1: Basic YAKE configuration
|
||||
// Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
func basicYake() error {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: "yake",
|
||||
MaxKeywords: 10,
|
||||
MinScore: 0.0,
|
||||
NgramRange: [2]int{1, 3},
|
||||
Language: "en",
|
||||
YakeParams: nil,
|
||||
RakeParams: nil,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("Keywords: %v\n", result.Keywords)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Example 2: Advanced YAKE with custom parameters
|
||||
// Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
func advancedYake() error {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: "yake",
|
||||
MaxKeywords: 15,
|
||||
MinScore: 0.1,
|
||||
NgramRange: [2]int{1, 2},
|
||||
Language: "en",
|
||||
YakeParams: &kreuzberg.YakeParams{
|
||||
WindowSize: 1,
|
||||
},
|
||||
RakeParams: nil,
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("Keywords: %v\n", result.Keywords)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Example 3: RAKE configuration
|
||||
// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
func rakeConfig() error {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: "rake",
|
||||
MaxKeywords: 10,
|
||||
MinScore: 5.0,
|
||||
NgramRange: [2]int{1, 3},
|
||||
Language: "en",
|
||||
YakeParams: nil,
|
||||
RakeParams: &kreuzberg.RakeParams{
|
||||
MinWordLength: 1,
|
||||
MaxWordsPerPhrase: 3,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fmt.Printf("Keywords: %v\n", result.Keywords)
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
if err := basicYake(); err != nil {
|
||||
fmt.Println("Error:", err)
|
||||
}
|
||||
}
|
||||
26
docs/snippets/go/config/keyword_extraction_config.md
Normal file
26
docs/snippets/go/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
Keywords: &kreuzberg.KeywordConfig{
|
||||
Algorithm: "YAKE",
|
||||
MaxKeywords: 10,
|
||||
MinScore: 0.3,
|
||||
NgramRange: "1,3",
|
||||
Language: "en",
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Printf("Keywords config: Algorithm=%s, MaxKeywords=%d, MinScore=%f\n",
|
||||
config.Keywords.Algorithm,
|
||||
config.Keywords.MaxKeywords,
|
||||
config.Keywords.MinScore)
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/config/language_detection_config.md
Normal file
23
docs/snippets/go/config/language_detection_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
minConfidence := 0.8
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
|
||||
Enabled: true,
|
||||
MinConfidence: &minConfidence,
|
||||
DetectMultiple: false,
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Printf("Language detection enabled: %v\n", config.LanguageDetection.Enabled)
|
||||
fmt.Printf("Min confidence: %f\n", *config.LanguageDetection.MinConfidence)
|
||||
}
|
||||
```
|
||||
28
docs/snippets/go/config/ocr_dpi_config.md
Normal file
28
docs/snippets/go/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
targetDpi := int32(300)
|
||||
result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
TesseractConfig: &kreuzberg.TesseractConfig{
|
||||
Preprocessing: &kreuzberg.ImagePreprocessingConfig{
|
||||
TargetDpi: &targetDpi,
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
26
docs/snippets/go/config/pdf_config.md
Normal file
26
docs/snippets/go/config/pdf_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
extractMetadata := true
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &kreuzberg.PdfConfig{
|
||||
ExtractImages: true,
|
||||
ExtractMetadata: &extractMetadata,
|
||||
Passwords: []string{"password1", "password2"},
|
||||
Hierarchy: &kreuzberg.HierarchyConfig{},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
42
docs/snippets/go/config/pdf_hierarchy_config.md
Normal file
42
docs/snippets/go/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
|
||||
func main() {
|
||||
enabled := true
|
||||
includeBbox := true
|
||||
kClusters := uint(6)
|
||||
kClustersAdvanced := uint(12)
|
||||
threshold := float32(0.8)
|
||||
|
||||
// Basic hierarchy configuration
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &kreuzberg.PdfConfig{
|
||||
ExtractImages: true,
|
||||
Hierarchy: &kreuzberg.HierarchyConfig{
|
||||
Enabled: &enabled,
|
||||
KClusters: &kClusters,
|
||||
IncludeBbox: &includeBbox,
|
||||
OcrCoverageThreshold: &threshold,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// Advanced hierarchy configuration with more clusters
|
||||
advancedConfig := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &kreuzberg.PdfConfig{
|
||||
ExtractImages: true,
|
||||
Hierarchy: &kreuzberg.HierarchyConfig{
|
||||
Enabled: &enabled,
|
||||
KClusters: &kClustersAdvanced,
|
||||
IncludeBbox: &includeBbox,
|
||||
OcrCoverageThreshold: &threshold,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_ = config
|
||||
_ = advancedConfig
|
||||
}
|
||||
```
|
||||
18
docs/snippets/go/config/postprocessor_config.md
Normal file
18
docs/snippets/go/config/postprocessor_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
|
||||
func main() {
|
||||
enabled := true
|
||||
cfg := &kreuzberg.ExtractionConfig{
|
||||
Postprocessor: &kreuzberg.PostProcessorConfig{
|
||||
Enabled: &enabled,
|
||||
EnabledProcessors: []string{"deduplication", "whitespace_normalization"},
|
||||
DisabledProcessors: []string{"mojibake_fix"},
|
||||
},
|
||||
}
|
||||
|
||||
_ = cfg
|
||||
}
|
||||
```
|
||||
17
docs/snippets/go/config/quality_processing_config.md
Normal file
17
docs/snippets/go/config/quality_processing_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
config := &kreuzberg.ExtractionConfig{
|
||||
EnableQualityProcessing: true, // Default
|
||||
}
|
||||
|
||||
fmt.Printf("Quality processing enabled: %v\n", config.EnableQualityProcessing)
|
||||
}
|
||||
```
|
||||
37
docs/snippets/go/config/tesseract_config.md
Normal file
37
docs/snippets/go/config/tesseract_config.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
psm := int32(6)
|
||||
oem := int32(1)
|
||||
enableTableDetection := true
|
||||
whitelist := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,!?"
|
||||
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
Ocr: &kreuzberg.OcrConfig{
|
||||
Backend: "tesseract",
|
||||
Language: "eng+fra+deu",
|
||||
TesseractConfig: &kreuzberg.TesseractConfig{
|
||||
Psm: &psm,
|
||||
Oem: &oem,
|
||||
MinConfidence: 0.8,
|
||||
EnableTableDetection: &enableTableDetection,
|
||||
TesseditCharWhitelist: whitelist,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := kreuzberg.ExtractFileSync("document.pdf", nil, config)
|
||||
if err != nil {
|
||||
log.Fatalf("extract failed: %v", err)
|
||||
}
|
||||
|
||||
log.Println("content length:", len(result.Content))
|
||||
}
|
||||
```
|
||||
23
docs/snippets/go/config/token_reduction_config.md
Normal file
23
docs/snippets/go/config/token_reduction_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```go title="Go"
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
|
||||
)
|
||||
|
||||
func main() {
|
||||
preserveImportant := true
|
||||
config := kreuzberg.ExtractionConfig{
|
||||
TokenReduction: &kreuzberg.TokenReductionOptions{
|
||||
Mode: "moderate",
|
||||
PreserveImportantWords: &preserveImportant,
|
||||
},
|
||||
}
|
||||
|
||||
fmt.Printf("Mode: %s, Preserve Important Words: %v\n",
|
||||
config.TokenReduction.Mode,
|
||||
*config.TokenReduction.PreserveImportantWords)
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user