Files
fil/docs/snippets/go/config/hierarchy_config.go
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

95 lines
3.2 KiB
Go

package main
import (
"kreuzberg"
)
// main builds three example ExtractionConfig values that differ only in their
// PDF hierarchy settings. Nothing is extracted here: each config is constructed
// and then discarded, so the snippet serves purely as a configuration reference.
func main() {
	// The HierarchyConfig fields used below are pointers, so literals need
	// an addressable copy. These local helpers return a pointer to a fresh copy
	// of their argument on every call.
	boolPtr := func(b bool) *bool { return &b }
	intPtr := func(i int) *int { return &i }

	// Example 1: Basic hierarchy extraction
	// Enabled with default KClusters=6 for standard H1-H6 heading hierarchy.
	// Extract bounding box information for spatial layout awareness.
	hierarchyConfigBasic := kreuzberg.HierarchyConfig{
		Enabled:              boolPtr(true),
		KClusters:            intPtr(6),     // Default: creates 6 font size clusters (H1-H6 structure)
		IncludeBbox:          boolPtr(true), // Include bounding box coordinates
		OcrCoverageThreshold: nil,           // No OCR coverage threshold
	}
	pdfConfigBasic := kreuzberg.PdfConfig{
		Hierarchy: &hierarchyConfigBasic,
	}
	extractionConfigBasic := kreuzberg.ExtractionConfig{
		PdfOptions: &pdfConfigBasic,
	}
	// Use with ExtractFileSync or ExtractBytesSync
	// result, err := kreuzberg.ExtractFileSync("document.pdf", extractionConfigBasic)
	// Referenced explicitly so the snippet compiles: Go rejects local variables
	// that are declared but never used, and the call above is only a comment.
	_ = extractionConfigBasic

	// Example 2: Custom KClusters for minimal structure
	// Use 3 clusters for simpler hierarchy with minimal structure.
	// Useful when you only need major section divisions (Main, Subsection, Detail).
	hierarchyConfigMinimal := kreuzberg.HierarchyConfig{
		Enabled:              boolPtr(true),
		KClusters:            intPtr(3), // Minimal clustering: just 3 levels
		IncludeBbox:          boolPtr(true),
		OcrCoverageThreshold: nil,
	}
	pdfConfigMinimal := kreuzberg.PdfConfig{
		Hierarchy: &hierarchyConfigMinimal,
	}
	extractionConfigMinimal := kreuzberg.ExtractionConfig{
		PdfOptions: &pdfConfigMinimal,
	}
	_ = extractionConfigMinimal

	// Example 3: With OCR coverage threshold
	// Trigger OCR if less than 50% of text has font data.
	// Useful for documents with mixed digital and scanned content.
	ocrThreshold := 0.5
	hierarchyConfigOcr := kreuzberg.HierarchyConfig{
		Enabled:              boolPtr(true),
		KClusters:            intPtr(6),
		IncludeBbox:          boolPtr(true),
		OcrCoverageThreshold: &ocrThreshold, // Trigger OCR if text coverage < 50%
	}
	pdfConfigOcr := kreuzberg.PdfConfig{
		Hierarchy: &hierarchyConfigOcr,
	}
	extractionConfigOcr := kreuzberg.ExtractionConfig{
		PdfOptions: &pdfConfigOcr,
	}
	_ = extractionConfigOcr
}
// Field descriptions:
//
// Enabled: *bool (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// KClusters: *int (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// IncludeBbox: *bool (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// OcrCoverageThreshold: *float64 (default: nil)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
// - nil means no OCR coverage-based triggering