This commit is contained in:
94
docs/snippets/go/config/hierarchy_config.go
Normal file
94
docs/snippets/go/config/hierarchy_config.go
Normal file
@@ -0,0 +1,94 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"kreuzberg"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Example 1: Basic hierarchy extraction
|
||||
// Enabled with default KClusters=6 for standard H1-H6 heading hierarchy.
|
||||
// Extract bounding box information for spatial layout awareness.
|
||||
hierarchyConfigBasic := kreuzberg.HierarchyConfig{
|
||||
Enabled: func(b bool) *bool { return &b }(true),
|
||||
KClusters: func(i int) *int { return &i }(6), // Default: creates 6 font size clusters (H1-H6 structure)
|
||||
IncludeBbox: func(b bool) *bool { return &b }(true), // Include bounding box coordinates
|
||||
OcrCoverageThreshold: nil, // No OCR coverage threshold
|
||||
}
|
||||
|
||||
pdfConfigBasic := kreuzberg.PdfConfig{
|
||||
Hierarchy: &hierarchyConfigBasic,
|
||||
}
|
||||
|
||||
extractionConfigBasic := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &pdfConfigBasic,
|
||||
}
|
||||
|
||||
// Use with ExtractFileSync or ExtractBytesSync
|
||||
// result, err := kreuzberg.ExtractFileSync("document.pdf", extractionConfigBasic)
|
||||
|
||||
|
||||
// Example 2: Custom KClusters for minimal structure
|
||||
// Use 3 clusters for simpler hierarchy with minimal structure.
|
||||
// Useful when you only need major section divisions (Main, Subsection, Detail).
|
||||
hierarchyConfigMinimal := kreuzberg.HierarchyConfig{
|
||||
Enabled: func(b bool) *bool { return &b }(true),
|
||||
KClusters: func(i int) *int { return &i }(3), // Minimal clustering: just 3 levels
|
||||
IncludeBbox: func(b bool) *bool { return &b }(true),
|
||||
OcrCoverageThreshold: nil,
|
||||
}
|
||||
|
||||
pdfConfigMinimal := kreuzberg.PdfConfig{
|
||||
Hierarchy: &hierarchyConfigMinimal,
|
||||
}
|
||||
|
||||
extractionConfigMinimal := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &pdfConfigMinimal,
|
||||
}
|
||||
|
||||
_ = extractionConfigMinimal
|
||||
|
||||
|
||||
// Example 3: With OCR coverage threshold
|
||||
// Trigger OCR if less than 50% of text has font data.
|
||||
// Useful for documents with mixed digital and scanned content.
|
||||
ocrThreshold := 0.5
|
||||
hierarchyConfigOcr := kreuzberg.HierarchyConfig{
|
||||
Enabled: func(b bool) *bool { return &b }(true),
|
||||
KClusters: func(i int) *int { return &i }(6),
|
||||
IncludeBbox: func(b bool) *bool { return &b }(true),
|
||||
OcrCoverageThreshold: &ocrThreshold, // Trigger OCR if text coverage < 50%
|
||||
}
|
||||
|
||||
pdfConfigOcr := kreuzberg.PdfConfig{
|
||||
Hierarchy: &hierarchyConfigOcr,
|
||||
}
|
||||
|
||||
extractionConfigOcr := kreuzberg.ExtractionConfig{
|
||||
PdfOptions: &pdfConfigOcr,
|
||||
}
|
||||
|
||||
_ = extractionConfigOcr
|
||||
}
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// Enabled: *bool (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// KClusters: *int (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// IncludeBbox: *bool (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// OcrCoverageThreshold: *float64 (default: nil)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: 0.5 means "run OCR if less than 50% of the page has text data"
|
||||
// - nil means no OCR coverage-based triggering
|
||||
Reference in New Issue
Block a user