Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,99 @@
using Kreuzberg.Config;
using Kreuzberg;
/// <summary>
/// Demonstrates three ways to configure PDF hierarchy extraction. Each example
/// builds a <c>HierarchyConfig</c>, wraps it in a <c>PdfConfig</c>, and attaches
/// that to an <c>ExtractionConfig</c> through its <c>PdfOptions</c> property.
/// </summary>
public class HierarchyConfigExample
{
    /// <summary>
    /// Builds the three example configurations. No document is extracted here;
    /// the configurations are only constructed so the object shapes can be read.
    /// </summary>
    public static void Main()
    {
        // Example 1: Basic hierarchy extraction
        // Enabled with default KClusters=6 for standard H1-H6 heading hierarchy.
        // Extract bounding box information for spatial layout awareness.
        var hierarchyConfigBasic = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,               // Default: creates 6 font size clusters (H1-H6 structure)
            IncludeBbox = true,          // Include bounding box coordinates
            OcrCoverageThreshold = null  // No OCR coverage threshold
        };
        var pdfConfigBasic = new PdfConfig
        {
            Hierarchy = hierarchyConfigBasic
        };
        var extractionConfigBasic = new ExtractionConfig
        {
            PdfOptions = pdfConfigBasic
        };

        // `Kreuzberg` is a namespace (see the using directive at the top of the file),
        // so it cannot be instantiated with `new Kreuzberg(...)`. Pass the configuration
        // to the library's extraction entry point instead, for example:
        //   var result = KreuzbergClient.ExtractFileSync("document.pdf", extractionConfigBasic);
        // NOTE(review): confirm the entry-point name and signature against the installed
        // Kreuzberg package version.

        // Example 2: Custom KClusters for minimal structure
        // Use 3 clusters for simpler hierarchy with minimal structure.
        // Useful when you only need major section divisions (Main, Subsection, Detail).
        var hierarchyConfigMinimal = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 3,               // Minimal clustering: just 3 levels
            IncludeBbox = true,
            OcrCoverageThreshold = null
        };
        var pdfConfigMinimal = new PdfConfig
        {
            Hierarchy = hierarchyConfigMinimal
        };
        var extractionConfigMinimal = new ExtractionConfig
        {
            PdfOptions = pdfConfigMinimal
        };

        // Example 3: With OCR coverage threshold
        // Trigger OCR when text block coverage falls below 50%.
        // Useful for documents with mixed digital and scanned content.
        var hierarchyConfigOcr = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.5f  // Trigger OCR if text coverage < 50%
        };
        var pdfConfigOcr = new PdfConfig
        {
            Hierarchy = hierarchyConfigOcr
        };
        var extractionConfigOcr = new ExtractionConfig
        {
            PdfOptions = pdfConfigOcr
        };
    }
}
// Field descriptions:
//
// Enabled: bool (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// KClusters: int (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// IncludeBbox: bool (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// OcrCoverageThreshold: float? (default: null)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5f means "run OCR if less than 50% of the page has text data"
// - null means no OCR coverage-based triggering