100 lines
3.2 KiB
C#
100 lines
3.2 KiB
C#
|
|
using Kreuzberg.Config;
|
||
|
|
using Kreuzberg;
|
||
|
|
|
||
|
|
/// <summary>
/// Example showing three ways to populate <c>HierarchyConfig</c> and wire it
/// through <c>PdfConfig</c> into an <c>ExtractionConfig</c>.
/// </summary>
public class HierarchyConfigExample
{
    /// <summary>
    /// Builds the three example configurations. No document is extracted here;
    /// the extraction call is left as a commented usage hint.
    /// </summary>
    public static void Main()
    {
        // Example 1: Basic hierarchy extraction
        // Enabled with KClusters = 6 for a standard H1-H6 heading hierarchy.
        // Bounding boxes are included for spatial layout awareness.
        var hierarchyConfigBasic = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,               // Default: creates 6 font size clusters (H1-H6 structure)
            IncludeBbox = true,          // Include bounding box coordinates
            OcrCoverageThreshold = null  // No OCR coverage threshold
        };

        var pdfConfigBasic = new PdfConfig
        {
            Hierarchy = hierarchyConfigBasic
        };

        var extractionConfigBasic = new ExtractionConfig
        {
            PdfOptions = pdfConfigBasic
        };

        // The previous `new Kreuzberg(extractionConfigBasic)` was removed: `Kreuzberg` is a
        // namespace (see `using Kreuzberg;`), so it cannot be instantiated (CS0118).
        // NOTE(review): presumably the config is passed to the library's extraction entry
        // point instead - confirm the exact API name before enabling, e.g.:
        // var result = KreuzbergClient.ExtractFileSync("document.pdf", extractionConfigBasic);

        // Example 2: Custom KClusters for minimal structure
        // Use 3 clusters for a simpler hierarchy.
        // Useful when you only need major section divisions (Main, Subsection, Detail).
        var hierarchyConfigMinimal = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 3,  // Minimal clustering: just 3 levels
            IncludeBbox = true,
            OcrCoverageThreshold = null
        };

        var pdfConfigMinimal = new PdfConfig
        {
            Hierarchy = hierarchyConfigMinimal
        };

        var extractionConfigMinimal = new ExtractionConfig
        {
            PdfOptions = pdfConfigMinimal
        };

        // Example 3: With OCR coverage threshold
        // Trigger OCR when text block coverage falls below 50%.
        // Useful for documents with mixed digital and scanned content.
        var hierarchyConfigOcr = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.5f  // Trigger OCR if text coverage < 50%
        };

        var pdfConfigOcr = new PdfConfig
        {
            Hierarchy = hierarchyConfigOcr
        };

        var extractionConfigOcr = new ExtractionConfig
        {
            PdfOptions = pdfConfigOcr
        };
    }
}
|
||
|
|
|
||
|
|
// Field descriptions:
//
// Enabled: bool (default: true)
//   - Enable or disable hierarchy extraction
//   - When false, hierarchy structure is not analyzed
//
// KClusters: int (default: 6, valid: 1-7)
//   - Number of font size clusters for hierarchy levels
//   - 6 provides H1-H6 heading levels with body text
//   - Higher values create more fine-grained hierarchy
//   - Lower values create simpler structure
//
// IncludeBbox: bool (default: true)
//   - Include bounding box coordinates in hierarchy blocks
//   - Required for spatial layout awareness and document structure
//   - Set to false only if space optimization is critical
//
// OcrCoverageThreshold: float? (default: null)
//   - Range: 0.0 to 1.0
//   - Triggers OCR when text block coverage falls below this fraction
//   - Example: 0.5f means "run OCR if less than 50% of page has text data"
//   - null means no OCR coverage-based triggering
|