fil/docs/snippets/ruby/config/hierarchy_config.rb

require 'kreuzberg'

# Example 1: Basic hierarchy extraction
# Enabled with default k_clusters=6 for standard H1-H6 heading hierarchy.
# Extract bounding box information for spatial layout awareness.
hierarchy_config_basic = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 6,  # Default: creates 6 font size clusters (H1-H6 structure)
  include_bbox: true,  # Include bounding box coordinates
  ocr_coverage_threshold: nil  # No OCR coverage threshold
)

pdf_config_basic = Kreuzberg::PdfConfig.new(
  hierarchy: hierarchy_config_basic
)

extraction_config_basic = Kreuzberg::ExtractionConfig.new(
  pdf_options: pdf_config_basic
)

# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_basic)


# Example 2: Custom k_clusters for minimal structure
# Use 3 clusters for simpler hierarchy with minimal structure.
# Useful when you only need major section divisions (Main, Subsection, Detail).
hierarchy_config_minimal = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 3,  # Minimal clustering: just 3 levels
  include_bbox: true,
  ocr_coverage_threshold: nil
)

pdf_config_minimal = Kreuzberg::PdfConfig.new(
  hierarchy: hierarchy_config_minimal
)

extraction_config_minimal = Kreuzberg::ExtractionConfig.new(
  pdf_options: pdf_config_minimal
)

# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_minimal)


# Example 3: With OCR coverage threshold
# Trigger OCR if less than 50% of text has font data.
# Useful for documents with mixed digital and scanned content.
hierarchy_config_ocr = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.5  # Trigger OCR if text coverage < 50%
)

pdf_config_ocr = Kreuzberg::PdfConfig.new(
  hierarchy: hierarchy_config_ocr
)

extraction_config_ocr = Kreuzberg::ExtractionConfig.new(
  pdf_options: pdf_config_ocr
)

# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_ocr)


# Field descriptions:
#
# enabled: boolean (default: true)
#   - Enable or disable hierarchy extraction
#   - When false, hierarchy structure is not analyzed
#
# k_clusters: integer (default: 6, valid: 1-7)
#   - Number of font size clusters for hierarchy levels
#   - 6 provides H1-H6 heading levels with body text
#   - Higher values create more fine-grained hierarchy
#   - Lower values create simpler structure
#
# include_bbox: boolean (default: true)
#   - Include bounding box coordinates in hierarchy blocks
#   - Required for spatial layout awareness and document structure
#   - Set to false only if space optimization is critical
#
# ocr_coverage_threshold: float | nil (default: nil)
#   - Range: 0.0 to 1.0
#   - Triggers OCR when text block coverage falls below this fraction
#   - Example: 0.5 means "run OCR if less than 50% of page has text data"
#   - nil means no OCR coverage-based triggering