# Kreuzberg PDF hierarchy extraction: HierarchyConfig usage examples.
require 'kreuzberg'
# Example 1: Basic hierarchy extraction
# Hierarchy extraction is enabled with k_clusters = 6 (the default), which
# targets a standard H1-H6 heading hierarchy. Bounding boxes are included so
# the extracted blocks carry spatial layout information.
hierarchy_config_basic = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 6,               # Default: creates 6 font size clusters (H1-H6 structure)
  include_bbox: true,          # Include bounding box coordinates
  ocr_coverage_threshold: nil  # No OCR coverage threshold
)

# The settings are nested: HierarchyConfig -> PdfConfig -> ExtractionConfig.
pdf_config_basic = Kreuzberg::PdfConfig.new(
  hierarchy: hierarchy_config_basic
)

extraction_config_basic = Kreuzberg::ExtractionConfig.new(
  pdf_options: pdf_config_basic
)

# Usage (commented out; supply a real PDF path to run):
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_basic)

# Example 2: Custom k_clusters for minimal structure
# Three clusters give a simpler hierarchy with minimal structure. Useful when
# only the major section divisions are needed (Main, Subsection, Detail).
hierarchy_config_minimal = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 3,               # Minimal clustering: just 3 levels
  include_bbox: true,
  ocr_coverage_threshold: nil
)

# The settings are nested: HierarchyConfig -> PdfConfig -> ExtractionConfig.
pdf_config_minimal = Kreuzberg::PdfConfig.new(
  hierarchy: hierarchy_config_minimal
)

extraction_config_minimal = Kreuzberg::ExtractionConfig.new(
  pdf_options: pdf_config_minimal
)

# Usage (commented out; supply a real PDF path to run):
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_minimal)

# Example 3: With OCR coverage threshold
# A threshold of 0.5 triggers OCR when text coverage falls below 50%.
# Useful for documents with mixed digital and scanned content.
hierarchy_config_ocr = Kreuzberg::HierarchyConfig.new(
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.5  # Trigger OCR if text coverage < 50%
)

# The settings are nested: HierarchyConfig -> PdfConfig -> ExtractionConfig.
pdf_config_ocr = Kreuzberg::PdfConfig.new(
  hierarchy: hierarchy_config_ocr
)

extraction_config_ocr = Kreuzberg::ExtractionConfig.new(
  pdf_options: pdf_config_ocr
)

# Usage (commented out; supply a real PDF path to run):
# result = Kreuzberg.extract_file_sync("document.pdf", config: extraction_config_ocr)

# Field descriptions:
#
# enabled: boolean (default: true)
#   - Enable or disable hierarchy extraction
#   - When false, hierarchy structure is not analyzed
#
# k_clusters: integer (default: 6, valid: 1-7)
#   - Number of font size clusters for hierarchy levels
#   - 6 provides H1-H6 heading levels with body text
#   - Higher values create more fine-grained hierarchy
#   - Lower values create simpler structure
#
# include_bbox: boolean (default: true)
#   - Include bounding box coordinates in hierarchy blocks
#   - Required for spatial layout awareness and document structure
#   - Set to false only if space optimization is critical
#
# ocr_coverage_threshold: float | nil (default: nil)
#   - Range: 0.0 to 1.0
#   - Triggers OCR when text block coverage falls below this fraction
#   - Example: 0.5 means "run OCR if less than 50% of page has text data"
#   - nil means no OCR coverage-based triggering