Files
fil/docs/snippets/python/config/pdf_hierarchy_config.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

605 B

from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig

# Hierarchy settings are built first and then nested into the PDF options,
# which in turn are nested into the top-level extraction config.
hierarchy_options = HierarchyConfig(
    enabled=True,
    k_clusters=6,
    include_bbox=True,
    ocr_coverage_threshold=0.8,
)
pdf_settings = PdfConfig(extract_metadata=True, hierarchy=hierarchy_options)
config: ExtractionConfig = ExtractionConfig(pdf_options=pdf_settings)

# Run a synchronous extraction of the PDF with the config above.
result = extract_file_sync("document.pdf", config=config)

# Print each page's number and the first 100 characters of its content.
# Falls back to an empty list when `result.pages` is falsy (e.g. None).
pages = result.pages or []
for page in pages:
    print(f"Page {page.page_number}:")
    print(f"  Content: {page.content[:100]}...")