This commit is contained in:
22
docs/snippets/python/config/pdf_hierarchy_config.md
Normal file
22
docs/snippets/python/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
    ExtractionConfig,
    HierarchyConfig,
    PdfConfig,
    extract_file_sync,
)

# Example: extract a PDF with the hierarchy option enabled, then print a
# short preview of every returned page.

# Build the configuration from the innermost object outwards so each layer
# is visible on its own.
hierarchy = HierarchyConfig(
    enabled=True,
    k_clusters=6,
    include_bbox=True,
    ocr_coverage_threshold=0.8,
)
pdf_options = PdfConfig(
    extract_metadata=True,
    hierarchy=hierarchy,
)
config: ExtractionConfig = ExtractionConfig(pdf_options=pdf_options)

result = extract_file_sync("document.pdf", config=config)

# `result.pages` may be falsy (e.g. None), so fall back to an empty list.
# NOTE(review): this loop prints only the page number and a content preview;
# it does not read any hierarchy-specific field — confirm which attribute of
# the result exposes the hierarchy data.
pages = result.pages or []
for page in pages:
    preview = page.content[:100]  # first 100 characters of the page content
    print(f"Page {page.page_number}:")
    print(f" Content: {preview}...")
|
||||
```
|
||||
Reference in New Issue
Block a user