Files
fil/docs/snippets/php/configuration/pdf_hierarchy_config.php

72 lines
1.9 KiB
PHP
Raw Normal View History

2026-06-01 23:40:55 +02:00
```php title="pdf_hierarchy_config.php"
<?php
declare(strict_types=1);
/**
* PdfHierarchyConfig - Hierarchy Detection Configuration
*
* Configure PDF document structure analysis and hierarchy detection
* using k-clustering for document organization recognition.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PdfConfig;
// Basic setup: hierarchy detection is passed to PdfConfig as an options array.
$config = new ExtractionConfig(
    pdf: new PdfConfig(
        extractImages: true,
        hierarchy: [
            'enabled' => true,
            'k_clusters' => 6,
            'include_bbox' => true,
            'ocr_coverage_threshold' => 0.8,
        ],
    ),
);

// Run the extraction with the configuration built above.
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

echo "Hierarchy detection enabled\n";
// mb_strlen() counts characters; strlen() would report bytes, which overstates
// the length of multi-byte text. Assumes the extracted content is UTF-8 — confirm.
echo "Content length: " . mb_strlen($result->content, 'UTF-8') . " characters\n";
// Alternative: custom hierarchy parameters for complex documents.
$advancedConfig = new ExtractionConfig(
    pdf: new PdfConfig(
        extractImages: true,
        hierarchy: [
            'enabled' => true,
            // More clusters for a more detailed hierarchy.
            // NOTE(review): confirm 12 is within the range the library accepts for k_clusters.
            'k_clusters' => 12,
            // Include bounding box coordinates.
            'include_bbox' => true,
            // Lower OCR coverage threshold than the 0.8 used in the basic example above.
            'ocr_coverage_threshold' => 0.7,
        ],
    ),
);

// Run the extraction with the advanced configuration.
$kreuzberg = new Kreuzberg($advancedConfig);
$result = $kreuzberg->extractFile('complex_document.pdf');

echo "Advanced hierarchy detection completed\n";
echo "Detected structure preserved in output\n";
// Speed-oriented setup: hierarchy detection and image extraction are both switched off.
$fastConfig = new ExtractionConfig(
    pdf: new PdfConfig(
        hierarchy: ['enabled' => false],
        extractImages: false,
    ),
);

// Extract using the lightweight configuration.
$kreuzberg = new Kreuzberg($fastConfig);
$result = $kreuzberg->extractFile('simple_document.pdf');

echo 'Fast extraction without hierarchy detection' . "\n";
```