Files
fil/docs/snippets/php/configuration/pdf_hierarchy_config.php
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

72 lines
1.9 KiB
PHP

```php title="pdf_hierarchy_config.php"
<?php
declare(strict_types=1);
/**
* PdfHierarchyConfig - Hierarchy Detection Configuration
*
* Configure PDF document structure analysis and hierarchy detection
* using k-clustering for document organization recognition.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PdfConfig;
// Basic example: hierarchy detection is switched on through the `hierarchy`
// options array passed to PdfConfig.
$config = new ExtractionConfig(
    pdf: new PdfConfig(
        extractImages: true,
        hierarchy: [
            'enabled' => true,
            'k_clusters' => 6,
            'include_bbox' => true,
            'ocr_coverage_threshold' => 0.8,
        ],
    ),
);

// Extract the document using the configuration built above.
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

echo "Hierarchy detection enabled\n";
// mb_strlen() counts characters, matching the "characters" label below;
// strlen() would report the byte length, which over-counts multi-byte text.
// Requires ext-mbstring. NOTE(review): assumes $result->content uses the
// default internal encoding (UTF-8) - confirm.
echo "Content length: " . mb_strlen($result->content) . " characters\n";
// Alternative: custom hierarchy parameters for complex documents.
// The option keys are the same as in the first example in this file; only
// the values are tuned.
$detailedHierarchy = [
    'enabled' => true,
    // More clusters (12 instead of 6) for a more detailed hierarchy.
    'k_clusters' => 12,
    // Include bounding box coordinates.
    'include_bbox' => true,
    // 0.7 is a lower OCR coverage threshold than the 0.8 used in the first
    // example. NOTE(review): check the library docs for what this implies.
    'ocr_coverage_threshold' => 0.7,
];

$advancedConfig = new ExtractionConfig(
    pdf: new PdfConfig(extractImages: true, hierarchy: $detailedHierarchy),
);

$kreuzberg = new Kreuzberg($advancedConfig);
$result = $kreuzberg->extractFile('complex_document.pdf');

// Status messages only; the extraction result itself is not inspected here.
echo "Advanced hierarchy detection completed\n",
    "Detected structure preserved in output\n";
// Speed-oriented setup: hierarchy detection is disabled and image
// extraction is turned off as well.
$fastConfig = new ExtractionConfig(
    pdf: new PdfConfig(
        extractImages: false,
        hierarchy: ['enabled' => false],
    ),
);

// Same extraction call as the other examples, using the reduced configuration.
$kreuzberg = new Kreuzberg($fastConfig);
$result = $kreuzberg->extractFile('simple_document.pdf');

echo "Fast extraction without hierarchy detection\n";
```