Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,82 @@
```php title="page_config.php"
<?php
declare(strict_types=1);
/**
* PageConfig - Page-Level Extraction
*
* Configure per-page content extraction and page markers for maintaining
* document structure in the extracted text.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PageConfig;
$config = new ExtractionConfig(
page: new PageConfig(
extractPages: false,
insertPageMarkers: true,
markerFormat: '--- Page {page_number} ---'
)
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('report.pdf');
echo "Content with page markers:\n";
echo str_repeat('=', 60) . "\n";
echo $result->content . "\n\n";
$pageConfig = new ExtractionConfig(
page: new PageConfig(
extractPages: true,
insertPageMarkers: false
)
);
$kreuzberg = new Kreuzberg($pageConfig);
$result = $kreuzberg->extractFile('multi_page.pdf');
foreach ($result->pages ?? [] as $page) {
echo "Page {$page->pageNumber}:\n";
echo str_repeat('-', 60) . "\n";
echo substr($page->content, 0, 200) . "...\n";
echo "Tables on page: " . count($page->tables) . "\n";
echo "Images on page: " . count($page->images) . "\n\n";
}
$customConfig = new ExtractionConfig(
page: new PageConfig(
extractPages: false,
insertPageMarkers: true,
markerFormat: "\n\n========== PAGE {page_number} ==========\n\n"
)
);
$kreuzberg = new Kreuzberg($customConfig);
$result = $kreuzberg->extractFile('document.pdf');
$pages = preg_split('/={10} PAGE \d+ ={10}/', $result->content);
echo "Split into " . count($pages) . " sections\n";
$allPagesConfig = new ExtractionConfig(
page: new PageConfig(extractPages: true)
);
$kreuzberg = new Kreuzberg($allPagesConfig);
$result = $kreuzberg->extractFile('large_doc.pdf');
$selectedPages = array_filter(
$result->pages ?? [],
fn($page) => $page->pageNumber >= 10 && $page->pageNumber <= 20
);
echo "\nSelected pages 10-20:\n";
foreach ($selectedPages as $page) {
echo "Page {$page->pageNumber}: " . strlen($page->content) . " chars\n";
}
```