Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
```php title="PHP"
<?php declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\LanguageDetectionConfig;
// Detector settings: enabled, 0.7 minimum confidence, detectMultiple off.
$detection = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.7,
    detectMultiple: false,
);

// Attach the detector settings to a default extraction configuration.
$extraction = new ExtractionConfig();
$extraction->language_detection = $detection;

$result = Kreuzberg::extract_file_sync("document.pdf", null, $extraction);

// Print each detected language; the confidence line is skipped when it is null.
if (!empty($result->languages)) {
    foreach ($result->languages as $detected) {
        echo "Detected language: ", $detected->code, "\n";
        if ($detected->confidence !== null) {
            echo "Confidence: ", $detected->confidence, "\n";
        }
    }
}
?>
```

View File

@@ -0,0 +1,37 @@
```php title="PHP"
<?php declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\LanguageDetectionConfig;
// Detector settings: enabled, 0.6 minimum confidence, detectMultiple on.
$detection = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.6,
    detectMultiple: true,
);

// Attach the detector settings to a default extraction configuration.
$extraction = new ExtractionConfig();
$extraction->language_detection = $detection;

$result = Kreuzberg::extract_file_sync("multilingual_document.pdf", null, $extraction);

// Summarise the detected languages, or say so when there are none.
if (empty($result->languages)) {
    echo "No languages detected\n";
} else {
    echo "Detected ", count($result->languages), " language(s):\n";
    foreach ($result->languages as $detected) {
        echo "Language: ", $detected->code, "\n";
        // Confidence is printed as a percentage with one decimal place.
        if ($detected->confidence !== null) {
            printf(" Confidence: %.1f%%\n", $detected->confidence * 100);
        }
        if ($detected->name !== null) {
            echo " Name: ", $detected->name, "\n";
        }
    }
}
?>
```

View File

@@ -0,0 +1,63 @@
```php title="PHP"
<?php declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
// ---- PDF metadata -------------------------------------------------------
$result = Kreuzberg::extract_file_sync("document.pdf", null, new ExtractionConfig());

// The nullsafe operator yields null when no metadata object is present.
$pdf = $result->metadata?->pdf;
if ($pdf) {
    // Each field is optional; only non-null values are printed.
    if ($pdf->page_count !== null) {
        echo "Pages: ", $pdf->page_count, "\n";
    }
    if ($pdf->author !== null) {
        echo "Author: ", $pdf->author, "\n";
    }
    if ($pdf->title !== null) {
        echo "Title: ", $pdf->title, "\n";
    }
}

// ---- HTML metadata ------------------------------------------------------
$htmlResult = Kreuzberg::extract_file_sync("page.html", null, new ExtractionConfig());

$html = $htmlResult->metadata?->html;
if ($html) {
    if ($html->title !== null) {
        echo "Title: ", $html->title, "\n";
    }
    if ($html->description !== null) {
        echo "Description: ", $html->description, "\n";
    }

    // Keywords line is always printed; a null keyword list renders as empty.
    echo "Keywords: ", implode(", ", $html->keywords ?? []), "\n";

    // Canonical URL
    if ($html->canonical_url !== null) {
        echo "Canonical: ", $html->canonical_url, "\n";
    }

    // Open Graph: isset() on the nested key is false for a missing or empty map.
    if (isset($html->open_graph["image"])) {
        echo "OG Image: ", $html->open_graph["image"], "\n";
    }

    // Document language
    if ($html->language !== null) {
        echo "Language: ", $html->language, "\n";
    }

    // Headers, each printed with its level and text
    if (!empty($html->headers)) {
        foreach ($html->headers as $heading) {
            echo "Header (level ", $heading->level, "): ", $heading->text, "\n";
        }
    }
}
?>
```

View File

@@ -0,0 +1,33 @@
```php title="metadata.php"
<?php
declare(strict_types=1);
/**
* Document Metadata Access
*
* Extract and access metadata from different document types including
* PDFs, HTML, and other formats.
*/
require_once __DIR__ . '/vendor/autoload.php';
use function Kreuzberg\extract_file;
// PDF metadata: every field falls back to 'N/A' when it is missing or null.
$pdfDocument = extract_file('document.pdf');
if (isset($pdfDocument->metadata->pdf)) {
    $pdfFields = $pdfDocument->metadata->pdf;
    echo "Pages: ", ($pdfFields['page_count'] ?? 'N/A'), "\n";
    echo "Author: ", ($pdfFields['author'] ?? 'N/A'), "\n";
    echo "Title: ", ($pdfFields['title'] ?? 'N/A'), "\n";
}

// HTML metadata: same fallback rule as above.
$htmlDocument = extract_file('page.html');
if (isset($htmlDocument->metadata->html)) {
    $htmlFields = $htmlDocument->metadata->html;
    echo "Title: ", ($htmlFields['title'] ?? 'N/A'), "\n";
    echo "Description: ", ($htmlFields['description'] ?? 'N/A'), "\n";
}
```

View File

@@ -0,0 +1,29 @@
```php title="PHP"
<?php declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\PageConfig;
// Turn on page extraction and inline page markers with a custom marker format.
$extraction = new ExtractionConfig();
$extraction->pages = new PageConfig(
    extractPages: true,
    insertPageMarkers: true,
    markerFormat: "\n\n=== PAGE {page_num} ===\n\n",
);

$result = Kreuzberg::extract_file_sync("document.pdf", null, $extraction);

// The full content, printed as returned by the extractor.
echo "Full content with markers:\n";
echo $result->content, "\n\n";

// Alternatively, walk the per-page results when they are available.
if ($result->pages !== null) {
    foreach ($result->pages as $extractedPage) {
        echo "--- Page ", $extractedPage->page_number, " (boundary) ---\n";
        echo $extractedPage->content, "\n";
    }
}
?>
```

View File

@@ -0,0 +1,37 @@
```php title="page_boundaries.php"
<?php
declare(strict_types=1);
/**
* Page Boundary Tracking
*
* Access page boundary information to extract content from specific pages
* using byte offsets in the extracted content.
*/
require_once __DIR__ . '/vendor/autoload.php';
use function Kreuzberg\extract_file;
// Run the extraction; page boundaries are read from the result metadata.
$result = extract_file('document.pdf');

// empty() already covers the "not set" case, so no separate isset() is needed.
if (!empty($result->metadata->pages->boundaries)) {
    $boundaries = $result->metadata->pages->boundaries;
    $contentBytes = $result->content;

    // Only the first three pages are previewed.
    foreach (array_slice($boundaries, 0, 3) as $boundary) {
        $byteLength = $boundary->byteEnd - $boundary->byteStart;

        // byteStart/byteEnd are byte offsets, so the slice must use the
        // byte-based substr(). mb_substr() counts characters and would return
        // the wrong span as soon as the content contains multibyte UTF-8.
        $pageContent = substr($contentBytes, $boundary->byteStart, $byteLength);

        echo "Page {$boundary->pageNumber}:\n";
        echo " Byte range: {$boundary->byteStart}-{$boundary->byteEnd}\n";
        // The preview is cut by characters so no multibyte character is split.
        echo " Preview: " . mb_substr($pageContent, 0, 100) . "...\n\n";
    }
}
```

View File

@@ -0,0 +1,26 @@
```php title="PHP"
<?php declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\PageConfig;
// Enable per-page extraction; inline markers are disabled, so markerFormat is
// configured but not inserted into the content.
$config = new ExtractionConfig();
$config->pages = new PageConfig(
    extractPages: true,
    insertPageMarkers: false,
    markerFormat: "\n\n<!-- PAGE {page_num} -->\n\n",
);

$result = Kreuzberg::extract_file_sync("document.pdf", null, $config);

// Print a short summary for each extracted page.
if ($result->pages !== null) {
    foreach ($result->pages as $page) {
        echo "Page " . $page->page_number . ":\n";
        // mb_strlen() counts characters; strlen() would report bytes, which
        // overstates the "chars" figure for any multibyte text.
        echo " Content: " . mb_strlen($page->content) . " chars\n";
        // Null table/image lists are counted as zero.
        echo " Tables: " . count($page->tables ?? []) . "\n";
        echo " Images: " . count($page->images ?? []) . "\n";
    }
}
?>
```

View File

@@ -0,0 +1,36 @@
```php title="page_tracking_basic.php"
<?php
declare(strict_types=1);
/**
* Basic Page Tracking
*
* Extract individual pages with their content, tables, and images
* using page extraction configuration.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PageConfig;
// Enable per-page extraction through the extraction configuration.
$config = new ExtractionConfig(
    pages: new PageConfig(
        extractPages: true,
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

// Print a short summary for each extracted page.
if (!empty($result->pages)) {
    foreach ($result->pages as $page) {
        echo "Page {$page->pageNumber}:\n";
        // mb_strlen() counts characters; strlen() would report bytes, which
        // overstates the "chars" figure for any multibyte text.
        echo " Content: " . mb_strlen($page->content) . " chars\n";
        // count(null) throws a TypeError on PHP 8, so a missing list is
        // treated as empty.
        echo " Tables: " . count($page->tables ?? []) . "\n";
        echo " Images: " . count($page->images ?? []) . "\n\n";
    }
}
```

View File

@@ -0,0 +1,114 @@
```php title="pdf_metadata_extractor.php"
<?php
declare(strict_types=1);
/**
* PDF Metadata Extractor Post-Processor
*
* Custom post-processor that extracts and enriches PDF metadata
* during the extraction pipeline.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\PostProcessor\PostProcessorInterface;
use Kreuzberg\Types\ExtractionResult;
use Kreuzberg\Kreuzberg;
/**
 * Post-processor that tags PDF extraction results with processing metadata.
 *
 * The class is deliberately NOT declared `readonly`: it keeps a running count
 * of processed documents, and a readonly property cannot be modified once it
 * has been initialized, so incrementing it in process() would throw an Error.
 */
class PdfMetadataExtractor implements PostProcessorInterface
{
    /** Number of results that have passed through process() so far. */
    private int $processedCount;

    public function __construct()
    {
        $this->processedCount = 0;
    }

    /**
     * Get the name of this post-processor.
     */
    public function name(): string
    {
        return 'pdf_metadata_extractor';
    }

    /**
     * Get the version of this post-processor.
     */
    public function version(): string
    {
        return '1.0.0';
    }

    /**
     * Get the description of this post-processor.
     */
    public function description(): string
    {
        return 'Extracts and enriches PDF metadata';
    }

    /**
     * Get the processing stage (early, normal, or late).
     */
    public function processingStage(): string
    {
        return 'early';
    }

    /**
     * Determine if this processor should handle the result.
     *
     * @return bool True only when the result's MIME type is application/pdf.
     */
    public function shouldProcess(ExtractionResult $result): bool
    {
        return $result->mimeType === 'application/pdf';
    }

    /**
     * Process the extraction result.
     *
     * Increments the processed-document counter and records two entries in
     * the result's custom metadata: `pdf_processed` and `processor_version`.
     *
     * @return ExtractionResult The same result instance, after tagging.
     */
    public function process(ExtractionResult $result): ExtractionResult
    {
        $this->processedCount++;

        // Create the custom metadata map on first use.
        $result->metadata->custom ??= [];

        $result->metadata->custom['pdf_processed'] = true;
        $result->metadata->custom['processor_version'] = $this->version();

        return $result;
    }

    /**
     * Initialize the post-processor; writes a message to the error log.
     */
    public function initialize(): void
    {
        error_log("PDF metadata extractor initialized");
    }

    /**
     * Shut down the post-processor; logs how many results were processed.
     */
    public function shutdown(): void
    {
        error_log("Processed {$this->processedCount} PDFs");
    }

    /**
     * Get the number of processed documents.
     */
    public function getProcessedCount(): int
    {
        return $this->processedCount;
    }
}
// Create the post-processor instance and hand it to Kreuzberg for registration.
$pdfMetadataExtractor = new PdfMetadataExtractor();

Kreuzberg::registerPostProcessor($pdfMetadataExtractor);
```

View File

@@ -0,0 +1,22 @@
```php title="PHP"
<?php declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
$extraction = Kreuzberg::extract_file_sync("document.pdf", null, new ExtractionConfig());

foreach ($extraction->tables as $table) {
    // Summary line: page number plus the number of rows in the cell grid.
    echo "Table on page ", $table->page_number, " with ", count($table->cells), " rows\n";

    echo "Markdown representation:\n";
    echo $table->markdown, "\n";

    // Dump every cell together with its row and column index.
    foreach ($table->cells as $rowIndex => $cellsInRow) {
        foreach ($cellsInRow as $colIndex => $cellContent) {
            echo "Cell[$rowIndex][$colIndex]: $cellContent\n";
        }
    }
}
?>
```

View File

@@ -0,0 +1,50 @@
```php title="PHP"
<?php declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
use Kreuzberg\EmbeddingConfig;
// Configure chunking with embedding generation for a vector database.
$chunkConfig = new ChunkingConfig(
    enableChunking: true,
    chunkSize: 512,
    chunkOverlap: 50,
    chunker: "semantic",
);
$embeddingConfig = new EmbeddingConfig(
    generateEmbeddings: true,
    modelName: "all-minilm-l6-v2",
);

$config = new ExtractionConfig();
$config->chunking = $chunkConfig;
$config->embeddings = $embeddingConfig;

$result = Kreuzberg::extract_file_sync("document.pdf", null, $config);

// Build one vector-database record per chunk.
if ($result->chunks !== null) {
    foreach ($result->chunks as $chunk) {
        // A missing embedding is stored as an empty vector; missing page and
        // chunk identifiers are stored as null.
        $vectorRecord = [
            "text" => $chunk->text,
            "embedding" => $chunk->embedding ?? [],
            "metadata" => [
                "source" => "document.pdf",
                "page" => $chunk->page_number ?? null,
                "chunk_id" => $chunk->chunk_id ?? null,
            ],
        ];

        // Insert into vector DB (e.g., Pinecone, Weaviate, Milvus)
        // storeInVectorDB($vectorRecord);

        // mb_substr() cuts the preview on a character boundary; byte-based
        // substr() could split a multibyte UTF-8 character and print garbage.
        echo "Chunk: " . mb_substr($chunk->text, 0, 50) . "...\n";
        echo "Embedding dimensions: " . count($chunk->embedding ?? []) . "\n";
    }
}
?>
```