This commit is contained in:
30
docs/snippets/php/metadata/language_detection.md
Normal file
30
docs/snippets/php/metadata/language_detection.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\LanguageDetectionConfig;
|
||||
|
||||
// Language detection with a 0.7 confidence threshold; multi-language detection is off
$langConfig = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.7,
    detectMultiple: false
);

$config = new ExtractionConfig();
$config->language_detection = $langConfig;

$result = Kreuzberg::extract_file_sync("document.pdf", null, $config);

// Print each detected language; the confidence line is skipped when no score is set
if (!empty($result->languages)) {
    foreach ($result->languages as $lang) {
        echo "Detected language: {$lang->code}\n";
        if ($lang->confidence !== null) {
            echo "Confidence: {$lang->confidence}\n";
        }
    }
}
|
||||
?>
|
||||
```
|
||||
@@ -0,0 +1,37 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\LanguageDetectionConfig;
|
||||
|
||||
// Multilingual detection: 0.6 confidence threshold with multiple languages enabled
$langConfig = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.6,
    detectMultiple: true
);

$config = new ExtractionConfig();
$config->language_detection = $langConfig;

$result = Kreuzberg::extract_file_sync("multilingual_document.pdf", null, $config);

// Either report that nothing was detected, or list every detected language
if (empty($result->languages)) {
    echo "No languages detected\n";
} else {
    printf("Detected %d language(s):\n", count($result->languages));

    foreach ($result->languages as $lang) {
        echo "Language: {$lang->code}\n";
        if ($lang->confidence !== null) {
            // Confidence is scaled by 100 and printed with one decimal place
            printf(" Confidence: %.1f%%\n", $lang->confidence * 100);
        }
        if ($lang->name !== null) {
            echo " Name: {$lang->name}\n";
        }
    }
}
|
||||
?>
|
||||
```
|
||||
63
docs/snippets/php/metadata/metadata.md
Normal file
63
docs/snippets/php/metadata/metadata.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// PDF metadata: extract the document, then print only the fields that are set
$result = Kreuzberg::extract_file_sync("document.pdf", null, new ExtractionConfig());

if ($result->metadata?->pdf) {
    $pdfMeta = $result->metadata->pdf;

    if ($pdfMeta->page_count !== null) {
        echo "Pages: {$pdfMeta->page_count}\n";
    }
    if ($pdfMeta->author !== null) {
        echo "Author: {$pdfMeta->author}\n";
    }
    if ($pdfMeta->title !== null) {
        echo "Title: {$pdfMeta->title}\n";
    }
}
|
||||
|
||||
// HTML metadata: extract the page, then walk through the HTML-specific fields
$htmlResult = Kreuzberg::extract_file_sync("page.html", null, new ExtractionConfig());

if ($htmlResult->metadata?->html) {
    $htmlMeta = $htmlResult->metadata->html;

    if ($htmlMeta->title !== null) {
        echo "Title: {$htmlMeta->title}\n";
    }
    if ($htmlMeta->description !== null) {
        echo "Description: {$htmlMeta->description}\n";
    }

    // Keywords are always printed; a missing list renders as an empty string
    printf("Keywords: %s\n", implode(", ", $htmlMeta->keywords ?? []));

    // Canonical URL
    if ($htmlMeta->canonical_url !== null) {
        echo "Canonical: {$htmlMeta->canonical_url}\n";
    }

    // Open Graph fields: only the "image" entry is shown here
    if (!empty($htmlMeta->open_graph) && isset($htmlMeta->open_graph["image"])) {
        echo "OG Image: " . $htmlMeta->open_graph["image"] . "\n";
    }

    // Document language
    if ($htmlMeta->language !== null) {
        echo "Language: {$htmlMeta->language}\n";
    }

    // Headers, each with its level and text
    if (!empty($htmlMeta->headers)) {
        foreach ($htmlMeta->headers as $header) {
            echo "Header (level {$header->level}): {$header->text}\n";
        }
    }
}
|
||||
?>
|
||||
```
|
||||
33
docs/snippets/php/metadata/metadata.php
Normal file
33
docs/snippets/php/metadata/metadata.php
Normal file
@@ -0,0 +1,33 @@
|
||||
```php title="metadata.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Document Metadata Access
|
||||
*
|
||||
* Extract and access metadata from different document types including
|
||||
* PDFs, HTML, and other formats.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
// PDF metadata: missing fields fall back to 'N/A'
$result = extract_file('document.pdf');

if (isset($result->metadata->pdf)) {
    $pdfMeta = $result->metadata->pdf;
    printf("Pages: %s\n", $pdfMeta['page_count'] ?? 'N/A');
    printf("Author: %s\n", $pdfMeta['author'] ?? 'N/A');
    printf("Title: %s\n", $pdfMeta['title'] ?? 'N/A');
}

// HTML metadata: same fallback for missing fields
$htmlResult = extract_file('page.html');

if (isset($htmlResult->metadata->html)) {
    $htmlMeta = $htmlResult->metadata->html;
    printf("Title: %s\n", $htmlMeta['title'] ?? 'N/A');
    printf("Description: %s\n", $htmlMeta['description'] ?? 'N/A');
}
|
||||
```
|
||||
29
docs/snippets/php/metadata/page_boundaries.md
Normal file
29
docs/snippets/php/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\PageConfig;
|
||||
|
||||
// Enable per-page extraction and inline page markers using a custom marker format
$config = new ExtractionConfig();
$config->pages = new PageConfig(
    extractPages: true,
    insertPageMarkers: true,
    markerFormat: "\n\n=== PAGE {page_num} ===\n\n"
);

$result = Kreuzberg::extract_file_sync("document.pdf", null, $config);

// Whole-document content, followed by a blank line
echo "Full content with markers:\n", $result->content, "\n\n";

// Per-page view: each page is printed under its own heading line
if ($result->pages !== null) {
    foreach ($result->pages as $page) {
        echo "--- Page {$page->page_number} (boundary) ---\n";
        echo "{$page->content}\n";
    }
}
|
||||
?>
|
||||
```
|
||||
37
docs/snippets/php/metadata/page_boundaries.php
Normal file
37
docs/snippets/php/metadata/page_boundaries.php
Normal file
@@ -0,0 +1,37 @@
|
||||
```php title="page_boundaries.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Page Boundary Tracking
|
||||
*
|
||||
* Access page boundary information to extract content from specific pages
|
||||
* using byte offsets in the extracted content.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
$result = extract_file('document.pdf');

// Page boundaries are byte offsets into the extracted content (byteStart/byteEnd),
// so pages must be sliced with the byte-based substr(). mb_substr() counts
// characters and would return the wrong span as soon as the text contains
// multi-byte characters.
if (!empty($result->metadata->pages->boundaries)) {
    $boundaries = $result->metadata->pages->boundaries;
    $contentBytes = $result->content;

    // Only the first three pages are shown
    $pagesToShow = array_slice($boundaries, 0, 3);

    foreach ($pagesToShow as $boundary) {
        $pageContent = substr(
            $contentBytes,
            $boundary->byteStart,
            $boundary->byteEnd - $boundary->byteStart
        );

        echo "Page {$boundary->pageNumber}:\n";
        echo " Byte range: {$boundary->byteStart}-{$boundary->byteEnd}\n";
        // The preview is user-visible text, so it is truncated by characters
        echo " Preview: " . mb_substr($pageContent, 0, 100) . "...\n\n";
    }
}
|
||||
```
|
||||
26
docs/snippets/php/metadata/page_tracking_basic.md
Normal file
26
docs/snippets/php/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\PageConfig;
|
||||
|
||||
// Enable per-page extraction; markers are not inserted into the content
$config = new ExtractionConfig();
$config->pages = new PageConfig(
    extractPages: true,
    insertPageMarkers: false,
    markerFormat: "\n\n<!-- PAGE {page_num} -->\n\n"
);

$result = Kreuzberg::extract_file_sync("document.pdf", null, $config);

// Print a short summary for every extracted page
if ($result->pages !== null) {
    foreach ($result->pages as $page) {
        echo "Page " . $page->page_number . ":\n";
        // mb_strlen() counts characters, matching the "chars" label;
        // strlen() would report bytes and over-count multi-byte text
        echo " Content: " . mb_strlen($page->content) . " chars\n";
        echo " Tables: " . count($page->tables ?? []) . "\n";
        echo " Images: " . count($page->images ?? []) . "\n";
    }
}
|
||||
?>
|
||||
```
|
||||
36
docs/snippets/php/metadata/page_tracking_basic.php
Normal file
36
docs/snippets/php/metadata/page_tracking_basic.php
Normal file
@@ -0,0 +1,36 @@
|
||||
```php title="page_tracking_basic.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Basic Page Tracking
|
||||
*
|
||||
* Extract individual pages with their content, tables, and images
|
||||
* using page extraction configuration.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\PageConfig;
|
||||
|
||||
// Enable per-page extraction
$config = new ExtractionConfig(
    pages: new PageConfig(
        extractPages: true
    )
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

// Print a short summary for every extracted page
if (!empty($result->pages)) {
    foreach ($result->pages as $page) {
        echo "Page {$page->pageNumber}:\n";
        // mb_strlen() counts characters, matching the "chars" label;
        // strlen() would report bytes and over-count multi-byte text
        echo " Content: " . mb_strlen($page->content) . " chars\n";
        echo " Tables: " . count($page->tables) . "\n";
        echo " Images: " . count($page->images) . "\n\n";
    }
}
|
||||
```
|
||||
114
docs/snippets/php/metadata/pdf_metadata_extractor.php
Normal file
114
docs/snippets/php/metadata/pdf_metadata_extractor.php
Normal file
@@ -0,0 +1,114 @@
|
||||
```php title="pdf_metadata_extractor.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* PDF Metadata Extractor Post-Processor
|
||||
*
|
||||
* Custom post-processor that extracts and enriches PDF metadata
|
||||
* during the extraction pipeline.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\PostProcessor\PostProcessorInterface;
|
||||
use Kreuzberg\Types\ExtractionResult;
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Post-processor for extracting and enriching PDF metadata.
 *
 * The class is deliberately NOT declared `readonly`: process() increments
 * $processedCount on every call, and a readonly property cannot be modified
 * once it has been initialised (PHP throws an Error on the first increment).
 */
class PdfMetadataExtractor implements PostProcessorInterface
{
    /** Number of results that have passed through process(). */
    private int $processedCount;

    public function __construct()
    {
        $this->processedCount = 0;
    }

    /**
     * Get the name of this post-processor
     */
    public function name(): string
    {
        return 'pdf_metadata_extractor';
    }

    /**
     * Get the version of this post-processor
     */
    public function version(): string
    {
        return '1.0.0';
    }

    /**
     * Get the description of this post-processor
     */
    public function description(): string
    {
        return 'Extracts and enriches PDF metadata';
    }

    /**
     * Get the processing stage (early, normal, or late)
     */
    public function processingStage(): string
    {
        return 'early';
    }

    /**
     * Determine if this processor should handle the result
     *
     * Only results whose MIME type is exactly 'application/pdf' are accepted.
     */
    public function shouldProcess(ExtractionResult $result): bool
    {
        return $result->mimeType === 'application/pdf';
    }

    /**
     * Process the extraction result
     *
     * Counts the call and tags the result's custom metadata with a
     * 'pdf_processed' flag and this processor's version.
     *
     * @return ExtractionResult the same result instance that was passed in
     */
    public function process(ExtractionResult $result): ExtractionResult
    {
        $this->processedCount++;

        // Create the custom metadata array on first use
        $result->metadata->custom ??= [];
        $result->metadata->custom['pdf_processed'] = true;
        $result->metadata->custom['processor_version'] = $this->version();

        return $result;
    }

    /**
     * Initialize the post-processor
     */
    public function initialize(): void
    {
        error_log("PDF metadata extractor initialized");
    }

    /**
     * Shutdown the post-processor
     *
     * Logs how many results were processed by this instance.
     */
    public function shutdown(): void
    {
        error_log("Processed {$this->processedCount} PDFs");
    }

    /**
     * Get the number of processed documents
     */
    public function getProcessedCount(): int
    {
        return $this->processedCount;
    }
}

// Register an instance; $processor is kept so getProcessedCount() can be read later
$processor = new PdfMetadataExtractor();
Kreuzberg::registerPostProcessor($processor);
|
||||
```
|
||||
22
docs/snippets/php/metadata/tables.md
Normal file
22
docs/snippets/php/metadata/tables.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// Extract the document and walk through every table that was found
$result = Kreuzberg::extract_file_sync("document.pdf", null, new ExtractionConfig());

foreach ($result->tables as $table) {
    echo "Table on page {$table->page_number} with " . count($table->cells) . " rows\n";
    echo "Markdown representation:\n";
    echo "{$table->markdown}\n";

    // Cells are iterated as rows, then as columns within each row
    foreach ($table->cells as $rowIndex => $row) {
        foreach ($row as $colIndex => $cellContent) {
            echo "Cell[" . $rowIndex . "][" . $colIndex . "]: " . $cellContent . "\n";
        }
    }
}
|
||||
?>
|
||||
```
|
||||
50
docs/snippets/php/metadata/vector_database_integration.md
Normal file
50
docs/snippets/php/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,50 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\EmbeddingConfig;
|
||||
|
||||
// Configure chunking with embedding generation for vector database
$chunkConfig = new ChunkingConfig(
    enableChunking: true,
    chunkSize: 512,
    chunkOverlap: 50,
    chunker: "semantic"
);

$embeddingConfig = new EmbeddingConfig(
    generateEmbeddings: true,
    modelName: "all-minilm-l6-v2"
);

$config = new ExtractionConfig();
$config->chunking = $chunkConfig;
$config->embeddings = $embeddingConfig;

$result = Kreuzberg::extract_file_sync("document.pdf", null, $config);

// Store chunks and embeddings for vector database
if ($result->chunks !== null) {
    foreach ($result->chunks as $chunk) {
        // Record shape to hand to the vector store; missing fields fall back to
        // an empty embedding / null metadata values
        $vectorRecord = [
            "text" => $chunk->text,
            "embedding" => $chunk->embedding ?? [],
            "metadata" => [
                "source" => "document.pdf",
                "page" => $chunk->page_number ?? null,
                "chunk_id" => $chunk->chunk_id ?? null,
            ]
        ];

        // Insert into vector DB (e.g., Pinecone, Weaviate, Milvus)
        // storeInVectorDB($vectorRecord);

        // Truncate the preview by characters: byte-wise substr() could cut a
        // multi-byte character in half and emit invalid UTF-8
        echo "Chunk: " . mb_substr($chunk->text, 0, 50) . "...\n";
        echo "Embedding dimensions: " . count($chunk->embedding ?? []) . "\n";
    }
}
|
||||
?>
|
||||
```
|
||||
Reference in New Issue
Block a user