Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,48 @@
```php title="chunking.php"
<?php
declare(strict_types=1);
/**
* Text Chunking Configuration
*
* Configure document chunking for processing long texts into manageable pieces.
* Useful for RAG systems, embedding generation, and token limit management.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ChunkingConfig;
use Kreuzberg\Config\EmbeddingConfig;
// Split the document into chunks of at most 1500 characters with a 200
// character overlap, and request an embedding for every chunk.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChars: 1500,
        maxOverlap: 200,
        embedding: new EmbeddingConfig(
            model: 'balanced'
        )
    )
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

// Resolve the nullable chunk list once so the total and the loop always agree.
$chunks = $result->chunks ?? [];

echo "Chunking Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Total chunks created: " . count($chunks) . "\n\n";
foreach ($chunks as $index => $chunk) {
    // strlen()/substr() work on bytes; the mb_* variants count characters, so
    // the preview never ends in the middle of a multi-byte sequence.
    // NOTE(review): assumes chunk content is UTF-8 — confirm against the library.
    $length = mb_strlen($chunk->content, 'UTF-8');
    $preview = mb_substr($chunk->content, 0, 100, 'UTF-8');

    echo "Chunk " . ($index + 1) . ":\n";
    echo " Length: " . $length . " characters\n";
    echo " Preview: " . $preview . "...\n";
    if ($chunk->embedding !== null) {
        echo " Embedding dimensions: " . count($chunk->embedding) . "\n";
    }
    echo "\n";
}
```

View File

@@ -0,0 +1,80 @@
```php title="chunking_rag.php"
<?php
declare(strict_types=1);
/**
* Chunking for RAG (Retrieval-Augmented Generation)
*
* Advanced chunking configuration optimized for RAG systems with embeddings.
* Demonstrates how to process documents into chunks with embeddings for
* vector database storage and semantic search.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ChunkingConfig;
use Kreuzberg\Config\EmbeddingConfig;
// The source file name is used for extraction, record ids and metadata, so it
// is defined once instead of being repeated as a literal.
$sourceFile = 'research_paper.pdf';

// Chunks of at most 500 characters with a 50 character overlap, each with a
// normalized embedding computed in batches of 16.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChars: 500,
        maxOverlap: 50,
        embedding: new EmbeddingConfig(
            model: 'balanced',
            normalize: true,
            batchSize: 16
        )
    )
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile($sourceFile);
echo "RAG Chunking Results:\n";
echo str_repeat('=', 60) . "\n";

// Keep only chunks that carry an embedding; the rest cannot be stored as vectors.
$chunksWithEmbeddings = [];
foreach ($result->chunks ?? [] as $chunk) {
    if ($chunk->embedding === null) {
        continue;
    }
    $chunksWithEmbeddings[] = [
        // mb_substr() cuts on character boundaries, so the preview stays valid text.
        // NOTE(review): assumes chunk content is UTF-8 — confirm against the library.
        'content' => mb_substr($chunk->content, 0, 100, 'UTF-8') . '...',
        'embedding_dims' => count($chunk->embedding),
        'full_content' => $chunk->content,
        'embedding' => $chunk->embedding,
    ];
}
echo "Chunks with embeddings: " . count($chunksWithEmbeddings) . "\n\n";

echo "Sample chunks for vector database:\n";
echo str_repeat('=', 60) . "\n";
foreach (array_slice($chunksWithEmbeddings, 0, 3) as $index => $chunk) {
    echo "Chunk " . ($index + 1) . ":\n";
    echo " Content preview: {$chunk['content']}\n";
    echo " Embedding dimensions: {$chunk['embedding_dims']}\n";
    echo " Ready for vector DB: Yes\n\n";
}

// The document hash is the same for every record, so compute it once.
$documentHash = md5($sourceFile);
$vectorDbRecords = [];
foreach ($chunksWithEmbeddings as $idx => $chunk) {
    $vectorDbRecords[] = [
        'id' => sprintf('doc_%s_chunk_%d', $documentHash, $idx),
        'content' => $chunk['full_content'],
        'embedding' => $chunk['embedding'],
        'metadata' => [
            'source' => $sourceFile,
            'chunk_index' => $idx,
            // Character count, not byte count, to match the field name.
            'char_count' => mb_strlen($chunk['full_content'], 'UTF-8'),
        ],
    ];
}
echo "Prepared " . count($vectorDbRecords) . " records for vector database\n";
echo "Each record contains: id, content, embedding, and metadata\n";
```

View File

@@ -0,0 +1,81 @@
```php title="embedding_with_chunking.php"
<?php
declare(strict_types=1);
/**
* Embedding Generation with Chunking
*
* Configure chunking with automatic embedding generation for each chunk.
* Ideal for semantic search, similarity matching, and vector databases.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ChunkingConfig;
use Kreuzberg\Config\EmbeddingConfig;
// Chunks of at most 1024 characters with a 100 character overlap; embeddings
// are normalized and computed in batches of 32 without download progress output.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChars: 1024,
        maxOverlap: 100,
        embedding: new EmbeddingConfig(
            model: 'balanced',
            normalize: true,
            batchSize: 32,
            showDownloadProgress: false
        )
    )
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

// Resolve the nullable chunk list once; every statistic below uses the same list.
$chunks = $result->chunks ?? [];
$totalChunks = count($chunks);

echo "Embedding Generation Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Total chunks: " . $totalChunks . "\n\n";

$chunksWithEmbeddings = 0;
// Size of the most recently seen embedding vector (a value, not a running total).
$embeddingDimensions = 0;
foreach ($chunks as $chunk) {
    if ($chunk->embedding !== null) {
        $chunksWithEmbeddings++;
        $embeddingDimensions = count($chunk->embedding);
    }
}
echo "Chunks with embeddings: $chunksWithEmbeddings\n";
echo "Embedding dimensions: $embeddingDimensions\n";

// $chunksWithEmbeddings > 0 implies $totalChunks > 0, so the division is safe.
$coverage = $chunksWithEmbeddings > 0
    ? sprintf("%.1f%%", ($chunksWithEmbeddings / $totalChunks) * 100)
    : "0%";
echo "Coverage: " . $coverage . "\n\n";

// Show the first chunk as a sample, but only if it exists and has an embedding.
$sampleChunk = $chunks[0] ?? null;
if ($sampleChunk !== null && $sampleChunk->embedding !== null) {
    echo "Sample Chunk:\n";
    echo str_repeat('=', 60) . "\n";
    // mb_* functions count characters rather than bytes.
    // NOTE(review): assumes chunk content is UTF-8 — confirm against the library.
    echo "Content preview: " . mb_substr($sampleChunk->content, 0, 150, 'UTF-8') . "...\n";
    echo "Content length: " . mb_strlen($sampleChunk->content, 'UTF-8') . " chars\n";
    echo "Embedding dimensions: " . count($sampleChunk->embedding) . "\n";
    echo "First 5 embedding values: [";
    echo implode(', ', array_map(
        fn($v) => number_format($v, 4),
        array_slice($sampleChunk->embedding, 0, 5)
    ));
    echo ", ...]\n\n";
}

if ($totalChunks > 0) {
    $totalChars = array_sum(array_map(
        fn($chunk) => mb_strlen($chunk->content, 'UTF-8'),
        $chunks
    ));
    $avgChunkSize = $totalChars / $totalChunks;
    echo "Average chunk size: " . round($avgChunkSize) . " characters\n";
}
```

View File

@@ -0,0 +1,114 @@
```php title="error_handling.php"
<?php
declare(strict_types=1);
/**
* Comprehensive Error Handling
*
* Demonstrate proper error handling for document extraction operations.
* Shows how to catch and handle different types of Kreuzberg exceptions.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Exceptions\KreuzbergException;
use Kreuzberg\Exceptions\ParsingException;
use Kreuzberg\Exceptions\OcrException;
use Kreuzberg\Exceptions\ValidationException;
// Extractor created without an explicit configuration; it is reused by every example below.
$kreuzberg = new Kreuzberg();
// Example 1: file extraction with one catch clause per failure category.
// The specific exception types are listed before KreuzbergException.
// NOTE(review): presumably ParsingException and OcrException extend
// KreuzbergException, which would make this order significant — confirm.
try {
$result = $kreuzberg->extractFile('document.pdf');
// strlen() reports the size in bytes.
echo "Extracted " . strlen($result->content) . " characters\n";
} catch (ParsingException $e) {
echo "Failed to parse document: " . $e->getMessage() . "\n";
echo "Error code: " . $e->getCode() . "\n";
} catch (OcrException $e) {
echo "OCR processing failed: " . $e->getMessage() . "\n";
echo "Suggestion: Check if document is scanned and OCR is properly configured\n";
} catch (KreuzbergException $e) {
echo "Extraction error: " . $e->getMessage() . "\n";
// Surface the underlying cause when the exception wraps another one.
if ($e->getPrevious() !== null) {
echo "Caused by: " . $e->getPrevious()->getMessage() . "\n";
}
}
// Example 2: extraction from an in-memory byte string with an explicit MIME type.
try {
$config = new ExtractionConfig();
$pdfBytes = file_get_contents('sample.pdf');
// file_get_contents() returns false on failure; turn that into an exception
// so it is handled by the \RuntimeException clause below.
if ($pdfBytes === false) {
throw new \RuntimeException('Failed to read file');
}
$result = $kreuzberg->extractBytes($pdfBytes, 'application/pdf', $config);
echo "Extracted from bytes: " . substr($result->content, 0, 100) . "...\n";
} catch (ValidationException $e) {
echo "Invalid configuration or input: " . $e->getMessage() . "\n";
echo "Details: " . $e->getFile() . " at line " . $e->getLine() . "\n";
} catch (OcrException $e) {
echo "OCR failed: " . $e->getMessage() . "\n";
} catch (KreuzbergException $e) {
echo "Extraction failed: " . $e->getMessage() . "\n";
} catch (\RuntimeException $e) {
echo "File system error: " . $e->getMessage() . "\n";
}
// Example 3: batch processing where one failing file must not stop the others.
$files = ['doc1.pdf', 'corrupted.pdf', 'doc3.docx'];
// Results keyed by file name.
$successfulExtractions = [];
// Error message and exception class keyed by file name.
$failedExtractions = [];
foreach ($files as $file) {
try {
$result = $kreuzberg->extractFile($file);
$successfulExtractions[$file] = $result;
echo "Success: $file\n";
} catch (KreuzbergException $e) {
$failedExtractions[$file] = [
'error' => $e->getMessage(),
'type' => get_class($e),
];
echo "Failed: $file - " . $e->getMessage() . "\n";
}
}
// Summary of the batch run.
echo "\nResults:\n";
echo "Successful: " . count($successfulExtractions) . "\n";
echo "Failed: " . count($failedExtractions) . "\n";
/**
 * Extract a file, retrying only when OCR fails.
 *
 * An OcrException triggers another attempt after a one second pause, up to
 * $maxRetries attempts in total. Any other KreuzbergException ends the call
 * immediately. Failures are reported on standard output.
 *
 * @param Kreuzberg $kreuzberg  Extractor used for every attempt
 * @param string    $file       Path of the file to extract
 * @param int       $maxRetries Maximum number of attempts
 * @return \Kreuzberg\Result\ExtractionResult|null The result, or null when all attempts failed
 */
function extractWithRetry(
    Kreuzberg $kreuzberg,
    string $file,
    int $maxRetries = 3
): ?\Kreuzberg\Result\ExtractionResult {
    for ($attempt = 1; $attempt <= $maxRetries; $attempt++) {
        try {
            return $kreuzberg->extractFile($file);
        } catch (OcrException $ocrFailure) {
            if ($attempt < $maxRetries) {
                echo "OCR attempt $attempt failed, retrying...\n";
                sleep(1);
                continue;
            }
            // Attempt budget exhausted: report the last OCR error and give up.
            echo "OCR failed after $maxRetries attempts: " . $ocrFailure->getMessage() . "\n";
            return null;
        } catch (KreuzbergException $fatal) {
            echo "Fatal error (no retry): " . $fatal->getMessage() . "\n";
            return null;
        }
    }

    // Reached only when $maxRetries is zero or negative.
    return null;
}
// Run the retrying helper on a scan and report the size of the extracted text.
$result = extractWithRetry($kreuzberg, 'difficult_scan.pdf');
if (null !== $result) {
    $extractedLength = strlen($result->content);
    echo "Successfully extracted with retry: {$extractedLength} chars\n";
}
```

View File

@@ -0,0 +1,160 @@
```php title="error_handling_extract.php"
<?php
declare(strict_types=1);
/**
* Error Handling for HTTP/API Extraction
*
* Demonstrate error handling when using Kreuzberg extraction via HTTP API.
* Shows how to properly handle HTTP errors and API response errors.
*/
require_once __DIR__ . '/vendor/autoload.php';
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Exception\ClientException;
use GuzzleHttp\Exception\ServerException;
/**
 * Extract a document via the HTTP API with error handling.
 *
 * Uploads the file as multipart form data (field name "files") and decodes the
 * JSON response. Each handled failure is reported on standard output and
 * signalled to the caller by a null return value.
 *
 * @param string $filePath Path to the document file
 * @param string $apiUrl   API endpoint URL
 * @return array|null Decoded extraction results, or null on error
 */
function extractViaApi(string $filePath, string $apiUrl = 'http://localhost:8000/extract'): ?array
{
    $client = new Client([
        'timeout' => 30.0,
        'connect_timeout' => 5.0,
    ]);

    try {
        if (!file_exists($filePath)) {
            throw new \RuntimeException("File not found: $filePath");
        }

        // fopen() returns false when the file cannot be opened (e.g. permissions);
        // fail here instead of handing an invalid stream to the HTTP client.
        $stream = fopen($filePath, 'r');
        if ($stream === false) {
            throw new \RuntimeException("Unable to open file: $filePath");
        }

        $response = $client->post($apiUrl, [
            'multipart' => [
                [
                    'name' => 'files',
                    'contents' => $stream,
                    'filename' => basename($filePath),
                ],
            ],
        ]);

        $results = json_decode($response->getBody()->getContents(), true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \RuntimeException('Invalid JSON response: ' . json_last_error_msg());
        }
        // Valid JSON can still be a scalar or null, which would break count()
        // and the declared ?array return type.
        if (!is_array($results)) {
            throw new \RuntimeException('Unexpected response format: expected a JSON array');
        }

        echo "Success: Extracted " . count($results) . " documents\n";
        return $results;
    } catch (ClientException $e) {
        // 4xx: the API rejected the request; print the structured error if present.
        $response = $e->getResponse();
        $statusCode = $response->getStatusCode();
        $decoded = json_decode($response->getBody()->getContents(), true);
        // A non-JSON error body decodes to null; treat it as "no details".
        $body = is_array($decoded) ? $decoded : [];
        $errorType = $body['error_type'] ?? 'Unknown';
        $message = $body['message'] ?? 'No message provided';
        echo "Client Error ($statusCode): $errorType\n";
        echo "Message: $message\n";
        if (isset($body['details'])) {
            echo "Details: " . json_encode($body['details']) . "\n";
        }
        return null;
    } catch (ServerException $e) {
        // 5xx: the API itself failed.
        $response = $e->getResponse();
        $statusCode = $response->getStatusCode();
        echo "Server Error ($statusCode): " . $e->getMessage() . "\n";
        echo "The API server encountered an error. Please try again later.\n";
        return null;
    } catch (\GuzzleHttp\Exception\ConnectException $e) {
        // In Guzzle 7 a ConnectException is not a RequestException, so it needs
        // its own clause for the "server not running" hint to be printed.
        echo "Request Error: " . $e->getMessage() . "\n";
        echo "No response received - check if the API server is running\n";
        return null;
    } catch (RequestException $e) {
        echo "Request Error: " . $e->getMessage() . "\n";
        if ($e->hasResponse()) {
            echo "Response code: " . $e->getResponse()->getStatusCode() . "\n";
        } else {
            echo "No response received - check if the API server is running\n";
        }
        return null;
    } catch (\RuntimeException $e) {
        // Local failures raised above (missing file, unreadable file, bad payload).
        echo "Runtime Error: " . $e->getMessage() . "\n";
        return null;
    }
}
// Single extraction attempt, followed by a short summary for each returned document.
echo "Attempting to extract document via API...\n", str_repeat('=', 60), "\n";
$result = extractViaApi('document.pdf');
if ($result === null) {
    echo "\nExtraction failed. Check the error messages above.\n";
} else {
    foreach ($result as $doc) {
        $mimeType = $doc['mime_type'] ?? 'unknown';
        $contentLength = strlen($doc['content'] ?? '');
        echo "\nDocument extracted:\n";
        echo " Content length: {$contentLength} characters\n";
        echo " MIME type: {$mimeType}\n";
        if (isset($doc['metadata'])) {
            $metadataKeys = array_keys($doc['metadata']);
            echo " Metadata keys: " . implode(', ', $metadataKeys) . "\n";
        }
    }
}
/**
 * Call extractViaApi() repeatedly with exponential backoff.
 *
 * A null result from extractViaApi() counts as a failed attempt. Between
 * attempts the function sleeps for the current delay, which doubles each time.
 *
 * @param string $filePath     Path to the document file
 * @param string $apiUrl       API endpoint URL
 * @param int    $maxRetries   Maximum number of attempts
 * @param float  $initialDelay Seconds to wait before the second attempt
 * @return array|null Extraction results, or null once every attempt has failed
 */
function extractWithRetry(
    string $filePath,
    string $apiUrl = 'http://localhost:8000/extract',
    int $maxRetries = 3,
    float $initialDelay = 1.0
): ?array {
    $delay = $initialDelay;

    for ($attempt = 1; $attempt <= $maxRetries; $attempt++) {
        $result = extractViaApi($filePath, $apiUrl);
        if ($result !== null) {
            return $result;
        }

        // Sleep only when another attempt will follow.
        if ($attempt < $maxRetries) {
            echo "\nRetrying in " . number_format($delay, 1) . " seconds... (Attempt " . ($attempt + 1) . "/$maxRetries)\n";
            usleep((int)($delay * 1000000));
            $delay *= 2;
        }
    }

    echo "\nFailed after $maxRetries attempts\n";
    return null;
}
// Same extraction again, this time through the retrying wrapper.
$divider = str_repeat('=', 60);
echo "\n{$divider}\n";
echo "Extracting with retry logic...\n";
echo "{$divider}\n";
$resultWithRetry = extractWithRetry('document.pdf', 'http://localhost:8000/extract');
if (null !== $resultWithRetry) {
    echo "\nSuccessfully extracted with retry mechanism\n";
}
```

View File

@@ -0,0 +1,134 @@
```php title="image_extraction.php"
<?php
declare(strict_types=1);
/**
* Image Extraction from Documents
*
* Extract embedded images from PDF and other document formats.
* Demonstrates saving images, analyzing metadata, and processing image data.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PdfConfig;
use Kreuzberg\Result\ExtractedImage;
// Enable image extraction at the top level and inside the PDF-specific settings.
$config = new ExtractionConfig(
    extractImages: true,
    pdf: new PdfConfig(
        extractImages: true,
        imageQuality: 90
    )
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document_with_images.pdf');

echo "Image Extraction Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Total images extracted: " . count($result->images ?? []) . "\n\n";

// Create the output directory and stop early if that is not possible, rather
// than failing later once per image. The second is_dir() check tolerates
// another process creating the directory at the same moment.
$outputDir = './extracted_images';
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    throw new \RuntimeException("Unable to create output directory: $outputDir");
}
// Write every extracted image to disk and print its basic properties.
foreach ($result->images ?? [] as $index => $image) {
    echo "Image " . ($index + 1) . ":\n";
    echo str_repeat('-', 40) . "\n";

    // Resolve the fallback once so the file extension and the printed format agree.
    $format = $image->format ?? 'png';
    $filename = sprintf(
        'page_%d_image_%d.%s',
        $image->pageNumber ?? 0,
        $image->imageIndex ?? $index,
        $format
    );
    $filepath = $outputDir . '/' . $filename;

    // file_put_contents() returns the number of bytes written, or false on failure.
    $bytesWritten = file_put_contents($filepath, $image->data);
    if ($bytesWritten === false) {
        echo " Error: Failed to save image\n\n";
        continue;
    }

    echo " Saved: $filename\n";
    echo " Size: {$image->width}x{$image->height} pixels\n";
    echo " Format: {$format}\n";
    echo " File size: " . number_format($bytesWritten) . " bytes\n";
    echo " Page: " . ($image->pageNumber ?? 'N/A') . "\n";

    // Skip shape analysis for images without usable dimensions (avoids division by zero).
    if ($image->width > 0 && $image->height > 0) {
        $aspectRatio = $image->width / $image->height;
        echo " Aspect ratio: " . number_format($aspectRatio, 2) . ":1\n";
        $orientation = $image->width > $image->height ? 'Landscape' : 'Portrait';
        // Sides within 10 pixels of each other are reported as square.
        if (abs($image->width - $image->height) < 10) {
            $orientation = 'Square';
        }
        echo " Orientation: $orientation\n";
    }
    echo "\n";
}
// Aggregate statistics over all extracted images, gathered in a single pass.
echo "Image Analysis:\n";
echo str_repeat('=', 60) . "\n";
if (!empty($result->images)) {
    $largeImageCount = 0;
    $totalBytes = 0;
    $totalWidth = 0;
    $totalHeight = 0;
    $formatCounts = [];

    foreach ($result->images as $image) {
        // "Large" means either side exceeds 800 pixels.
        if ($image->width > 800 || $image->height > 800) {
            $largeImageCount++;
        }
        $totalBytes += strlen($image->data);
        $totalWidth += $image->width;
        $totalHeight += $image->height;

        $format = $image->format ?? 'unknown';
        $formatCounts[$format] = ($formatCounts[$format] ?? 0) + 1;
    }

    echo "Large images (>800px): " . $largeImageCount . "\n";
    echo "Total image data: " . number_format($totalBytes / 1024, 2) . " KB\n";

    echo "\nImages by format:\n";
    foreach ($formatCounts as $format => $count) {
        echo " $format: $count\n";
    }

    $imageCount = count($result->images);
    echo "\nAverage dimensions: " .
        round($totalWidth / $imageCount) . "x" .
        round($totalHeight / $imageCount) . " pixels\n";
}
/**
 * Describe the thumbnail size an image would be scaled down to.
 *
 * No image data is produced: the function only computes the target size that
 * keeps the aspect ratio at the requested width.
 *
 * @param ExtractedImage $image    Image whose width and height are read
 * @param int            $maxWidth Target thumbnail width in pixels
 * @return string|null Description of the thumbnail size, or null when the
 *                     image is already no wider than $maxWidth
 */
function createThumbnail(ExtractedImage $image, int $maxWidth = 200): ?string
{
    if ($image->width <= $maxWidth) {
        return null;
    }

    // Scale the height by the same factor as the width, truncating to whole pixels.
    $scaledHeight = (int)($image->height * ($maxWidth / $image->width));

    return sprintf('Thumbnail would be: %dx%d', $maxWidth, $scaledHeight);
}
// List a thumbnail suggestion for every image wider than 200 pixels.
echo "\nThumbnail recommendations:\n";
foreach ($result->images ?? [] as $index => $image) {
    $thumbInfo = createThumbnail($image, 200);
    if ($thumbInfo === null) {
        continue;
    }
    $position = $index + 1;
    echo " Image {$position}: $thumbInfo\n";
}
```

View File

@@ -0,0 +1,187 @@
```php title="image_preprocessing.php"
<?php
declare(strict_types=1);
/**
* Image Preprocessing for OCR
*
* Configure image preprocessing settings to improve OCR accuracy on scanned documents.
* Demonstrates various preprocessing techniques like denoising, deskewing, and contrast enhancement.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Config\TesseractConfig;
use Kreuzberg\Config\ImagePreprocessingConfig;
// Baseline run: 300 DPI with denoising, deskewing, contrast enhancement and
// Otsu binarization applied before OCR.
$basicPreprocessing = new ImagePreprocessingConfig(
    targetDpi: 300,
    denoise: true,
    deskew: true,
    contrastEnhance: true,
    binarizationMethod: 'otsu'
);
$basicTesseract = new TesseractConfig(preprocessing: $basicPreprocessing);
$config = new ExtractionConfig(
    ocr: new OcrConfig(tesseractConfig: $basicTesseract)
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned.pdf');

echo "OCR with Image Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
printf("Content extracted: %d characters\n", strlen($result->content));
printf("Preview: %s...\n\n", substr($result->content, 0, 100));
// Second pass for poor scans: higher DPI, adaptive binarization, sharpening and
// background removal. The DPI is kept in a variable so the recommendation text
// below always reports the value that was actually configured.
$advancedTargetDpi = 600;
$advancedConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            preprocessing: new ImagePreprocessingConfig(
                targetDpi: $advancedTargetDpi,
                denoise: true,
                deskew: true,
                contrastEnhance: true,
                binarizationMethod: 'adaptive',
                sharpen: true,
                removeBackground: true
            ),
            pageSegmentationMode: 3,
            engineMode: 3
        )
    )
);
$kreuzberg = new Kreuzberg($advancedConfig);
$result = $kreuzberg->extractFile('poor_quality_scan.pdf');

echo "Advanced Preprocessing Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Content length: " . strlen($result->content) . " characters\n";

if (isset($result->metadata)) {
    $qualityScore = $result->qualityScore ?? null;
    $confidence = $result->metadata['ocr_confidence'] ?? null;

    if ($qualityScore !== null) {
        echo "Quality score: " . number_format($qualityScore, 2) . "\n";
        // Scores below 0.5 are treated as low quality and trigger tuning advice.
        if ($qualityScore < 0.5) {
            echo "Warning: Low quality extraction detected\n";
            echo "Recommendations:\n";
            echo " - Increase target DPI (current: {$advancedTargetDpi})\n";
            echo " - Try different binarization method\n";
            echo " - Consider rescanning the original document\n";
        }
    }

    if ($confidence !== null) {
        // The confidence is multiplied by 100 for display.
        // NOTE(review): assumes a 0..1 fraction — confirm against the library.
        echo "OCR confidence: " . number_format($confidence * 100, 1) . "%\n";
    }
}
echo "\n";
// Three named presets, from no clean-up at all to the most aggressive pipeline.
$preprocessingProfiles = [
    'basic' => new ImagePreprocessingConfig(
        targetDpi: 300,
        denoise: false,
        deskew: false,
        contrastEnhance: false
    ),
    'balanced' => new ImagePreprocessingConfig(
        targetDpi: 300,
        denoise: true,
        deskew: true,
        contrastEnhance: true,
        binarizationMethod: 'otsu'
    ),
    'aggressive' => new ImagePreprocessingConfig(
        targetDpi: 600,
        denoise: true,
        deskew: true,
        contrastEnhance: true,
        binarizationMethod: 'adaptive',
        sharpen: true,
        removeBackground: true
    ),
];

echo "Preprocessing Profile Comparison:\n";
echo str_repeat('=', 60) . "\n";

// Run the same scan through each preset, reporting output size and wall-clock time.
foreach ($preprocessingProfiles as $profileName => $preprocessing) {
    $profileTesseract = new TesseractConfig(preprocessing: $preprocessing);
    $profileConfig = new ExtractionConfig(
        ocr: new OcrConfig(tesseractConfig: $profileTesseract)
    );
    $kreuzberg = new Kreuzberg($profileConfig);

    // Only the extraction call itself is timed.
    $startTime = microtime(true);
    $result = $kreuzberg->extractFile('sample_scan.pdf');
    $elapsedTime = microtime(true) - $startTime;

    $denoiseLabel = $preprocessing->denoise ? 'Yes' : 'No';
    $deskewLabel = $preprocessing->deskew ? 'Yes' : 'No';
    $binarizationLabel = $preprocessing->binarizationMethod ?? 'None';

    echo ucfirst($profileName) . " profile:\n";
    echo " Content length: " . strlen($result->content) . " characters\n";
    echo " Processing time: " . number_format($elapsedTime, 3) . " seconds\n";
    echo " Settings:\n";
    echo " - DPI: {$preprocessing->targetDpi}\n";
    echo " - Denoise: {$denoiseLabel}\n";
    echo " - Deskew: {$deskewLabel}\n";
    echo " - Binarization: {$binarizationLabel}\n";
    echo "\n";
}
/**
 * Pick a preprocessing preset for a document type.
 *
 * Recognised types are 'modern_scan', 'old_document' and 'newspaper'; any other
 * value gets the default preset (300 DPI, Otsu binarization, all clean-up steps on).
 *
 * @param string $documentType Document type identifier
 * @return ImagePreprocessingConfig Preset for the given type
 */
function recommendPreprocessingSettings(string $documentType): ImagePreprocessingConfig
{
    return match ($documentType) {
        // Clean scan: no contrast enhancement.
        'modern_scan' => new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            deskew: true,
            contrastEnhance: false,
            binarizationMethod: 'otsu',
        ),
        // Old document: highest DPI, adaptive thresholding, background removal.
        'old_document' => new ImagePreprocessingConfig(
            targetDpi: 600,
            denoise: true,
            deskew: true,
            contrastEnhance: true,
            binarizationMethod: 'adaptive',
            removeBackground: true,
        ),
        // Newspaper: intermediate DPI with Sauvola binarization.
        'newspaper' => new ImagePreprocessingConfig(
            targetDpi: 400,
            denoise: true,
            deskew: true,
            contrastEnhance: true,
            binarizationMethod: 'sauvola',
            removeBackground: true,
        ),
        // Unknown type: balanced defaults.
        default => new ImagePreprocessingConfig(
            targetDpi: 300,
            denoise: true,
            deskew: true,
            contrastEnhance: true,
            binarizationMethod: 'otsu',
        ),
    };
}
// Show the preset selected for the 'old_document' type.
echo "Recommended preprocessing for old documents:\n";
$recommended = recommendPreprocessingSettings('old_document');
printf(" Target DPI: %s\n", $recommended->targetDpi);
printf(" Binarization: %s\n", $recommended->binarizationMethod);
```

View File

@@ -0,0 +1,200 @@
```php title="keyword_extraction_example.php"
<?php
declare(strict_types=1);
/**
* Keyword Extraction Example
*
* Extract keywords from documents using various algorithms.
* Demonstrates automatic keyword detection for document analysis and indexing.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\KeywordConfig;
use Kreuzberg\Enums\KeywordAlgorithm;
// Ask for at most ten YAKE keywords with a minimum score of 0.3.
$keywordSettings = new KeywordConfig(
    algorithm: KeywordAlgorithm::YAKE,
    maxKeywords: 10,
    minScore: 0.3
);
$config = new ExtractionConfig(keywords: $keywordSettings);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('research_paper.pdf');

echo "Keyword Extraction Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Document: research_paper.pdf\n";
printf("Content length: %d characters\n\n", strlen($result->content));

// Keywords are read from the result metadata; a missing entry means none were found.
$keywords = $result->metadata['keywords'] ?? [];
if (empty($keywords)) {
    echo "No keywords extracted. Try adjusting minScore or maxKeywords.\n\n";
} else {
    echo "Extracted Keywords:\n";
    echo str_repeat('-', 40) . "\n";
    foreach ($keywords as $keyword) {
        $text = $keyword['text'] ?? '';
        $score = $keyword['score'] ?? 0.0;
        $frequency = $keyword['frequency'] ?? null;

        printf(" %-30s Score: %.3f", $text, $score);
        // The frequency is optional and only printed when present.
        if ($frequency !== null) {
            printf(" (appears %d times)", $frequency);
        }
        echo "\n";
    }
    echo "\n";
}
// Run the same article through each supported algorithm and compare the output.
$algorithms = [
    'YAKE' => KeywordAlgorithm::YAKE,
    'TextRank' => KeywordAlgorithm::TEXT_RANK,
    'TF-IDF' => KeywordAlgorithm::TF_IDF,
];
echo "Algorithm Comparison:\n";
echo str_repeat('=', 60) . "\n";
foreach ($algorithms as $name => $algorithm) {
    $algoConfig = new ExtractionConfig(
        keywords: new KeywordConfig(
            algorithm: $algorithm,
            maxKeywords: 5,
            minScore: 0.2
        )
    );

    // Loop-local names on purpose: $kreuzberg, $result and $keywords still hold
    // the research paper extraction, which the script categorizes afterwards.
    $algoExtractor = new Kreuzberg($algoConfig);
    $algoResult = $algoExtractor->extractFile('article.pdf');
    $algoKeywords = $algoResult->metadata['keywords'] ?? [];

    echo "$name algorithm:\n";
    if (!empty($algoKeywords)) {
        foreach ($algoKeywords as $algoKeyword) {
            echo " - {$algoKeyword['text']} ({$algoKeyword['score']})\n";
        }
    } else {
        echo " No keywords extracted\n";
    }
    echo "\n";
}
/**
 * Assign a document to a topic category based on its keywords.
 *
 * Each keyword adds its score to a category once for every category term that
 * occurs in the lower-cased keyword text. The category with the highest total
 * wins; ties go to the category listed first.
 *
 * @param array $keywords List of keyword entries with optional 'text' and 'score' keys
 * @return string 'technical', 'business', 'scientific' or 'legal', or
 *                'uncategorized' when no keyword matches any category term
 */
function categorizeDocument(array $keywords): string
{
    $categories = [
        'technical' => ['algorithm', 'system', 'implementation', 'performance', 'architecture'],
        'business' => ['revenue', 'market', 'customer', 'strategy', 'investment'],
        'scientific' => ['research', 'study', 'analysis', 'experiment', 'hypothesis'],
        'legal' => ['contract', 'agreement', 'liability', 'clause', 'provision'],
    ];

    $scores = [];
    foreach ($categories as $category => $terms) {
        $scores[$category] = 0;
        foreach ($keywords as $keyword) {
            $keywordText = strtolower($keyword['text'] ?? '');
            $keywordScore = $keyword['score'] ?? 0.0;
            foreach ($terms as $term) {
                if (str_contains($keywordText, $term)) {
                    $scores[$category] += $keywordScore;
                }
            }
        }
    }

    // Highest score first; arsort() keeps the declaration order for equal scores.
    arsort($scores);
    $topCategory = array_key_first($scores);

    // Every category always has an entry, so "nothing matched" shows up as a top
    // score of zero rather than as a missing key.
    if ($topCategory === null || $scores[$topCategory] <= 0) {
        return 'uncategorized';
    }

    return $topCategory;
}
// Categorize whatever keyword list is currently held in $keywords.
if (!empty($keywords)) {
    $category = categorizeDocument($keywords);
    printf("Document Category: %s\n\n", ucfirst($category));
}

// Batch run: one shared extractor, top keywords and a category per document.
$documents = [
    'tech_article.pdf',
    'business_report.pdf',
    'research_paper.pdf',
];
$batchKeywordSettings = new KeywordConfig(
    algorithm: KeywordAlgorithm::YAKE,
    maxKeywords: 8,
    minScore: 0.25
);
$keywordConfig = new ExtractionConfig(keywords: $batchKeywordSettings);
$kreuzberg = new Kreuzberg($keywordConfig);

echo "Batch Keyword Extraction:\n";
echo str_repeat('=', 60) . "\n";
foreach ($documents as $document) {
    // Missing files are reported and skipped instead of aborting the batch.
    if (!file_exists($document)) {
        echo "$document: File not found\n\n";
        continue;
    }

    $result = $kreuzberg->extractFile($document);
    $keywords = $result->metadata['keywords'] ?? [];

    echo basename($document) . ":\n";
    if (empty($keywords)) {
        echo " No keywords extracted\n";
    } else {
        // Print the text of the first five keywords.
        $keywordTexts = array_column(array_slice($keywords, 0, 5), 'text');
        echo " Top keywords: " . implode(', ', $keywordTexts) . "\n";
        $category = categorizeDocument($keywords);
        echo " Category: " . ucfirst($category) . "\n";
    }
    echo "\n";
}
// Build an inverted index: lower-cased keyword text => documents that contain it.
$keywordIndex = [];
foreach ($documents as $document) {
    if (!file_exists($document)) {
        continue;
    }
    $result = $kreuzberg->extractFile($document);
    $keywords = $result->metadata['keywords'] ?? [];
    foreach ($keywords as $keyword) {
        $text = strtolower($keyword['text'] ?? '');
        // A keyword without text would otherwise be indexed under an empty key.
        if ($text === '') {
            continue;
        }
        $keywordIndex[$text][] = basename($document);
    }
}

echo "Keyword Index (for search):\n";
echo str_repeat('=', 60) . "\n";
// preserve_keys must be true: PHP stores numeric keywords such as "2024" as
// integer keys, and array_slice() renumbers integer keys by default, which
// would print "0", "1", ... instead of the keyword.
foreach (array_slice($keywordIndex, 0, 10, true) as $keyword => $docs) {
    echo "$keyword: " . implode(', ', array_unique($docs)) . "\n";
}
```

View File

@@ -0,0 +1,197 @@
```php title="language_detection.php"
<?php
declare(strict_types=1);
/**
* Language Detection
*
* Automatically detect the language of extracted document content.
* Useful for routing documents to language-specific processing pipelines.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\LanguageDetectionConfig;
// Detect several languages per document, with a minimum confidence of 0.9.
$detectionSettings = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.9,
    detectMultiple: true
);
$config = new ExtractionConfig(languageDetection: $detectionSettings);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

echo "Language Detection Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Document: document.pdf\n";
printf("Content length: %d characters\n\n", strlen($result->content));

$detectedLanguages = $result->detectedLanguages ?? [];
if (empty($detectedLanguages)) {
    echo "No language detected or confidence too low.\n";
    echo "Try lowering minConfidence threshold.\n\n";
} else {
    echo "Detected languages: " . implode(', ', $detectedLanguages) . "\n";
    // The first entry of the list is reported as the primary language.
    $primaryLanguage = $detectedLanguages[0];
    echo "Primary language: $primaryLanguage\n\n";

    if (isset($result->metadata['language_confidence'])) {
        echo "Language confidence scores:\n";
        foreach ($result->metadata['language_confidence'] as $lang => $confidence) {
            printf(" %-10s: %.1f%%\n", $lang, $confidence * 100);
        }
        echo "\n";
    }

    // Map the two- or three-letter code to a display label; unknown codes are
    // printed as they are.
    $processingLabel = match ($primaryLanguage) {
        'en', 'eng' => 'English',
        'es', 'spa' => 'Spanish',
        'fr', 'fra' => 'French',
        'de', 'deu' => 'German',
        'zh', 'zho' => 'Chinese',
        default => $primaryLanguage,
    };
    echo "Processing as {$processingLabel} document...\n";
}
// Re-run detection on the same file with increasingly strict confidence thresholds.
$sectionRule = str_repeat('=', 60);
echo "\n{$sectionRule}\n";
echo "Testing Different Confidence Thresholds:\n";
echo "{$sectionRule}\n";

$thresholds = [0.5, 0.7, 0.9, 0.95];
foreach ($thresholds as $threshold) {
    $thresholdConfig = new ExtractionConfig(
        languageDetection: new LanguageDetectionConfig(
            enabled: true,
            minConfidence: $threshold,
            detectMultiple: true
        )
    );
    $kreuzberg = new Kreuzberg($thresholdConfig);
    $result = $kreuzberg->extractFile('document.pdf');
    $languages = $result->detectedLanguages ?? [];

    printf("Threshold %.2f: ", $threshold);
    echo empty($languages)
        ? "No languages detected\n"
        : implode(', ', $languages) . "\n";
}
/**
 * Translate a language code into an English display name.
 *
 * Accepts the two-letter codes listed below and their three-letter equivalents
 * (e.g. 'en' and 'eng'), matching the code forms handled elsewhere in this
 * script. The lookup is case-insensitive.
 *
 * @param string $code Language code
 * @return string The language name, or the code with its first letter upper-cased
 *                when the code is not known
 */
function getLanguageName(string $code): string
{
    $languageNames = [
        'en' => 'English',
        'es' => 'Spanish',
        'fr' => 'French',
        'de' => 'German',
        'it' => 'Italian',
        'pt' => 'Portuguese',
        'ru' => 'Russian',
        'zh' => 'Chinese',
        'ja' => 'Japanese',
        'ko' => 'Korean',
        'ar' => 'Arabic',
        'hi' => 'Hindi',
        'nl' => 'Dutch',
        'pl' => 'Polish',
        'tr' => 'Turkish',
    ];

    // Three-letter codes mapped onto the two-letter keys above.
    $threeLetterAliases = [
        'eng' => 'en',
        'spa' => 'es',
        'fra' => 'fr',
        'deu' => 'de',
        'ita' => 'it',
        'por' => 'pt',
        'rus' => 'ru',
        'zho' => 'zh',
        'jpn' => 'ja',
        'kor' => 'ko',
        'ara' => 'ar',
        'hin' => 'hi',
        'nld' => 'nl',
        'pol' => 'pl',
        'tur' => 'tr',
    ];

    $normalized = strtolower($code);
    $normalized = $threeLetterAliases[$normalized] ?? $normalized;

    // Unknown codes fall back to the caller's original spelling.
    return $languageNames[$normalized] ?? ucfirst($code);
}
// Print the languages detected in the first run using readable names.
$headingRule = str_repeat('=', 60);
echo "\n{$headingRule}\n";
echo "Detected Languages (Full Names):\n";
echo "{$headingRule}\n";
if (empty($detectedLanguages)) {
    echo "No languages detected.\n";
} else {
    foreach ($detectedLanguages as $langCode) {
        echo " - " . getLanguageName($langCode) . " ($langCode)\n";
    }
}

// Batch run: report a single primary language for each document.
$documents = [
    'english_doc.pdf',
    'spanish_doc.pdf',
    'german_doc.pdf',
];
echo "\n{$headingRule}\n";
echo "Batch Language Detection:\n";
echo "{$headingRule}\n";

$singleLanguageSettings = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false
);
$detectionConfig = new ExtractionConfig(languageDetection: $singleLanguageSettings);
$kreuzberg = new Kreuzberg($detectionConfig);

foreach ($documents as $document) {
    $documentLabel = basename($document);
    // Missing files are reported and skipped.
    if (!file_exists($document)) {
        echo $documentLabel . ": File not found\n";
        continue;
    }

    $result = $kreuzberg->extractFile($document);
    $languages = $result->detectedLanguages ?? [];

    echo $documentLabel . ": ";
    if (empty($languages)) {
        echo "Language not detected\n";
    } else {
        $primaryLang = $languages[0];
        echo getLanguageName($primaryLang) . " ($primaryLang)\n";
    }
}
/**
 * Choose a processing queue from the first detected language.
 *
 * @param string $filePath          Path of the document (not used for the decision)
 * @param array  $detectedLanguages Language codes; only the first one is considered
 * @return string Queue name: 'default_queue' when no language was detected,
 *                'multilingual_queue' when the language has no dedicated queue
 */
function routeDocumentByLanguage(string $filePath, array $detectedLanguages): string
{
    if ($detectedLanguages === []) {
        return 'default_queue';
    }

    // Queue name => language codes served by that queue.
    $queueLanguages = [
        'english_processing_queue' => ['en', 'eng'],
        'spanish_processing_queue' => ['es', 'spa'],
        'french_processing_queue' => ['fr', 'fra'],
        'german_processing_queue' => ['de', 'deu'],
        'cjk_processing_queue' => ['zh', 'zho', 'ja', 'jpn', 'ko', 'kor'],
        'rtl_processing_queue' => ['ar', 'ara', 'he', 'heb'],
    ];

    $primaryCode = $detectedLanguages[0];
    foreach ($queueLanguages as $queueName => $codes) {
        // Strict comparison: only exact string matches select a queue.
        if (in_array($primaryCode, $codes, true)) {
            return $queueName;
        }
    }

    return 'multilingual_queue';
}
// Route the first document using the languages detected at the start of the script.
$routingRule = str_repeat('=', 60);
echo "\n{$routingRule}\n";
echo "Document Routing Based on Language:\n";
echo "{$routingRule}\n";
if (!empty($detectedLanguages)) {
    $queue = routeDocumentByLanguage('document.pdf', $detectedLanguages);
    printf("Document routed to: %s\n", $queue);
}
```

View File

@@ -0,0 +1,236 @@
```php title="language_detection_multilingual.php"
<?php
declare(strict_types=1);
/**
* Multilingual Document Language Detection
*
* Detect multiple languages in documents that contain mixed-language content.
* Useful for processing multilingual documents, translations, and international content.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\LanguageDetectionConfig;
// Detect every language present, accepting matches with confidence of 0.7 or more.
$multiDetectionSettings = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.7,
    detectMultiple: true
);
$config = new ExtractionConfig(languageDetection: $multiDetectionSettings);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('multilingual_document.pdf');

echo "Multilingual Language Detection:\n";
echo str_repeat('=', 60) . "\n";
echo "Document: multilingual_document.pdf\n\n";

$detectedLanguages = $result->detectedLanguages ?? [];
$languageCount = count($detectedLanguages);
echo "Detected $languageCount language(s): " . implode(', ', $detectedLanguages) . "\n\n";

if ($languageCount === 0) {
    echo "No languages detected.\n\n";
} elseif ($languageCount === 1) {
    echo "This is a monolingual document.\n";
    echo "Language: {$detectedLanguages[0]}\n\n";
} else {
    echo "This is a multilingual document.\n";
    echo "Languages present:\n";
    // The first language is labelled primary, all others secondary.
    foreach ($detectedLanguages as $position => $language) {
        $role = $position === 0 ? 'Primary' : 'Secondary';
        echo " $role: $language\n";
    }
    echo "\n";
}
// Draw one horizontal bar per language, when the metadata has a distribution.
if (isset($result->metadata['language_distribution'])) {
    $barWidth = 40;
    echo "Language Distribution:\n";
    echo str_repeat('-', 40) . "\n";
    foreach ($result->metadata['language_distribution'] as $lang => $percentage) {
        // NOTE(review): treats $percentage as a 0..1 fraction — confirm against the library.
        // Clamp the filled length so an out-of-range value cannot overflow the
        // bar or pass a negative count to str_repeat().
        $filled = max(0, min($barWidth, (int)($percentage * $barWidth)));

        // Pad by hand: sprintf() widths count bytes and '█' is a multi-byte
        // character, so "%-40s" would not pad the bar to 40 visible columns.
        $bar = str_repeat('█', $filled) . str_repeat(' ', $barWidth - $filled);

        echo sprintf(
            " %-10s [%s] %5.1f%%\n",
            $lang,
            $bar,
            $percentage * 100
        );
    }
    echo "\n";
}
/**
 * Classify a document by the number of languages detected in it.
 *
 * @param array $languages Detected language codes (order does not matter).
 * @return string 'unknown' for none, 'monolingual' for one, a named pair or
 *                'bilingual' for two, and 'multilingual' for three or more.
 */
function categorizeMultilingualDocument(array $languages): string
{
    $count = count($languages);
    if ($count === 0) {
        return 'unknown';
    }
    if ($count === 1) {
        return 'monolingual';
    }
    if ($count === 2) {
        // Sorting makes the lookup independent of detection order.
        sort($languages);
        $pair = implode('-', $languages);
        // Keys must be written in sorted order to match $pair: English-German
        // sorts to 'de-en', so an 'en-de' key would never be found.
        $commonPairs = [
            'en-es' => 'English-Spanish bilingual',
            'en-fr' => 'English-French bilingual',
            'de-en' => 'English-German bilingual',
            'en-zh' => 'English-Chinese bilingual',
        ];
        return $commonPairs[$pair] ?? 'bilingual';
    }
    return 'multilingual';
}
// Classify the document extracted above and print the resulting label.
$docType = categorizeMultilingualDocument($detectedLanguages);
echo "Document type: $docType\n\n";
// Processing advice is printed only when more than one language was detected.
if ($languageCount > 1) {
echo "Multilingual Processing Recommendations:\n";
echo str_repeat('=', 60) . "\n";
echo "1. Consider splitting content by language\n";
echo "2. Use language-specific OCR models if available\n";
echo "3. Apply appropriate tokenization for each language\n";
echo "4. Use multilingual embedding models for semantic search\n\n";
}
/**
 * Group the non-blank lines of a document under a language key.
 *
 * No per-line language detection is performed: every line is filed under the
 * first entry of $languages, or 'unknown' when the list is empty.
 *
 * @param string $content   Text to split on "\n".
 * @param array  $languages Detected language codes; only the first is used.
 * @return array Map of language code to the list of non-blank lines.
 */
function extractLanguageSections(string $content, array $languages): array
{
    $sections = [];
    $currentLang = $languages[0] ?? 'unknown';
    foreach (explode("\n", $content) as $line) {
        // Compare against '' explicitly: empty() would also discard a line that is just "0".
        if (trim($line) === '') {
            continue;
        }
        $sections[$currentLang][] = $line;
    }
    return $sections;
}
// Documents to analyse in the batch run; missing files are reported and skipped below.
$testDocuments = [
'english_only.pdf',
'spanish_english.pdf',
'multilingual_eu.pdf',
];
echo "Batch Multilingual Analysis:\n";
echo str_repeat('=', 60) . "\n";
// The batch run uses a lower confidence floor (0.6) than the single-document run above.
$multilingualConfig = new ExtractionConfig(
languageDetection: new LanguageDetectionConfig(
enabled: true,
minConfidence: 0.6,
detectMultiple: true
)
);
$kreuzberg = new Kreuzberg($multilingualConfig);
// Running tally of documents by number of detected languages.
$statistics = [
'monolingual' => 0,
'bilingual' => 0,
'multilingual' => 0,
];
foreach ($testDocuments as $document) {
if (!file_exists($document)) {
echo basename($document) . ": File not found\n";
continue;
}
$result = $kreuzberg->extractFile($document);
$languages = $result->detectedLanguages ?? [];
$type = categorizeMultilingualDocument($languages);
echo basename($document) . ":\n";
echo " Languages: " . implode(', ', $languages) . "\n";
echo " Type: $type\n\n";
// Tally by language count; documents with no detected language are not counted in any bucket.
if (count($languages) === 1) {
$statistics['monolingual']++;
} elseif (count($languages) === 2) {
$statistics['bilingual']++;
} elseif (count($languages) > 2) {
$statistics['multilingual']++;
}
}
echo "Statistics:\n";
echo " Monolingual: {$statistics['monolingual']}\n";
echo " Bilingual: {$statistics['bilingual']}\n";
echo " Multilingual: {$statistics['multilingual']}\n\n";
/**
 * Count how often each language pair occurs across a set of documents.
 *
 * Files that do not exist and documents with fewer than two detected
 * languages are ignored. For each remaining document the detected languages
 * are sorted and the first two form the pair key ("aa-bb").
 *
 * @param array     $documents File paths to analyse.
 * @param Kreuzberg $kreuzberg Extractor used for every document.
 * @return array Map of pair key to occurrence count, highest count first.
 */
function analyzeLanguagePairs(array $documents, Kreuzberg $kreuzberg): array
{
    $counts = [];
    foreach ($documents as $path) {
        if (file_exists($path)) {
            $extraction = $kreuzberg->extractFile($path);
            $detected = $extraction->detectedLanguages ?? [];
            if (count($detected) < 2) {
                continue;
            }
            sort($detected);
            $key = implode('-', [$detected[0], $detected[1]]);
            $counts[$key] = ($counts[$key] ?? 0) + 1;
        }
    }
    arsort($counts);
    return $counts;
}
// Display-only table of language-pair codes and their human-readable names.
$translationPairs = [
'en-es' => 'English ↔ Spanish',
'en-fr' => 'English ↔ French',
'en-de' => 'English ↔ German',
'en-zh' => 'English ↔ Chinese',
'en-ja' => 'English ↔ Japanese',
];
echo "Common Translation Pairs:\n";
echo str_repeat('=', 60) . "\n";
foreach ($translationPairs as $code => $name) {
echo " $code: $name\n";
}
echo "\nUse these configurations for translation document processing.\n";
```

View File

@@ -0,0 +1,203 @@
```php title="quality_processing_example.php"
<?php
declare(strict_types=1);
/**
* Quality Processing Example
*
* Enable quality processing to assess and improve extraction quality.
* Useful for detecting low-quality scans and suggesting improvements.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
// Extract a single scanned document with quality processing switched on.
$config = new ExtractionConfig(
enableQualityProcessing: true
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned_document.pdf');
echo "Quality Processing Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Document: scanned_document.pdf\n";
// strlen() reports the byte length of the content.
echo "Content length: " . strlen($result->content) . " characters\n\n";
$qualityScore = $result->qualityScore ?? null;
if ($qualityScore !== null) {
echo "Quality Score: " . number_format($qualityScore, 2) . "\n";
echo "Rating: ";
// Rating bands: >= 0.8 excellent, >= 0.6 good, >= 0.5 fair, otherwise poor.
if ($qualityScore >= 0.8) {
echo "Excellent\n";
echo "Status: ✓ Ready for production use\n";
} elseif ($qualityScore >= 0.6) {
echo "Good\n";
echo "Status: ✓ Acceptable quality\n";
} elseif ($qualityScore >= 0.5) {
echo "Fair\n";
echo "Status: ⚠ May require review\n";
} else {
echo "Poor\n";
echo "Status: ✗ Requires attention\n";
}
echo "\n";
// Improvement advice is printed only for scores in the "poor" band.
if ($qualityScore < 0.5) {
echo "Recommendations for Improvement:\n";
echo str_repeat('-', 40) . "\n";
echo "1. Re-scan with higher DPI (300+ recommended)\n";
echo "2. Ensure original is clean and well-lit\n";
echo "3. Adjust OCR preprocessing settings:\n";
echo " - Enable denoising\n";
echo " - Enable deskewing\n";
echo " - Increase contrast enhancement\n";
echo "4. Try different binarization methods\n";
echo "5. Consider manual review and correction\n\n";
}
} else {
echo "Quality score not available.\n";
echo "Enable quality processing in configuration.\n\n";
}
// Report OCR confidence when the metadata provides it; the value is multiplied by 100 for display.
if (isset($result->metadata['ocr_confidence'])) {
$ocrConfidence = $result->metadata['ocr_confidence'];
echo "OCR Confidence: " . number_format($ocrConfidence * 100, 1) . "%\n\n";
// Warn below a 0.7 confidence threshold.
if ($ocrConfidence < 0.7) {
echo "⚠ Low OCR confidence detected.\n";
echo "The extracted text may contain errors.\n\n";
}
}
// Print every entry of the optional quality_metrics map.
if (isset($result->metadata['quality_metrics'])) {
echo "Detailed Quality Metrics:\n";
echo str_repeat('-', 40) . "\n";
$metrics = $result->metadata['quality_metrics'];
foreach ($metrics as $metric => $value) {
// Numeric values are shown with three decimals; anything else is printed as-is.
// NOTE(review): under strict_types a numeric *string* would be rejected by number_format() — confirm value types.
$formattedValue = is_numeric($value)
? number_format($value, 3)
: $value;
// Metric keys such as "some_metric" are shown as "Some Metric".
echo sprintf(" %-25s: %s\n", ucwords(str_replace('_', ' ', $metric)), $formattedValue);
}
echo "\n";
}
// Documents for the batch quality comparison; missing files are reported and skipped.
$documents = [
'high_quality_scan.pdf',
'medium_quality_scan.pdf',
'low_quality_scan.pdf',
];
echo "Batch Quality Analysis:\n";
echo str_repeat('=', 60) . "\n";
$qualityConfig = new ExtractionConfig(
enableQualityProcessing: true
);
$kreuzberg = new Kreuzberg($qualityConfig);
// Per-document results keyed by file path; reused by the statistics below and the routing section later in the script.
$qualityResults = [];
foreach ($documents as $document) {
if (!file_exists($document)) {
echo basename($document) . ": File not found\n\n";
continue;
}
$result = $kreuzberg->extractFile($document);
// A missing score is treated as 0.0, which lands in the "Poor" band.
$score = $result->qualityScore ?? 0.0;
$qualityResults[$document] = [
'score' => $score,
'content_length' => strlen($result->content),
'result' => $result,
];
echo basename($document) . ":\n";
echo " Quality score: " . number_format($score, 2) . "\n";
echo " Content length: " . strlen($result->content) . " chars\n";
// Same score bands as the single-document rating above.
$indicator = match(true) {
$score >= 0.8 => '✓ Excellent',
$score >= 0.6 => '✓ Good',
$score >= 0.5 => '⚠ Fair',
default => '✗ Poor',
};
echo " Status: $indicator\n\n";
}
// Aggregate statistics are only meaningful when at least one document was processed.
if (!empty($qualityResults)) {
$scores = array_column($qualityResults, 'score');
$avgScore = array_sum($scores) / count($scores);
$maxScore = max($scores);
$minScore = min($scores);
echo "Quality Statistics:\n";
echo str_repeat('-', 40) . "\n";
echo " Average: " . number_format($avgScore, 2) . "\n";
echo " Highest: " . number_format($maxScore, 2) . "\n";
echo " Lowest: " . number_format($minScore, 2) . "\n\n";
// array_filter() preserves keys, so $doc below is still the file path.
$lowQualityDocs = array_filter(
$qualityResults,
fn($result) => $result['score'] < 0.5
);
if (!empty($lowQualityDocs)) {
echo "Documents Requiring Attention:\n";
echo str_repeat('-', 40) . "\n";
foreach ($lowQualityDocs as $doc => $data) {
echo " - " . basename($doc) . " (score: " . number_format($data['score'], 2) . ")\n";
}
echo "\n";
}
}
/**
 * Decide whether a document should be reprocessed.
 *
 * @param float $qualityScore  Quality score of the extraction.
 * @param int   $contentLength Length of the extracted content.
 * @return bool True when the score is below 0.5 or the content is shorter than 100.
 */
function needsReprocessing(float $qualityScore, int $contentLength): bool
{
    if ($qualityScore < 0.5) {
        return true;
    }

    return $contentLength < 100;
}
/**
 * Pick the processing queue for a document from its quality score.
 *
 * @param string $filePath     Path of the document (not used in the decision).
 * @param float  $qualityScore Quality score of the extraction.
 * @return string Queue name: auto (>= 0.8), standard review (>= 0.6),
 *                detailed review (>= 0.5), otherwise manual review.
 */
function routeDocumentByQuality(string $filePath, float $qualityScore): string
{
    if ($qualityScore >= 0.8) {
        return 'auto_processing_queue';
    }
    if ($qualityScore >= 0.6) {
        return 'standard_review_queue';
    }
    if ($qualityScore >= 0.5) {
        return 'detailed_review_queue';
    }

    return 'manual_review_queue';
}
// Route every document processed in the batch run and flag those that need reprocessing.
echo "Document Routing Based on Quality:\n";
echo str_repeat('=', 60) . "\n";
foreach ($qualityResults as $doc => $data) {
$queue = routeDocumentByQuality($doc, $data['score']);
$reprocess = needsReprocessing($data['score'], $data['content_length']);
echo basename($doc) . ":\n";
echo " Route to: $queue\n";
if ($reprocess) {
echo " Action: Reprocess with enhanced settings\n";
} else {
echo " Action: Continue standard workflow\n";
}
echo "\n";
}
```

View File

@@ -0,0 +1,20 @@
```php
<?php
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\EmbeddingConfig;
use Kreuzberg\Config\EmbeddingModelType;
// Generate embeddings for plain strings, first with defaults and then with an explicit preset.
$kreuzberg = new Kreuzberg();
// Embed with default config (balanced preset)
$embeddings = $kreuzberg->embed(["Hello world", "How are you?"]);
// Embed with specific preset
$config = new EmbeddingConfig(model: EmbeddingModelType::preset("fast"));
// This call overwrites $embeddings, so the loop below reports only the single "fast" embedding.
$embeddings = $kreuzberg->embed(["Hello world"], $config);
// Each embedding is a float array
foreach ($embeddings as $i => $vector) {
echo "Text $i: " . count($vector) . " dimensions\n";
}
```

View File

@@ -0,0 +1,237 @@
```php title="tables.php"
<?php
declare(strict_types=1);
/**
* Table Extraction and Processing
*
* Extract tables from documents and convert them to various formats.
* Demonstrates table processing, formatting, and export capabilities.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Result\ExtractedTable;
// Extract the document with table extraction enabled and summarise every table found.
$config = new ExtractionConfig(
extractTables: true
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');
echo "Table Extraction Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Tables found: " . count($result->tables) . "\n\n";
foreach ($result->tables as $tableIndex => $table) {
echo "Table " . ($tableIndex + 1) . ":\n";
echo str_repeat('-', 40) . "\n";
$rowCount = count($table->cells);
// The column count is taken from the first row only; ragged rows are not accounted for.
$colCount = !empty($table->cells) ? count($table->cells[0]) : 0;
echo " Dimensions: $rowCount rows × $colCount columns\n";
if (isset($table->pageNumber)) {
echo " Page: {$table->pageNumber}\n";
}
echo "\n";
echo " Markdown representation:\n";
echo str_repeat('-', 40) . "\n";
echo $table->markdown . "\n\n";
echo " Raw data preview:\n";
echo str_repeat('-', 40) . "\n";
// Preview at most the first three rows.
$previewRows = array_slice($table->cells, 0, 3);
foreach ($previewRows as $rowIndex => $row) {
echo " Row " . ($rowIndex + 1) . ": [" . implode(' | ', $row) . "]\n";
}
if ($rowCount > 3) {
echo " ... and " . ($rowCount - 3) . " more rows\n";
}
echo "\n";
}
// Write each table to ./exported_tables as table_N.csv, creating the directory on first use.
echo "Exporting Tables to CSV:\n";
echo str_repeat('=', 60) . "\n";
$outputDir = './exported_tables';
if (!is_dir($outputDir)) {
// NOTE(review): the mkdir() result is not checked; a failure only surfaces when fopen() fails below.
mkdir($outputDir, 0755, true);
}
foreach ($result->tables as $index => $table) {
$filename = sprintf('table_%d.csv', $index + 1);
$filepath = $outputDir . '/' . $filename;
$fp = fopen($filepath, 'w');
if ($fp !== false) {
foreach ($table->cells as $row) {
fputcsv($fp, $row);
}
fclose($fp);
echo "Saved: $filename\n";
} else {
echo "Error: Failed to create $filename\n";
}
}
echo "\n";
// Write each table to table_N.json together with its page, dimensions and markdown form.
echo "Exporting Tables to JSON:\n";
echo str_repeat('=', 60) . "\n";
foreach ($result->tables as $index => $table) {
$filename = sprintf('table_%d.json', $index + 1);
$filepath = $outputDir . '/' . $filename;
$tableData = [
'index' => $index + 1,
'page' => $table->pageNumber ?? null,
'dimensions' => [
'rows' => count($table->cells),
'columns' => !empty($table->cells) ? count($table->cells[0]) : 0,
],
'data' => $table->cells,
'markdown' => $table->markdown,
];
// NOTE(review): json_encode() and file_put_contents() results are not checked, so "Saved" is printed even on failure.
$json = json_encode($tableData, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
file_put_contents($filepath, $json);
echo "Saved: $filename\n";
}
echo "\n";
/**
 * Render an extracted table as an HTML <table> fragment.
 *
 * The first row is emitted with <th> cells, all later rows with <td> cells.
 * Every cell value is HTML-escaped.
 *
 * @param ExtractedTable $table Table whose $cells rows are rendered in order.
 * @return string HTML markup for the table (no surrounding document).
 */
function tableToHtml(ExtractedTable $table): string
{
    $html = "<table>\n";
    foreach ($table->cells as $rowIndex => $row) {
        $html .= " <tr>\n";
        $tag = $rowIndex === 0 ? 'th' : 'td';
        foreach ($row as $cell) {
            // ENT_SUBSTITUTE keeps cells with invalid UTF-8 (common in extracted text) from
            // collapsing to an empty string; the cast avoids a TypeError on non-string cells.
            $escapedCell = htmlspecialchars((string) $cell, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            $html .= " <$tag>$escapedCell</$tag>\n";
        }
        $html .= " </tr>\n";
    }
    $html .= "</table>";
    return $html;
}
// Write each table as a standalone HTML page (table_N.html) with minimal inline styling.
echo "Exporting Tables to HTML:\n";
echo str_repeat('=', 60) . "\n";
foreach ($result->tables as $index => $table) {
$filename = sprintf('table_%d.html', $index + 1);
$filepath = $outputDir . '/' . $filename;
$html = "<!DOCTYPE html>\n";
$html .= "<html>\n<head>\n";
$html .= " <meta charset=\"UTF-8\">\n";
$html .= " <title>Table " . ($index + 1) . "</title>\n";
$html .= " <style>\n";
$html .= " table { border-collapse: collapse; width: 100%; }\n";
$html .= " th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }\n";
$html .= " th { background-color: #f2f2f2; }\n";
$html .= " </style>\n";
$html .= "</head>\n<body>\n";
$html .= " <h1>Table " . ($index + 1) . "</h1>\n";
// The table markup itself comes from tableToHtml().
$html .= tableToHtml($table) . "\n";
$html .= "</body>\n</html>";
// NOTE(review): the write result is not checked before reporting success.
file_put_contents($filepath, $html);
echo "Saved: $filename\n";
}
echo "\n";
// Classify each table by how many of its cells are empty or numeric.
echo "Table Analysis:\n";
echo str_repeat('=', 60) . "\n";
foreach ($result->tables as $index => $table) {
    echo "Table " . ($index + 1) . " Analysis:\n";
    $cells = $table->cells;
    $totalCells = array_sum(array_map('count', $cells));
    $emptyCells = 0;
    $numericCells = 0;
    foreach ($cells as $row) {
        foreach ($row as $cell) {
            // Compare against '' explicitly: empty() would count a "0" cell as empty
            // even though it is also counted as numeric below.
            if (trim($cell) === '') {
                $emptyCells++;
            }
            if (is_numeric($cell)) {
                $numericCells++;
            }
        }
    }
    // max(..., 1) guards the percentages against division by zero for an empty table.
    echo " Total cells: $totalCells\n";
    echo " Empty cells: $emptyCells (" . number_format(($emptyCells / max($totalCells, 1)) * 100, 1) . "%)\n";
    echo " Numeric cells: $numericCells (" . number_format(($numericCells / max($totalCells, 1)) * 100, 1) . "%)\n";
    $numericRatio = $numericCells / max($totalCells, 1);
    // More than half numeric => data table; more than a fifth => mixed; otherwise text.
    $tableType = match(true) {
        $numericRatio > 0.5 => 'Data/Numeric Table',
        $numericRatio > 0.2 => 'Mixed Content Table',
        default => 'Text Table',
    };
    echo " Table type: $tableType\n\n";
}
/**
 * Convert a table into a list of records keyed by its header row.
 *
 * The first row supplies the keys; each later row becomes one record. Cells
 * missing from a short row are filled with ''. Duplicate header names
 * overwrite each other, the last column winning.
 *
 * @param ExtractedTable $table Table to convert; it is not modified.
 * @return array List of associative arrays, empty when the table has no rows.
 */
function tableToAssociativeArray(ExtractedTable $table): array
{
    // Work on a local copy: array_shift() on $table->cells would strip the header row
    // from the caller's object (and fails outright if the property is readonly).
    $rows = $table->cells;
    if (empty($rows)) {
        return [];
    }
    $headers = array_shift($rows);
    $data = [];
    foreach ($rows as $row) {
        $rowData = [];
        foreach ($headers as $index => $header) {
            $rowData[$header] = $row[$index] ?? '';
        }
        $data[] = $rowData;
    }
    return $data;
}
// Show the first table as header-keyed records, limited to the first three.
if (!empty($result->tables)) {
$firstTable = $result->tables[0];
$associativeData = tableToAssociativeArray($firstTable);
echo "First Table as Associative Array:\n";
echo str_repeat('=', 60) . "\n";
echo json_encode(array_slice($associativeData, 0, 3), JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE) . "\n";
if (count($associativeData) > 3) {
echo "... and " . (count($associativeData) - 3) . " more records\n";
}
}
```

View File

@@ -0,0 +1,169 @@
```php title="token_reduction.php"
<?php
declare(strict_types=1);
/**
* Token Reduction Configuration
*
* Configure token reduction to compress extracted content while preserving meaning.
* Useful for reducing token costs in LLM applications and staying within token limits.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\TokenReductionConfig;
// Extract one document with 'moderate' token reduction and report the outcome.
$config = new ExtractionConfig(
tokenReduction: new TokenReductionConfig(
mode: 'moderate',
preserveImportantWords: true
)
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');
echo "Token Reduction Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Content length: " . strlen($result->content) . " characters\n\n";
// Statistics are shown only when the metadata reports the original token count.
if (isset($result->metadata['original_token_count'])) {
$originalTokens = $result->metadata['original_token_count'];
// When no token count is reported, the byte length of the content is used in its place.
$reducedTokens = $result->metadata['token_count'] ?? strlen($result->content);
$reductionRatio = $result->metadata['token_reduction_ratio'] ?? 0;
echo "Token Reduction Statistics:\n";
echo str_repeat('-', 40) . "\n";
echo " Original tokens: " . number_format($originalTokens) . "\n";
echo " Reduced tokens: " . number_format($reducedTokens) . "\n";
echo " Reduction: " . number_format($reductionRatio * 100, 1) . "%\n";
echo " Tokens saved: " . number_format($originalTokens - $reducedTokens) . "\n\n";
}
// Reduction modes to compare, each with a description for the report.
$modes = [
'light' => 'Light reduction - minimal changes',
'moderate' => 'Moderate reduction - balanced',
'aggressive' => 'Aggressive reduction - maximum compression',
];
echo "Token Reduction Mode Comparison:\n";
echo str_repeat('=', 60) . "\n";
// Results keyed by mode; reused by the cost estimation section later in the script.
$comparisonResults = [];
foreach ($modes as $mode => $description) {
$modeConfig = new ExtractionConfig(
tokenReduction: new TokenReductionConfig(
mode: $mode,
preserveImportantWords: true
)
);
$kreuzberg = new Kreuzberg($modeConfig);
// NOTE(review): 'sample.pdf' is extracted without a file_exists() check, unlike the batch loops in sibling examples.
$result = $kreuzberg->extractFile('sample.pdf');
$contentLength = strlen($result->content);
// Falls back to the byte length when the metadata carries no token count.
$tokenCount = $result->metadata['token_count'] ?? $contentLength;
$comparisonResults[$mode] = [
'length' => $contentLength,
'tokens' => $tokenCount,
'content' => substr($result->content, 0, 100),
];
echo "$mode mode:\n";
echo " Description: $description\n";
echo " Content length: " . number_format($contentLength) . " characters\n";
echo " Estimated tokens: " . number_format($tokenCount) . "\n";
echo " Preview: " . substr($result->content, 0, 80) . "...\n\n";
}
// Compare the two extremes; skipped when the light result is empty to avoid dividing by zero.
if (count($comparisonResults) > 1) {
$lightLength = $comparisonResults['light']['length'] ?? 0;
$aggressiveLength = $comparisonResults['aggressive']['length'] ?? 0;
if ($lightLength > 0) {
$savings = (($lightLength - $aggressiveLength) / $lightLength) * 100;
echo "Comparison Summary:\n";
echo str_repeat('-', 40) . "\n";
echo "Aggressive vs Light mode saves: " . number_format($savings, 1) . "%\n\n";
}
}
// Same 'moderate' mode with every preservation option and stop-word removal switched on.
$advancedConfig = new ExtractionConfig(
tokenReduction: new TokenReductionConfig(
mode: 'moderate',
preserveImportantWords: true,
preserveMarkdown: true,
preserveNumbers: true,
removeStopWords: true
)
);
$kreuzberg = new Kreuzberg($advancedConfig);
$result = $kreuzberg->extractFile('verbose_document.pdf');
echo "Advanced Token Reduction:\n";
echo str_repeat('=', 60) . "\n";
// The configuration summary below is hard-coded text mirroring $advancedConfig, not read back from it.
echo "Configuration:\n";
echo " - Preserve important words: Yes\n";
echo " - Preserve markdown: Yes\n";
echo " - Preserve numbers: Yes\n";
echo " - Remove stop words: Yes\n\n";
echo "Result:\n";
echo " Content length: " . strlen($result->content) . " characters\n";
if (isset($result->metadata['token_reduction_ratio'])) {
echo " Reduction ratio: " . number_format($result->metadata['token_reduction_ratio'] * 100, 1) . "%\n";
}
echo "\n";
/**
 * Estimate the cost of a number of tokens at a per-million-token price.
 *
 * @param int   $tokens          Number of tokens.
 * @param float $pricePerMillion Price for one million tokens (default 0.50).
 * @return float Estimated cost in the same currency unit as the price.
 */
function estimateTokenCost(int $tokens, float $pricePerMillion = 0.50): float
{
    $millionsOfTokens = $tokens / 1_000_000;

    return $millionsOfTokens * $pricePerMillion;
}
// Estimate the cost of each mode's output from the comparison run, at the default price.
echo "Cost Estimation (based on reduction):\n";
echo str_repeat('=', 60) . "\n";
foreach ($comparisonResults as $mode => $data) {
$tokens = $data['tokens'];
$cost = estimateTokenCost($tokens);
echo ucfirst($mode) . " mode:\n";
echo " Tokens: " . number_format($tokens) . "\n";
echo " Estimated cost: $" . number_format($cost, 4) . "\n\n";
}
/**
 * Recommend a token reduction mode from how far a document exceeds its limit.
 *
 * @param int $maxTokens       Token limit to fit into; must be positive.
 * @param int $estimatedTokens Estimated token count of the document.
 * @return string 'none' when the document fits, 'light' up to 1.3x the limit,
 *                'moderate' up to 1.7x, otherwise 'aggressive'.
 * @throws \InvalidArgumentException When $maxTokens is zero or negative.
 */
function chooseReductionMode(int $maxTokens, int $estimatedTokens): string
{
    // A zero limit would divide by zero, and a negative one would produce a negative
    // ratio that is misreported as 'none'; reject both up front.
    if ($maxTokens <= 0) {
        throw new \InvalidArgumentException(
            'Token limit must be a positive integer, got ' . $maxTokens
        );
    }
    $ratio = $estimatedTokens / $maxTokens;
    return match (true) {
        $ratio <= 1.0 => 'none',
        $ratio <= 1.3 => 'light',
        $ratio <= 1.7 => 'moderate',
        default => 'aggressive',
    };
}
// Worked example: a 12,000-token document against an 8,000-token limit.
$maxTokenLimit = 8000;
$documentTokens = 12000;
$recommendedMode = chooseReductionMode($maxTokenLimit, $documentTokens);
echo "Reduction Mode Recommendation:\n";
echo str_repeat('=', 60) . "\n";
echo "Document tokens: " . number_format($documentTokens) . "\n";
echo "Token limit: " . number_format($maxTokenLimit) . "\n";
echo "Recommended mode: $recommendedMode\n";
// Explain the recommendation by stating how far the document is over the limit, if at all.
echo "Reason: " . ($documentTokens > $maxTokenLimit
? "Document exceeds limit by " . number_format($documentTokens - $maxTokenLimit) . " tokens"
: "Document within limits") . "\n";
```

View File

@@ -0,0 +1,222 @@
```php title="token_reduction_example.php"
<?php
declare(strict_types=1);
/**
* Token Reduction Example
*
* Practical example of using token reduction to fit documents within token limits.
* Demonstrates tracking reduction statistics and optimizing for LLM usage.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\TokenReductionConfig;
// Extract a verbose document with 'moderate' reduction while preserving markdown.
$config = new ExtractionConfig(
tokenReduction: new TokenReductionConfig(
mode: 'moderate',
preserveMarkdown: true
)
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('verbose_document.pdf');
echo "Token Reduction Example:\n";
echo str_repeat('=', 60) . "\n";
echo "Document: verbose_document.pdf\n\n";
// Print before/after statistics only when all three figures are present: a missing
// 'token_count' or 'token_reduction_ratio' would otherwise reach number_format() and
// the arithmetic below as null and fail under strict types.
if (isset(
    $result->metadata['original_token_count'],
    $result->metadata['token_count'],
    $result->metadata['token_reduction_ratio']
)) {
    $originalTokens = $result->metadata['original_token_count'];
    $reducedTokens = $result->metadata['token_count'];
    $reductionRatio = $result->metadata['token_reduction_ratio'];
    echo "Token Reduction Statistics:\n";
    echo str_repeat('-', 40) . "\n";
    echo sprintf(" Before: %s tokens\n", number_format($originalTokens));
    echo sprintf(" After: %s tokens\n", number_format($reducedTokens));
    echo sprintf(" Reduction: %.1f%%\n", $reductionRatio * 100);
    echo sprintf(" Saved: %s tokens\n\n", number_format($originalTokens - $reducedTokens));
    // One block character per 100 tokens, rounded down.
    $beforeBar = str_repeat('█', (int)($originalTokens / 100));
    $afterBar = str_repeat('█', (int)($reducedTokens / 100));
    echo "Visual comparison (each █ = ~100 tokens):\n";
    echo " Before: $beforeBar\n";
    echo " After: $afterBar\n\n";
}
// Show the size and the first 200 bytes of the reduced content.
echo "Content Analysis:\n";
echo str_repeat('-', 40) . "\n";
echo " Content length: " . strlen($result->content) . " characters\n";
echo " First 200 chars: " . substr($result->content, 0, 200) . "...\n\n";
// Documents for the batch run; missing files are reported and skipped.
$documents = [
'long_article.pdf',
'research_paper.pdf',
'technical_doc.pdf',
];
echo "Batch Token Reduction:\n";
echo str_repeat('=', 60) . "\n";
$batchConfig = new ExtractionConfig(
tokenReduction: new TokenReductionConfig(
mode: 'moderate',
preserveImportantWords: true,
preserveMarkdown: true
)
);
$kreuzberg = new Kreuzberg($batchConfig);
// Batch-wide totals; reused by the cost analysis section later in the script.
$totalOriginal = 0;
$totalReduced = 0;
foreach ($documents as $document) {
if (!file_exists($document)) {
echo basename($document) . ": File not found\n\n";
continue;
}
$result = $kreuzberg->extractFile($document);
// Missing metadata figures count as 0 so the totals stay numeric.
$originalTokens = $result->metadata['original_token_count'] ?? 0;
$reducedTokens = $result->metadata['token_count'] ?? 0;
$reductionRatio = $result->metadata['token_reduction_ratio'] ?? 0;
$totalOriginal += $originalTokens;
$totalReduced += $reducedTokens;
echo basename($document) . ":\n";
echo sprintf(" Original: %s tokens\n", number_format($originalTokens));
echo sprintf(" Reduced: %s tokens\n", number_format($reducedTokens));
echo sprintf(" Saved: %.1f%%\n\n", $reductionRatio * 100);
}
// The overall percentage is computed from the totals; skipped when nothing was counted.
if ($totalOriginal > 0) {
$overallReduction = (($totalOriginal - $totalReduced) / $totalOriginal) * 100;
echo "Overall Statistics:\n";
echo str_repeat('-', 40) . "\n";
echo sprintf(" Total original: %s tokens\n", number_format($totalOriginal));
echo sprintf(" Total reduced: %s tokens\n", number_format($totalReduced));
echo sprintf(" Overall saving: %.1f%%\n\n", $overallReduction);
}
/**
 * Find the lightest reduction mode that brings a document within a token limit.
 *
 * Modes are tried in the order light, moderate, aggressive, and the first one
 * whose token count is within the limit is returned. When none fits, the
 * 'aggressive' attempt is returned with 'fits' set to false.
 *
 * The token count comes from the 'token_count' metadata entry, falling back to
 * the byte length of the content when that entry is absent.
 *
 * @param string    $filePath  Document to extract.
 * @param int       $maxTokens Token limit to fit into.
 * @param Kreuzberg $kreuzberg Not used: a fresh extractor is built per mode.
 *                             Kept so existing callers keep working.
 * @return array|null Array with keys 'mode', 'tokens', 'result' and 'fits'.
 */
function fitWithinTokenLimit(
    string $filePath,
    int $maxTokens,
    Kreuzberg $kreuzberg
): ?array {
    $attempt = null;
    foreach (['light', 'moderate', 'aggressive'] as $mode) {
        $config = new ExtractionConfig(
            tokenReduction: new TokenReductionConfig(
                mode: $mode,
                preserveImportantWords: true
            )
        );
        $kreuzbergWithMode = new Kreuzberg($config);
        $result = $kreuzbergWithMode->extractFile($filePath);
        $tokens = $result->metadata['token_count'] ?? strlen($result->content);
        $attempt = [
            'mode' => $mode,
            'tokens' => $tokens,
            'result' => $result,
            'fits' => $tokens <= $maxTokens,
        ];
        if ($attempt['fits']) {
            return $attempt;
        }
    }
    // Nothing fit. The last attempt is already the 'aggressive' extraction, so it is
    // returned directly instead of extracting the file a fourth time.
    return $attempt;
}
// Try to fit one large document into an 8,000-token limit; skipped when the file is absent.
echo "Fitting Document to Token Limit:\n";
echo str_repeat('=', 60) . "\n";
$tokenLimit = 8000;
$testFile = 'large_document.pdf';
if (file_exists($testFile)) {
$fitResult = fitWithinTokenLimit($testFile, $tokenLimit, $kreuzberg);
echo "Target limit: " . number_format($tokenLimit) . " tokens\n";
echo "Reduction mode used: {$fitResult['mode']}\n";
echo "Final token count: " . number_format($fitResult['tokens']) . "\n";
// Report either the remaining headroom or how far over the limit the document still is.
if ($fitResult['fits']) {
echo "Status: ✓ Successfully fits within limit\n";
$remaining = $tokenLimit - $fitResult['tokens'];
echo "Tokens remaining: " . number_format($remaining) . "\n";
} else {
echo "Status: ✗ Still exceeds limit\n";
$excess = $fitResult['tokens'] - $tokenLimit;
echo "Tokens over limit: " . number_format($excess) . "\n";
echo "Suggestion: Consider chunking the document\n";
}
echo "\n";
}
/**
 * Compare the cost of the original and reduced token counts.
 *
 * @param int   $originalTokens  Token count before reduction.
 * @param int   $reducedTokens   Token count after reduction.
 * @param float $pricePerMillion Price for one million tokens (default 0.50).
 * @return array Keys 'original_cost', 'reduced_cost', 'savings' (absolute) and
 *               'savings_percent' (0.0 when the original cost is not positive).
 */
function calculateCostSavings(
    int $originalTokens,
    int $reducedTokens,
    float $pricePerMillion = 0.50
): array {
    $originalCost = ($originalTokens / 1_000_000) * $pricePerMillion;
    $reducedCost = ($reducedTokens / 1_000_000) * $pricePerMillion;
    $savings = $originalCost - $reducedCost;
    // Guard the division explicitly. Clamping the divisor to a small epsilon distorts
    // the percentage for genuinely small costs (a single token costs less than 0.000001).
    $savingsPercent = $originalCost > 0.0
        ? ($savings / $originalCost) * 100
        : 0.0;
    return [
        'original_cost' => $originalCost,
        'reduced_cost' => $reducedCost,
        'savings' => $savings,
        'savings_percent' => $savingsPercent,
    ];
}
// Cost analysis over the batch totals; skipped when either total is zero.
if ($totalOriginal > 0 && $totalReduced > 0) {
$savings = calculateCostSavings($totalOriginal, $totalReduced);
echo "Cost Analysis:\n";
echo str_repeat('=', 60) . "\n";
echo "Price: $0.50 per million tokens (example)\n\n";
echo sprintf(" Original cost: $%.6f\n", $savings['original_cost']);
echo sprintf(" Reduced cost: $%.6f\n", $savings['reduced_cost']);
echo sprintf(" Savings: $%.6f (%.1f%%)\n\n", $savings['savings'], $savings['savings_percent']);
$documentsPerDay = 100;
$daysPerYear = 365;
// NOTE(review): $savings covers the whole batch processed above, so multiplying by
// $documentsPerDay treats the batch total as a per-document saving — confirm this is intended.
$annualSavings = $savings['savings'] * $documentsPerDay * $daysPerYear;
echo "Projected Annual Savings:\n";
echo " Documents per day: $documentsPerDay\n";
echo " Annual savings: $" . number_format($annualSavings, 2) . "\n";
}
```

View File

@@ -0,0 +1,284 @@
```php title="vector_database_integration.php"
<?php
declare(strict_types=1);
/**
* Vector Database Integration
*
* Extract documents with chunking and embeddings for vector database storage.
* Demonstrates preparing data for semantic search and RAG applications.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\ChunkingConfig;
use Kreuzberg\Config\EmbeddingConfig;
// Chunk the document (512 chars max, 50 overlap) and request normalized 'balanced' embeddings.
$config = new ExtractionConfig(
chunking: new ChunkingConfig(
maxChars: 512,
maxOverlap: 50,
embedding: new EmbeddingConfig(
model: 'balanced',
normalize: true
)
)
);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');
echo "Vector Database Integration:\n";
echo str_repeat('=', 60) . "\n";
echo "Document: document.pdf\n";
echo "Total chunks: " . count($result->chunks ?? []) . "\n\n";
// Build one record per chunk that has an embedding; chunks without one are skipped.
$vectorRecords = [];
foreach ($result->chunks ?? [] as $index => $chunk) {
if ($chunk->embedding === null) {
continue;
}
// The ID combines a hash of the file name with the chunk index.
$chunkId = sprintf(
'doc_%s_chunk_%d',
md5('document.pdf'),
$index
);
$vectorRecords[] = [
'id' => $chunkId,
'content' => $chunk->content,
'embedding' => $chunk->embedding,
'metadata' => [
'source_file' => 'document.pdf',
'chunk_index' => $index,
'chunk_length' => strlen($chunk->content),
'embedding_model' => 'balanced',
'created_at' => date('c'),
],
];
}
echo "Prepared " . count($vectorRecords) . " records for vector database\n\n";
// Show the shape of the first record as a sample.
if (!empty($vectorRecords)) {
echo "Sample Vector Record Structure:\n";
echo str_repeat('-', 40) . "\n";
$sample = $vectorRecords[0];
echo "ID: {$sample['id']}\n";
echo "Content preview: " . substr($sample['content'], 0, 100) . "...\n";
echo "Embedding dimensions: " . count($sample['embedding']) . "\n";
echo "Metadata keys: " . implode(', ', array_keys($sample['metadata'])) . "\n\n";
}
/**
 * Simulate upserting vector records into Pinecone in batches of 100.
 *
 * Nothing is sent anywhere: the function only prints one progress line per
 * batch and a completion summary.
 *
 * @param array  $records   Vector records to "insert".
 * @param string $namespace Namespace named in the progress output.
 */
function insertIntoPinecone(array $records, string $namespace = 'default'): void
{
    echo "Inserting into Pinecone:\n", str_repeat('-', 40), "\n";

    $batchNumber = 0;
    foreach (array_chunk($records, 100) as $batch) {
        $batchNumber++;
        printf(
            "Batch %d: Upserting %d vectors to namespace '%s'...\n",
            $batchNumber,
            count($batch),
            $namespace
        );
    }

    $total = count($records);
    echo "Completed inserting " . $total . " vectors\n\n";
}
/**
 * Simulate inserting vector records into Weaviate one object at a time.
 *
 * Each record is shaped into a Weaviate-style object, but nothing is sent:
 * the function only prints a progress line after every tenth record and a
 * completion summary.
 *
 * @param array  $records   Vector records with 'content', 'embedding' and 'metadata'.
 * @param string $className Class name placed on each object.
 */
function insertIntoWeaviate(array $records, string $className = 'Document'): void
{
    echo "Inserting into Weaviate:\n";
    echo str_repeat('-', 40) . "\n";

    foreach ($records as $index => $record) {
        // Payload a real client would send; built here for illustration only.
        $properties = [
            'content' => $record['content'],
            'sourceFile' => $record['metadata']['source_file'],
            'chunkIndex' => $record['metadata']['chunk_index'],
            'createdAt' => $record['metadata']['created_at'],
        ];
        $object = [
            'class' => $className,
            'properties' => $properties,
            'vector' => $record['embedding'],
        ];

        $position = $index + 1;
        if ($position % 10 !== 0) {
            continue;
        }
        printf("Inserted %d/%d objects\n", $position, count($records));
    }

    echo "Completed inserting " . count($records) . " objects\n\n";
}
/**
 * Simulate upserting vector records into a Qdrant collection.
 *
 * Records are reshaped into Qdrant-style points, but nothing is sent: the
 * function only prints how many points would be upserted.
 *
 * @param array  $records        Vector records with 'id', 'embedding', 'content' and 'metadata'.
 * @param string $collectionName Collection named in the output.
 */
function insertIntoQdrant(
    array $records,
    string $collectionName = 'documents'
): void {
    echo "Inserting into Qdrant:\n";
    echo str_repeat('-', 40) . "\n";

    $points = array_map(
        static fn ($record) => [
            'id' => $record['id'],
            'vector' => $record['embedding'],
            'payload' => [
                'content' => $record['content'],
                'metadata' => $record['metadata'],
            ],
        ],
        $records
    );

    printf(
        "Upserting %d points to collection '%s'...\n",
        count($points),
        $collectionName
    );
    echo "Completed\n\n";
}
// Run the three simulated inserts against the records prepared from document.pdf.
echo "Vector Database Integration Examples:\n";
echo str_repeat('=', 60) . "\n\n";
insertIntoPinecone($vectorRecords, 'documents');
insertIntoWeaviate($vectorRecords, 'DocumentChunk');
insertIntoQdrant($vectorRecords, 'document_chunks');
// Documents for the batch run; missing files are reported and skipped.
$documents = [
'doc1.pdf',
'doc2.pdf',
'doc3.pdf',
];
echo "Batch Processing for Vector Database:\n";
echo str_repeat('=', 60) . "\n";
// Records from all documents are accumulated into one flat list.
$allVectorRecords = [];
$vectorConfig = new ExtractionConfig(
chunking: new ChunkingConfig(
maxChars: 512,
maxOverlap: 50,
embedding: new EmbeddingConfig(
model: 'balanced',
normalize: true
)
)
);
$kreuzberg = new Kreuzberg($vectorConfig);
foreach ($documents as $document) {
if (!file_exists($document)) {
echo basename($document) . ": File not found\n";
continue;
}
$result = $kreuzberg->extractFile($document);
echo basename($document) . ":\n";
echo " Chunks: " . count($result->chunks ?? []) . "\n";
foreach ($result->chunks ?? [] as $index => $chunk) {
if ($chunk->embedding === null) {
continue;
}
// The ID hashes the path as given in $documents, while 'source_file' below stores only the base name.
$chunkId = sprintf(
'doc_%s_chunk_%d',
md5($document),
$index
);
$allVectorRecords[] = [
'id' => $chunkId,
'content' => $chunk->content,
'embedding' => $chunk->embedding,
'metadata' => [
'source_file' => basename($document),
'chunk_index' => $index,
'chunk_length' => strlen($chunk->content),
'embedding_model' => 'balanced',
'created_at' => date('c'),
],
];
}
}
echo "\nTotal records prepared: " . count($allVectorRecords) . "\n\n";
/**
 * Print a mock semantic search over the prepared vector records.
 *
 * No similarity is computed: the first $topK records are returned as the
 * "results" and given descending placeholder scores (0.9, 0.85, ...).
 *
 * @param string $query   Query text, echoed in the output only.
 * @param array  $records Vector records with 'content' and 'metadata'.
 * @param int    $topK    Number of leading records to report.
 * @return array The first $topK records.
 */
function simulateSemanticSearch(string $query, array $records, int $topK = 5): array
{
    echo "Simulating semantic search:\n";
    echo ' Query: "' . $query . "\"\n";
    echo " Searching " . count($records) . " vectors...\n";
    echo " Top " . $topK . " results:\n\n";

    $hits = array_slice($records, 0, $topK);
    foreach ($hits as $position => $hit) {
        $rank = $position + 1;
        $preview = substr($hit['content'], 0, 60) . '...';
        $mockScore = 0.9 - ($position * 0.05);
        printf(" %d. %s (score: %.3f)\n", $rank, $preview, $mockScore);
        printf(" Source: %s\n", $hit['metadata']['source_file']);
        echo "\n";
    }

    return $hits;
}
// Run the mock search only when the batch produced at least one record.
if (!empty($allVectorRecords)) {
echo "Semantic Search Example:\n";
echo str_repeat('=', 60) . "\n";
// The returned records are not used here; the function prints its own report.
simulateSemanticSearch(
"How to configure document extraction?",
$allVectorRecords,
3
);
}
/**
 * Write vector records to a JSON file wrapped in a small envelope.
 *
 * The envelope holds a format version, the record count, the generation
 * timestamp (ISO 8601) and the records themselves.
 *
 * @param array  $records  Vector records to export.
 * @param string $filename Destination file path; overwritten if it exists.
 * @throws \JsonException    When the records cannot be encoded (for example NaN or
 *                           INF in an embedding, or invalid UTF-8 in content).
 * @throws \RuntimeException When the file cannot be written.
 */
function exportVectorRecordsToJson(array $records, string $filename): void
{
    $data = [
        'version' => '1.0',
        'count' => count($records),
        'generated_at' => date('c'),
        'records' => $records,
    ];
    // Fail loudly: without JSON_THROW_ON_ERROR an encoding failure yields false,
    // which would be written as an empty file and still reported as a success.
    $json = json_encode(
        $data,
        JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR
    );
    if (file_put_contents($filename, $json) === false) {
        throw new \RuntimeException("Failed to write vector records to: $filename");
    }
    echo "Exported " . count($records) . " vector records to: $filename\n";
}
// Export the batch records to vector_records.json only when there is something to write.
if (!empty($allVectorRecords)) {
exportVectorRecordsToJson($allVectorRecords, 'vector_records.json');
}
```