This commit is contained in:
48
docs/snippets/php/utils/chunking.php
Normal file
48
docs/snippets/php/utils/chunking.php
Normal file
@@ -0,0 +1,48 @@
|
||||
```php title="chunking.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Text Chunking Configuration
|
||||
*
|
||||
* Configure document chunking for processing long texts into manageable pieces.
|
||||
* Useful for RAG systems, embedding generation, and token limit management.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
|
||||
// Enable chunking (maxChars 1500, maxOverlap 200) and request an embedding
// for every chunk using the 'balanced' model preset.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChars: 1500,
        maxOverlap: 200,
        embedding: new EmbeddingConfig(
            model: 'balanced',
        ),
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

// Normalise a null chunk list once so the rest of the script can count and iterate freely.
$chunks = $result->chunks ?? [];

echo "Chunking Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Total chunks created: " . count($chunks) . "\n\n";

foreach ($chunks as $index => $chunk) {
    echo "Chunk " . ($index + 1) . ":\n";
    // mb_* functions count and cut by character. strlen()/substr() work on bytes,
    // which misreports the length and can split a multibyte character in the preview.
    echo " Length: " . mb_strlen($chunk->content) . " characters\n";
    echo " Preview: " . mb_substr($chunk->content, 0, 100) . "...\n";

    // Embeddings are optional per chunk; only report them when present.
    if ($chunk->embedding !== null) {
        echo " Embedding dimensions: " . count($chunk->embedding) . "\n";
    }

    echo "\n";
}
|
||||
```
|
||||
80
docs/snippets/php/utils/chunking_rag.php
Normal file
80
docs/snippets/php/utils/chunking_rag.php
Normal file
@@ -0,0 +1,80 @@
|
||||
```php title="chunking_rag.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Chunking for RAG (Retrieval-Augmented Generation)
|
||||
*
|
||||
* Advanced chunking configuration optimized for RAG systems with embeddings.
|
||||
* Demonstrates how to process documents into chunks with embeddings for
|
||||
* vector database storage and semantic search.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
|
||||
// The source document is named once; its hash is reused for every record id below.
$sourceFile = 'research_paper.pdf';
$documentId = md5($sourceFile);

// Small chunks (maxChars 500, maxOverlap 50) with normalized embeddings computed in batches of 16.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChars: 500,
        maxOverlap: 50,
        embedding: new EmbeddingConfig(
            model: 'balanced',
            normalize: true,
            batchSize: 16,
        ),
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile($sourceFile);

echo "RAG Chunking Results:\n";
echo str_repeat('=', 60) . "\n";

// Keep only chunks that actually received an embedding; the rest cannot be indexed.
$chunksWithEmbeddings = [];
foreach ($result->chunks ?? [] as $chunk) {
    if ($chunk->embedding === null) {
        continue;
    }

    $chunksWithEmbeddings[] = [
        // mb_substr() cuts on character boundaries; substr() could split a multibyte character.
        'content' => mb_substr($chunk->content, 0, 100) . '...',
        'embedding_dims' => count($chunk->embedding),
        'full_content' => $chunk->content,
        'embedding' => $chunk->embedding,
    ];
}

echo "Chunks with embeddings: " . count($chunksWithEmbeddings) . "\n\n";

echo "Sample chunks for vector database:\n";
echo str_repeat('=', 60) . "\n";

// Show at most the first three chunks as a sample.
foreach (array_slice($chunksWithEmbeddings, 0, 3) as $index => $chunk) {
    echo "Chunk " . ($index + 1) . ":\n";
    echo " Content preview: {$chunk['content']}\n";
    echo " Embedding dimensions: {$chunk['embedding_dims']}\n";
    echo " Ready for vector DB: Yes\n\n";
}

// Build one record per embedded chunk: id, content, embedding and metadata.
$vectorDbRecords = [];
foreach ($chunksWithEmbeddings as $idx => $chunk) {
    $vectorDbRecords[] = [
        'id' => sprintf('doc_%s_chunk_%d', $documentId, $idx),
        'content' => $chunk['full_content'],
        'embedding' => $chunk['embedding'],
        'metadata' => [
            'source' => $sourceFile,
            'chunk_index' => $idx,
            // Character count, not byte count.
            'char_count' => mb_strlen($chunk['full_content']),
        ],
    ];
}

echo "Prepared " . count($vectorDbRecords) . " records for vector database\n";
echo "Each record contains: id, content, embedding, and metadata\n";
|
||||
```
|
||||
81
docs/snippets/php/utils/embedding_with_chunking.php
Normal file
81
docs/snippets/php/utils/embedding_with_chunking.php
Normal file
@@ -0,0 +1,81 @@
|
||||
```php title="embedding_with_chunking.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Embedding Generation with Chunking
|
||||
*
|
||||
* Configure chunking with automatic embedding generation for each chunk.
|
||||
* Ideal for semantic search, similarity matching, and vector databases.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
|
||||
// Chunking with per-chunk embeddings: normalized vectors, batches of 32,
// and no model download progress output.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChars: 1024,
        maxOverlap: 100,
        embedding: new EmbeddingConfig(
            model: 'balanced',
            normalize: true,
            batchSize: 32,
            showDownloadProgress: false,
        ),
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

// Normalise a null chunk list once; every statistic below is derived from this array.
$chunks = $result->chunks ?? [];
$totalChunks = count($chunks);

echo "Embedding Generation Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Total chunks: " . $totalChunks . "\n\n";

$chunksWithEmbeddings = 0;
// Dimension count of the last embedded chunk seen (not a running total).
$embeddingDimensions = 0;

foreach ($chunks as $chunk) {
    if ($chunk->embedding !== null) {
        $chunksWithEmbeddings++;
        $embeddingDimensions = count($chunk->embedding);
    }
}

echo "Chunks with embeddings: $chunksWithEmbeddings\n";
echo "Embedding dimensions: $embeddingDimensions\n";

// $chunksWithEmbeddings > 0 implies $totalChunks > 0, so the division is safe.
$coverage = $chunksWithEmbeddings > 0
    ? sprintf("%.1f%%", ($chunksWithEmbeddings / $totalChunks) * 100)
    : "0%";
echo "Coverage: " . $coverage . "\n\n";

if (!empty($chunks) && $chunks[0]->embedding !== null) {
    $sampleChunk = $chunks[0];

    echo "Sample Chunk:\n";
    echo str_repeat('=', 60) . "\n";
    // mb_* functions operate on characters; strlen()/substr() operate on bytes
    // and could split a multibyte character in the preview.
    echo "Content preview: " . mb_substr($sampleChunk->content, 0, 150) . "...\n";
    echo "Content length: " . mb_strlen($sampleChunk->content) . " chars\n";
    echo "Embedding dimensions: " . count($sampleChunk->embedding) . "\n";
    echo "First 5 embedding values: [";
    echo implode(', ', array_map(
        static fn ($value) => number_format($value, 4),
        array_slice($sampleChunk->embedding, 0, 5),
    ));
    echo ", ...]\n\n";
}

if (!empty($chunks)) {
    $totalChars = array_sum(array_map(
        static fn ($chunk) => mb_strlen($chunk->content),
        $chunks,
    ));
    $avgChunkSize = $totalChars / $totalChunks;

    echo "Average chunk size: " . round($avgChunkSize) . " characters\n";
}
|
||||
```
|
||||
114
docs/snippets/php/utils/error_handling.php
Normal file
114
docs/snippets/php/utils/error_handling.php
Normal file
@@ -0,0 +1,114 @@
|
||||
```php title="error_handling.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Comprehensive Error Handling
|
||||
*
|
||||
* Demonstrate proper error handling for document extraction operations.
|
||||
* Shows how to catch and handle different types of Kreuzberg exceptions.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Exceptions\KreuzbergException;
|
||||
use Kreuzberg\Exceptions\ParsingException;
|
||||
use Kreuzberg\Exceptions\OcrException;
|
||||
use Kreuzberg\Exceptions\ValidationException;
|
||||
|
||||
$kreuzberg = new Kreuzberg();

// Basic pattern: catch the specific exception types first, then fall back to
// KreuzbergException (presumably the library's base exception — confirm).
try {
    $result = $kreuzberg->extractFile('document.pdf');
    // mb_strlen() counts characters; strlen() would report bytes.
    echo "Extracted " . mb_strlen($result->content) . " characters\n";
} catch (ParsingException $e) {
    echo "Failed to parse document: " . $e->getMessage() . "\n";
    echo "Error code: " . $e->getCode() . "\n";
} catch (OcrException $e) {
    echo "OCR processing failed: " . $e->getMessage() . "\n";
    echo "Suggestion: Check if document is scanned and OCR is properly configured\n";
} catch (KreuzbergException $e) {
    echo "Extraction error: " . $e->getMessage() . "\n";
    // Surface the underlying cause when the exception wraps another one.
    if ($e->getPrevious() !== null) {
        echo "Caused by: " . $e->getPrevious()->getMessage() . "\n";
    }
}
|
||||
|
||||
// Extract from an in-memory byte string instead of a file path.
try {
    $config = new ExtractionConfig();
    $pdfBytes = file_get_contents('sample.pdf');

    // file_get_contents() signals failure by returning false; convert that into
    // an exception so the RuntimeException handler below reports it.
    if ($pdfBytes === false) {
        throw new \RuntimeException('Failed to read file');
    }

    $result = $kreuzberg->extractBytes($pdfBytes, 'application/pdf', $config);
    // mb_substr() cuts on character boundaries; substr() could split a multibyte character.
    echo "Extracted from bytes: " . mb_substr($result->content, 0, 100) . "...\n";
} catch (ValidationException $e) {
    echo "Invalid configuration or input: " . $e->getMessage() . "\n";
    echo "Details: " . $e->getFile() . " at line " . $e->getLine() . "\n";
} catch (OcrException $e) {
    echo "OCR failed: " . $e->getMessage() . "\n";
} catch (KreuzbergException $e) {
    echo "Extraction failed: " . $e->getMessage() . "\n";
} catch (\RuntimeException $e) {
    echo "File system error: " . $e->getMessage() . "\n";
}
|
||||
|
||||
// Batch processing: a failure on one file is recorded and the loop moves on.
$files = ['doc1.pdf', 'corrupted.pdf', 'doc3.docx'];
$extracted = [];
$failures = [];

foreach ($files as $path) {
    try {
        $result = $kreuzberg->extractFile($path);
    } catch (KreuzbergException $problem) {
        // Remember what went wrong and which exception class reported it.
        $failures[$path] = [
            'error' => $problem->getMessage(),
            'type' => $problem::class,
        ];
        echo "Failed: {$path} - {$problem->getMessage()}\n";
        continue;
    }

    $extracted[$path] = $result;
    echo "Success: {$path}\n";
}

printf(
    "\nResults:\nSuccessful: %d\nFailed: %d\n",
    count($extracted),
    count($failures),
);
|
||||
|
||||
/**
 * Extract a file, retrying only when OCR fails.
 *
 * An OcrException leads to another attempt after a one-second pause, up to
 * $maxRetries attempts in total. Any other KreuzbergException ends the call
 * immediately. Every failure is reported on stdout.
 *
 * @param Kreuzberg $kreuzberg  Extractor to use
 * @param string    $file       Path of the document to extract
 * @param int       $maxRetries Maximum number of attempts
 * @return \Kreuzberg\Result\ExtractionResult|null Result, or null when extraction did not succeed
 */
function extractWithRetry(
    Kreuzberg $kreuzberg,
    string $file,
    int $maxRetries = 3
): ?\Kreuzberg\Result\ExtractionResult {
    for ($tries = 1; $tries <= $maxRetries; $tries++) {
        try {
            return $kreuzberg->extractFile($file);
        } catch (OcrException $ocrError) {
            // Out of attempts: report and give up.
            if ($tries === $maxRetries) {
                echo "OCR failed after $maxRetries attempts: " . $ocrError->getMessage() . "\n";

                return null;
            }

            echo "OCR attempt $tries failed, retrying...\n";
            sleep(1);
        } catch (KreuzbergException $fatal) {
            // Non-OCR failures are not retried.
            echo "Fatal error (no retry): " . $fatal->getMessage() . "\n";

            return null;
        }
    }

    // Reached only when $maxRetries is less than 1.
    return null;
}
|
||||
|
||||
// Retry demo: null means every attempt failed (details were already printed).
$result = extractWithRetry($kreuzberg, 'difficult_scan.pdf');
if ($result !== null) {
    // mb_strlen() counts characters; strlen() would report bytes.
    echo "Successfully extracted with retry: " . mb_strlen($result->content) . " chars\n";
}
|
||||
```
|
||||
160
docs/snippets/php/utils/error_handling_extract.php
Normal file
160
docs/snippets/php/utils/error_handling_extract.php
Normal file
@@ -0,0 +1,160 @@
|
||||
```php title="error_handling_extract.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Error Handling for HTTP/API Extraction
|
||||
*
|
||||
* Demonstrate error handling when using Kreuzberg extraction via HTTP API.
|
||||
* Shows how to properly handle HTTP errors and API response errors.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
use GuzzleHttp\Exception\ClientException;
|
||||
use GuzzleHttp\Exception\ServerException;
|
||||
|
||||
/**
 * Extract a document via the HTTP API with error handling.
 *
 * Uploads the file as a multipart field named "files", decodes the JSON
 * response body, and reports every failure on stdout instead of throwing.
 *
 * @param string $filePath Path to the document file
 * @param string $apiUrl API endpoint URL
 * @return array|null Decoded extraction results, or null on any error
 */
function extractViaApi(string $filePath, string $apiUrl = 'http://localhost:8000/extract'): ?array
{
    $client = new Client([
        'timeout' => 30.0,
        'connect_timeout' => 5.0,
    ]);

    $handle = null;

    try {
        // is_file() also rejects directories, which file_exists() would accept.
        if (!is_file($filePath)) {
            throw new \RuntimeException("File not found: $filePath");
        }

        // fopen() returns false on failure; never hand a boolean to the HTTP client.
        $handle = fopen($filePath, 'rb');
        if ($handle === false) {
            throw new \RuntimeException("Unable to open file: $filePath");
        }

        $response = $client->post($apiUrl, [
            'multipart' => [
                [
                    'name' => 'files',
                    'contents' => $handle,
                    'filename' => basename($filePath),
                ],
            ],
        ]);

        $results = json_decode($response->getBody()->getContents(), true);

        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \RuntimeException('Invalid JSON response: ' . json_last_error_msg());
        }

        // Valid JSON can still be a scalar or null; count() and the ?array
        // return type both require an array.
        if (!is_array($results)) {
            throw new \RuntimeException('Invalid JSON response: expected an array of documents');
        }

        echo "Success: Extracted " . count($results) . " documents\n";

        return $results;
    } catch (ClientException $e) {
        // 4xx: the API may describe the problem in a JSON body.
        $response = $e->getResponse();
        $statusCode = $response->getStatusCode();
        $body = json_decode($response->getBody()->getContents(), true);

        // A non-JSON or non-object error body is treated as "no details".
        if (!is_array($body)) {
            $body = [];
        }

        $errorType = $body['error_type'] ?? 'Unknown';
        $message = $body['message'] ?? 'No message provided';

        echo "Client Error ($statusCode): $errorType\n";
        echo "Message: $message\n";

        if (isset($body['details'])) {
            echo "Details: " . json_encode($body['details']) . "\n";
        }

        return null;
    } catch (ServerException $e) {
        // 5xx: nothing the caller can fix; suggest trying again later.
        $statusCode = $e->getResponse()->getStatusCode();

        echo "Server Error ($statusCode): " . $e->getMessage() . "\n";
        echo "The API server encountered an error. Please try again later.\n";

        return null;
    } catch (RequestException $e) {
        // Remaining request failures, with or without a response.
        echo "Request Error: " . $e->getMessage() . "\n";

        if ($e->hasResponse()) {
            echo "Response code: " . $e->getResponse()->getStatusCode() . "\n";
        } else {
            echo "No response received - check if the API server is running\n";
        }

        return null;
    } catch (\RuntimeException $e) {
        // Local problems raised above: missing file, unreadable file, bad JSON.
        echo "Runtime Error: " . $e->getMessage() . "\n";

        return null;
    } finally {
        // Release the upload handle if it is still open after the request.
        if (is_resource($handle)) {
            fclose($handle);
        }
    }
}
|
||||
|
||||
echo "Attempting to extract document via API...\n";
echo str_repeat('=', 60) . "\n";

$result = extractViaApi('document.pdf');

if ($result !== null) {
    // Summarise each returned document; missing fields fall back to defaults.
    foreach ($result as $doc) {
        // mb_strlen() counts characters; strlen() would report bytes.
        $contentLength = mb_strlen($doc['content'] ?? '');
        $mimeType = $doc['mime_type'] ?? 'unknown';

        echo "\nDocument extracted:\n";
        echo " Content length: $contentLength characters\n";
        echo " MIME type: $mimeType\n";

        // array_keys() requires an array, so skip metadata of any other type.
        if (is_array($doc['metadata'] ?? null)) {
            echo " Metadata keys: " . implode(', ', array_keys($doc['metadata'])) . "\n";
        }
    }
} else {
    echo "\nExtraction failed. Check the error messages above.\n";
}
|
||||
|
||||
/**
 * Call extractViaApi() repeatedly until it succeeds or the attempts run out.
 *
 * The pause between attempts starts at $initialDelay seconds and doubles after
 * each failure. Progress and the final failure are reported on stdout.
 *
 * @param string $filePath     Path to the document file
 * @param string $apiUrl       API endpoint URL
 * @param int    $maxRetries   Maximum number of attempts
 * @param float  $initialDelay Seconds to wait before the second attempt
 * @return array|null Extraction results, or null when every attempt failed
 */
function extractWithRetry(
    string $filePath,
    string $apiUrl = 'http://localhost:8000/extract',
    int $maxRetries = 3,
    float $initialDelay = 1.0
): ?array {
    $wait = $initialDelay;

    for ($tries = 1; $tries <= $maxRetries; $tries++) {
        $outcome = extractViaApi($filePath, $apiUrl);
        if ($outcome !== null) {
            return $outcome;
        }

        // No pause after the final attempt.
        if ($tries === $maxRetries) {
            break;
        }

        $upcoming = $tries + 1;
        echo "\nRetrying in " . number_format($wait, 1) . " seconds... (Attempt {$upcoming}/$maxRetries)\n";
        // usleep() takes microseconds.
        usleep((int)($wait * 1000000));
        $wait *= 2;
    }

    echo "\nFailed after $maxRetries attempts\n";

    return null;
}
|
||||
|
||||
// Banner, then the same extraction routed through the retry wrapper.
$rule = str_repeat('=', 60);
echo "\n" . $rule . "\n", "Extracting with retry logic...\n", $rule . "\n";

$resultWithRetry = extractWithRetry('document.pdf', 'http://localhost:8000/extract');

if ($resultWithRetry !== null) {
    echo "\nSuccessfully extracted with retry mechanism\n";
}
|
||||
```
|
||||
134
docs/snippets/php/utils/image_extraction.php
Normal file
134
docs/snippets/php/utils/image_extraction.php
Normal file
@@ -0,0 +1,134 @@
|
||||
```php title="image_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Image Extraction from Documents
|
||||
*
|
||||
* Extract embedded images from PDF and other document formats.
|
||||
* Demonstrates saving images, analyzing metadata, and processing image data.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\PdfConfig;
|
||||
use Kreuzberg\Result\ExtractedImage;
|
||||
|
||||
// Turn on image extraction, both globally and in the PDF-specific settings.
$config = new ExtractionConfig(
    extractImages: true,
    pdf: new PdfConfig(
        extractImages: true,
        imageQuality: 90,
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document_with_images.pdf');

echo "Image Extraction Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Total images extracted: " . count($result->images ?? []) . "\n\n";

$outputDir = './extracted_images';
// mkdir() returns false on failure. The trailing is_dir() tolerates another
// process creating the directory between the check and the call.
if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
    throw new \RuntimeException("Unable to create output directory: $outputDir");
}

foreach ($result->images ?? [] as $index => $image) {
    echo "Image " . ($index + 1) . ":\n";
    echo str_repeat('-', 40) . "\n";

    // Resolve the format once so the file name and the report always agree.
    $format = $image->format ?? 'png';

    $filename = sprintf(
        'page_%d_image_%d.%s',
        $image->pageNumber ?? 0,
        $image->imageIndex ?? $index,
        $format,
    );
    $filepath = $outputDir . '/' . $filename;

    // file_put_contents() returns the number of bytes written, or false on failure.
    $bytesWritten = file_put_contents($filepath, $image->data);

    if ($bytesWritten === false) {
        echo " Error: Failed to save image\n\n";
        continue;
    }

    echo " Saved: $filename\n";
    echo " Size: {$image->width}x{$image->height} pixels\n";
    echo " Format: {$format}\n";
    echo " File size: " . number_format($bytesWritten) . " bytes\n";
    echo " Page: " . ($image->pageNumber ?? 'N/A') . "\n";

    // Skip the ratio when a dimension is zero or missing to avoid dividing by zero.
    if ($image->width > 0 && $image->height > 0) {
        $aspectRatio = $image->width / $image->height;
        echo " Aspect ratio: " . number_format($aspectRatio, 2) . ":1\n";

        // Sides within 10 pixels of each other count as square.
        $orientation = match (true) {
            abs($image->width - $image->height) < 10 => 'Square',
            $image->width > $image->height => 'Landscape',
            default => 'Portrait',
        };
        echo " Orientation: $orientation\n";
    }

    echo "\n";
}
|
||||
|
||||
echo "Image Analysis:\n", str_repeat('=', 60) . "\n";

if (!empty($result->images)) {
    $images = $result->images;

    // Images with either side longer than 800 pixels.
    $oversized = array_filter(
        $images,
        static fn (ExtractedImage $candidate) => $candidate->width > 800 || $candidate->height > 800,
    );
    echo "Large images (>800px): " . count($oversized) . "\n";

    // Raw payload size in bytes, summed over all images.
    $byteSizes = array_map(static fn (ExtractedImage $item) => strlen($item->data), $images);
    echo "Total image data: " . number_format(array_sum($byteSizes) / 1024, 2) . " KB\n";

    // Tally of images per format.
    $formatCounts = [];
    foreach ($images as $image) {
        $label = $image->format ?? 'unknown';
        $formatCounts[$label] ??= 0;
        $formatCounts[$label]++;
    }

    echo "\nImages by format:\n";
    foreach ($formatCounts as $label => $tally) {
        echo " $label: $tally\n";
    }

    // Mean width and height across all images.
    $widthSum = 0;
    $heightSum = 0;
    foreach ($images as $picture) {
        $widthSum += $picture->width;
        $heightSum += $picture->height;
    }
    $howMany = count($images);

    echo "\nAverage dimensions: "
        . round($widthSum / $howMany) . "x"
        . round($heightSum / $howMany) . " pixels\n";
}
|
||||
|
||||
/**
 * Describe the thumbnail size for an image wider than $maxWidth.
 *
 * No image data is produced; the result is only a human-readable size string.
 *
 * @param ExtractedImage $image    Image whose width and height are read
 * @param int            $maxWidth Target thumbnail width in pixels
 * @return string|null Size description, or null when the image already fits
 */
function createThumbnail(ExtractedImage $image, int $maxWidth = 200): ?string
{
    $needsShrinking = $image->width > $maxWidth;
    if (!$needsShrinking) {
        return null;
    }

    // Scale the height by the same factor as the width to keep the aspect ratio.
    $factor = $maxWidth / $image->width;
    $scaledHeight = (int)($image->height * $factor);

    return "Thumbnail would be: " . $maxWidth . "x" . $scaledHeight;
}
|
||||
|
||||
echo "\nThumbnail recommendations:\n";
foreach ($result->images ?? [] as $index => $image) {
    $thumbInfo = createThumbnail($image, 200);

    // A null result means no recommendation is printed for this image.
    if ($thumbInfo === null) {
        continue;
    }

    $position = $index + 1;
    echo " Image {$position}: $thumbInfo\n";
}
|
||||
```
|
||||
187
docs/snippets/php/utils/image_preprocessing.php
Normal file
187
docs/snippets/php/utils/image_preprocessing.php
Normal file
@@ -0,0 +1,187 @@
|
||||
```php title="image_preprocessing.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Image Preprocessing for OCR
|
||||
*
|
||||
* Configure image preprocessing settings to improve OCR accuracy on scanned documents.
|
||||
* Demonstrates various preprocessing techniques like denoising, deskewing, and contrast enhancement.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\TesseractConfig;
|
||||
use Kreuzberg\Config\ImagePreprocessingConfig;
|
||||
|
||||
// Baseline OCR preprocessing: 300 DPI target, denoise, deskew, contrast
// enhancement and 'otsu' binarization.
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        tesseractConfig: new TesseractConfig(
            preprocessing: new ImagePreprocessingConfig(
                targetDpi: 300,
                denoise: true,
                deskew: true,
                contrastEnhance: true,
                binarizationMethod: 'otsu',
            ),
        ),
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned.pdf');

echo "OCR with Image Preprocessing:\n";
echo str_repeat('=', 60) . "\n";
// mb_* functions count and cut by character. strlen()/substr() work on bytes
// and could split a multibyte character in the preview.
echo "Content extracted: " . mb_strlen($result->content) . " characters\n";
echo "Preview: " . mb_substr($result->content, 0, 100) . "...\n\n";
|
||||
|
||||
// Defined once so the configuration and the advice printed below cannot drift apart.
$advancedDpi = 600;

// Heavier preprocessing for poor scans: higher DPI, 'adaptive' binarization,
// sharpening and background removal, plus explicit Tesseract modes.
$advancedConfig = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
        tesseractConfig: new TesseractConfig(
            preprocessing: new ImagePreprocessingConfig(
                targetDpi: $advancedDpi,
                denoise: true,
                deskew: true,
                contrastEnhance: true,
                binarizationMethod: 'adaptive',
                sharpen: true,
                removeBackground: true,
            ),
            pageSegmentationMode: 3,
            engineMode: 3,
        ),
    ),
);

$kreuzberg = new Kreuzberg($advancedConfig);
$result = $kreuzberg->extractFile('poor_quality_scan.pdf');

echo "Advanced Preprocessing Results:\n";
echo str_repeat('=', 60) . "\n";
// mb_strlen() counts characters; strlen() would report bytes.
echo "Content length: " . mb_strlen($result->content) . " characters\n";

if (isset($result->metadata)) {
    $qualityScore = $result->qualityScore ?? null;
    $confidence = $result->metadata['ocr_confidence'] ?? null;

    if ($qualityScore !== null) {
        echo "Quality score: " . number_format($qualityScore, 2) . "\n";

        // Scores below 0.5 are flagged as low quality, with tuning suggestions.
        if ($qualityScore < 0.5) {
            echo "Warning: Low quality extraction detected\n";
            echo "Recommendations:\n";
            echo " - Increase target DPI (current: {$advancedDpi})\n";
            echo " - Try different binarization method\n";
            echo " - Consider rescanning the original document\n";
        }
    }

    if ($confidence !== null) {
        // Printed as a percentage — assumes a 0..1 fraction; TODO confirm.
        echo "OCR confidence: " . number_format($confidence * 100, 1) . "%\n";
    }
}

echo "\n";
|
||||
|
||||
// Three preprocessing presets, from no cleanup to the heaviest cleanup.
$preprocessingProfiles = [
    'basic' => new ImagePreprocessingConfig(
        targetDpi: 300,
        denoise: false,
        deskew: false,
        contrastEnhance: false,
    ),
    'balanced' => new ImagePreprocessingConfig(
        targetDpi: 300,
        denoise: true,
        deskew: true,
        contrastEnhance: true,
        binarizationMethod: 'otsu',
    ),
    'aggressive' => new ImagePreprocessingConfig(
        targetDpi: 600,
        denoise: true,
        deskew: true,
        contrastEnhance: true,
        binarizationMethod: 'adaptive',
        sharpen: true,
        removeBackground: true,
    ),
];

echo "Preprocessing Profile Comparison:\n";
echo str_repeat('=', 60) . "\n";

// Run the same scan through every preset and report output size and wall-clock time.
foreach ($preprocessingProfiles as $profileName => $preprocessing) {
    $profileConfig = new ExtractionConfig(
        ocr: new OcrConfig(
            tesseractConfig: new TesseractConfig(
                preprocessing: $preprocessing,
            ),
        ),
    );

    $kreuzberg = new Kreuzberg($profileConfig);

    // Only the extraction call itself is timed.
    $startTime = microtime(true);
    $result = $kreuzberg->extractFile('sample_scan.pdf');
    $elapsedTime = microtime(true) - $startTime;

    echo ucfirst($profileName) . " profile:\n";
    // mb_strlen() counts characters; strlen() would report bytes.
    echo " Content length: " . mb_strlen($result->content) . " characters\n";
    echo " Processing time: " . number_format($elapsedTime, 3) . " seconds\n";
    echo " Settings:\n";
    echo " - DPI: {$preprocessing->targetDpi}\n";
    echo " - Denoise: " . ($preprocessing->denoise ? 'Yes' : 'No') . "\n";
    echo " - Deskew: " . ($preprocessing->deskew ? 'Yes' : 'No') . "\n";
    echo " - Binarization: " . ($preprocessing->binarizationMethod ?? 'None') . "\n";
    echo "\n";
}
|
||||
|
||||
/**
 * Suggest preprocessing settings for a kind of scanned document.
 *
 * 'old_document' and 'newspaper' get the heavier pipeline (contrast enhancement
 * and background removal) at different DPI and binarization settings. Every
 * other value gets the 300 DPI 'otsu' pipeline, and only 'modern_scan' turns
 * contrast enhancement off.
 *
 * @param string $documentType 'modern_scan', 'old_document', 'newspaper', or anything else for the default
 * @return ImagePreprocessingConfig Settings for that document type
 */
function recommendPreprocessingSettings(string $documentType): ImagePreprocessingConfig
{
    $isNewspaper = $documentType === 'newspaper';

    if ($isNewspaper || $documentType === 'old_document') {
        return new ImagePreprocessingConfig(
            targetDpi: $isNewspaper ? 400 : 600,
            denoise: true,
            deskew: true,
            contrastEnhance: true,
            binarizationMethod: $isNewspaper ? 'sauvola' : 'adaptive',
            removeBackground: true
        );
    }

    // Modern scans and unknown types share everything except contrast enhancement.
    return new ImagePreprocessingConfig(
        targetDpi: 300,
        denoise: true,
        deskew: true,
        contrastEnhance: $documentType !== 'modern_scan',
        binarizationMethod: 'otsu'
    );
}
|
||||
|
||||
// Print the preset recommended for old documents.
$recommended = recommendPreprocessingSettings('old_document');
echo "Recommended preprocessing for old documents:\n",
    " Target DPI: " . $recommended->targetDpi . "\n",
    " Binarization: " . $recommended->binarizationMethod . "\n";
|
||||
```
|
||||
200
docs/snippets/php/utils/keyword_extraction_example.php
Normal file
200
docs/snippets/php/utils/keyword_extraction_example.php
Normal file
@@ -0,0 +1,200 @@
|
||||
```php title="keyword_extraction_example.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Keyword Extraction Example
|
||||
*
|
||||
* Extract keywords from documents using various algorithms.
|
||||
* Demonstrates automatic keyword detection for document analysis and indexing.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\KeywordConfig;
|
||||
use Kreuzberg\Enums\KeywordAlgorithm;
|
||||
|
||||
// Ask for up to 10 YAKE keywords with a minimum score of 0.3.
$config = new ExtractionConfig(
    keywords: new KeywordConfig(
        algorithm: KeywordAlgorithm::YAKE,
        maxKeywords: 10,
        minScore: 0.3,
    ),
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('research_paper.pdf');

echo "Keyword Extraction Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Document: research_paper.pdf\n";
// mb_strlen() counts characters; strlen() would report bytes.
echo "Content length: " . mb_strlen($result->content) . " characters\n\n";

// Keywords are read from the result metadata; a missing key means none were extracted.
$keywords = $result->metadata['keywords'] ?? [];

if (!empty($keywords)) {
    echo "Extracted Keywords:\n";
    echo str_repeat('-', 40) . "\n";

    foreach ($keywords as $keyword) {
        $text = $keyword['text'] ?? '';
        $score = $keyword['score'] ?? 0.0;
        $frequency = $keyword['frequency'] ?? null;

        echo sprintf(" %-30s Score: %.3f", $text, $score);

        // The frequency is optional and only printed when present.
        if ($frequency !== null) {
            echo sprintf(" (appears %d times)", $frequency);
        }

        echo "\n";
    }
    echo "\n";
} else {
    echo "No keywords extracted. Try adjusting minScore or maxKeywords.\n\n";
}
|
||||
|
||||
// Display label => algorithm, compared side by side on the same document.
$algorithms = [
    'YAKE' => KeywordAlgorithm::YAKE,
    'TextRank' => KeywordAlgorithm::TEXT_RANK,
    'TF-IDF' => KeywordAlgorithm::TF_IDF,
];

echo "Algorithm Comparison:\n", str_repeat('=', 60) . "\n";

foreach ($algorithms as $label => $algorithm) {
    // A fresh extractor per algorithm, asking for up to 5 keywords.
    $kreuzberg = new Kreuzberg(
        new ExtractionConfig(
            keywords: new KeywordConfig(
                algorithm: $algorithm,
                maxKeywords: 5,
                minScore: 0.2
            )
        )
    );
    $result = $kreuzberg->extractFile('article.pdf');
    $keywords = $result->metadata['keywords'] ?? [];

    echo "$label algorithm:\n";

    if (empty($keywords)) {
        echo " No keywords extracted\n";
    } else {
        foreach ($keywords as $entry) {
            echo " - " . $entry['text'] . " (" . $entry['score'] . ")\n";
        }
    }

    echo "\n";
}
|
||||
|
||||
/**
 * Pick the best-matching category for a set of extracted keywords.
 *
 * For each category, a keyword's score is added once for every category term
 * that occurs as a substring of the lower-cased keyword text. The category
 * with the highest positive total wins, and ties go to the category listed
 * first. When no term matches anything, 'uncategorized' is returned rather
 * than the first category.
 *
 * @param array<int, array{text?: string, score?: float|int}> $keywords Extracted keywords
 * @return string 'technical', 'business', 'scientific', 'legal' or 'uncategorized'
 */
function categorizeDocument(array $keywords): string
{
    $categories = [
        'technical' => ['algorithm', 'system', 'implementation', 'performance', 'architecture'],
        'business' => ['revenue', 'market', 'customer', 'strategy', 'investment'],
        'scientific' => ['research', 'study', 'analysis', 'experiment', 'hypothesis'],
        'legal' => ['contract', 'agreement', 'liability', 'clause', 'provision'],
    ];

    $bestCategory = 'uncategorized';
    $bestScore = 0.0;

    foreach ($categories as $category => $terms) {
        $total = 0.0;

        foreach ($keywords as $keyword) {
            $keywordText = strtolower($keyword['text'] ?? '');
            $keywordScore = $keyword['score'] ?? 0.0;

            foreach ($terms as $term) {
                if (str_contains($keywordText, $term)) {
                    $total += $keywordScore;
                }
            }
        }

        // Strict ">" keeps the earlier category on ties and leaves
        // 'uncategorized' in place when nothing scored above zero.
        if ($total > $bestScore) {
            $bestScore = $total;
            $bestCategory = $category;
        }
    }

    return $bestCategory;
}
|
||||
|
||||
// Categorise whatever keywords the most recent extraction left in $keywords.
if (!empty($keywords)) {
    $label = ucfirst(categorizeDocument($keywords));
    echo "Document Category: {$label}\n\n";
}
|
||||
|
||||
$documents = [
    'tech_article.pdf',
    'business_report.pdf',
    'research_paper.pdf',
];

// One extractor shared by the whole batch: YAKE, up to 8 keywords per document.
$keywordConfig = new ExtractionConfig(
    keywords: new KeywordConfig(
        algorithm: KeywordAlgorithm::YAKE,
        maxKeywords: 8,
        minScore: 0.25
    )
);
$kreuzberg = new Kreuzberg($keywordConfig);

echo "Batch Keyword Extraction:\n", str_repeat('=', 60) . "\n";

foreach ($documents as $path) {
    // Missing files are reported and skipped.
    if (!file_exists($path)) {
        echo "$path: File not found\n\n";
        continue;
    }

    $result = $kreuzberg->extractFile($path);
    $keywords = $result->metadata['keywords'] ?? [];

    echo basename($path) . ":\n";

    if (empty($keywords)) {
        echo " No keywords extracted\n";
    } else {
        // Show the first five keyword texts, then the derived category.
        $topTexts = array_column(array_slice($keywords, 0, 5), 'text');
        echo " Top keywords: " . implode(', ', $topTexts) . "\n";
        echo " Category: " . ucfirst(categorizeDocument($keywords)) . "\n";
    }

    echo "\n";
}
|
||||
|
||||
// Inverted index: lower-cased keyword text => base names of the documents containing it.
$keywordIndex = [];

foreach ($documents as $document) {
    if (!file_exists($document)) {
        continue;
    }

    $result = $kreuzberg->extractFile($document);
    $keywords = $result->metadata['keywords'] ?? [];

    foreach ($keywords as $keyword) {
        // mb_strtolower() also folds non-ASCII letters; strtolower() works on bytes.
        $text = mb_strtolower($keyword['text'] ?? '');

        // A keyword without text would otherwise be indexed under an empty key.
        if ($text === '') {
            continue;
        }

        $keywordIndex[$text][] = basename($document);
    }
}

echo "Keyword Index (for search):\n";
echo str_repeat('=', 60) . "\n";
// preserve_keys keeps numeric keywords such as "2024" intact; by default
// array_slice() renumbers integer-like keys from 0.
foreach (array_slice($keywordIndex, 0, 10, preserve_keys: true) as $keyword => $docs) {
    // The same document can list a keyword more than once; show it once.
    echo "$keyword: " . implode(', ', array_unique($docs)) . "\n";
}
|
||||
```
|
||||
197
docs/snippets/php/utils/language_detection.php
Normal file
197
docs/snippets/php/utils/language_detection.php
Normal file
@@ -0,0 +1,197 @@
|
||||
```php title="language_detection.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Language Detection
|
||||
*
|
||||
* Automatically detect the language of extracted document content.
|
||||
* Useful for routing documents to language-specific processing pipelines.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\LanguageDetectionConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
languageDetection: new LanguageDetectionConfig(
|
||||
enabled: true,
|
||||
minConfidence: 0.9,
|
||||
detectMultiple: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
echo "Language Detection Results:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Document: document.pdf\n";
|
||||
echo "Content length: " . strlen($result->content) . " characters\n\n";
|
||||
|
||||
$detectedLanguages = $result->detectedLanguages ?? [];
|
||||
|
||||
if (!empty($detectedLanguages)) {
|
||||
echo "Detected languages: " . implode(', ', $detectedLanguages) . "\n";
|
||||
|
||||
$primaryLanguage = $detectedLanguages[0];
|
||||
echo "Primary language: $primaryLanguage\n\n";
|
||||
|
||||
if (isset($result->metadata['language_confidence'])) {
|
||||
echo "Language confidence scores:\n";
|
||||
foreach ($result->metadata['language_confidence'] as $lang => $confidence) {
|
||||
echo sprintf(" %-10s: %.1f%%\n", $lang, $confidence * 100);
|
||||
}
|
||||
echo "\n";
|
||||
}
|
||||
} else {
|
||||
echo "No language detected or confidence too low.\n";
|
||||
echo "Try lowering minConfidence threshold.\n\n";
|
||||
}
|
||||
|
||||
if (!empty($detectedLanguages)) {
|
||||
$primaryLanguage = $detectedLanguages[0];
|
||||
|
||||
match ($primaryLanguage) {
|
||||
'en', 'eng' => print("Processing as English document...\n"),
|
||||
'es', 'spa' => print("Processing as Spanish document...\n"),
|
||||
'fr', 'fra' => print("Processing as French document...\n"),
|
||||
'de', 'deu' => print("Processing as German document...\n"),
|
||||
'zh', 'zho' => print("Processing as Chinese document...\n"),
|
||||
default => print("Processing as $primaryLanguage document...\n"),
|
||||
};
|
||||
}
|
||||
|
||||
echo "\n" . str_repeat('=', 60) . "\n";
|
||||
echo "Testing Different Confidence Thresholds:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$thresholds = [0.5, 0.7, 0.9, 0.95];
|
||||
|
||||
// Run the same document through detection once per confidence threshold and
// print which languages survive each cut-off.
foreach ($thresholds as $threshold) {
    $thresholdConfig = new ExtractionConfig(
        languageDetection: new LanguageDetectionConfig(
            enabled: true,
            minConfidence: $threshold,
            detectMultiple: true
        )
    );

    $kreuzberg = new Kreuzberg($thresholdConfig);
    $result = $kreuzberg->extractFile('document.pdf');
    $languages = $result->detectedLanguages ?? [];

    printf("Threshold %.2f: ", $threshold);

    echo empty($languages)
        ? "No languages detected\n"
        : implode(', ', $languages) . "\n";
}
|
||||
|
||||
/**
 * Translate a language code into a human-readable English name.
 *
 * Both two-letter and three-letter codes are recognised, matching the code
 * forms the routing logic in this file accepts (e.g. 'en' and 'eng').
 *
 * @param string $code Language code such as 'en' or 'eng'.
 *
 * @return string The English language name, or the code with its first letter
 *                upper-cased when the code is not in the table.
 */
function getLanguageName(string $code): string
{
    $languageNames = [
        'en' => 'English',
        'es' => 'Spanish',
        'fr' => 'French',
        'de' => 'German',
        'it' => 'Italian',
        'pt' => 'Portuguese',
        'ru' => 'Russian',
        'zh' => 'Chinese',
        'ja' => 'Japanese',
        'ko' => 'Korean',
        'ar' => 'Arabic',
        'hi' => 'Hindi',
        'nl' => 'Dutch',
        'pl' => 'Polish',
        'tr' => 'Turkish',
    ];

    // Three-letter equivalents, resolved through the table above so each
    // display name is defined in exactly one place.
    $threeLetterAliases = [
        'eng' => 'en',
        'spa' => 'es',
        'fra' => 'fr',
        'deu' => 'de',
        'ita' => 'it',
        'por' => 'pt',
        'rus' => 'ru',
        'zho' => 'zh',
        'jpn' => 'ja',
        'kor' => 'ko',
        'ara' => 'ar',
        'hin' => 'hi',
        'nld' => 'nl',
        'pol' => 'pl',
        'tur' => 'tr',
    ];

    $lookupCode = $threeLetterAliases[$code] ?? $code;

    return $languageNames[$lookupCode] ?? ucfirst($code);
}
|
||||
|
||||
echo "\n" . str_repeat('=', 60) . "\n";
|
||||
echo "Detected Languages (Full Names):\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
if (!empty($detectedLanguages)) {
|
||||
foreach ($detectedLanguages as $langCode) {
|
||||
echo " - " . getLanguageName($langCode) . " ($langCode)\n";
|
||||
}
|
||||
} else {
|
||||
echo "No languages detected.\n";
|
||||
}
|
||||
|
||||
$documents = [
|
||||
'english_doc.pdf',
|
||||
'spanish_doc.pdf',
|
||||
'german_doc.pdf',
|
||||
];
|
||||
|
||||
echo "\n" . str_repeat('=', 60) . "\n";
|
||||
echo "Batch Language Detection:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$detectionConfig = new ExtractionConfig(
|
||||
languageDetection: new LanguageDetectionConfig(
|
||||
enabled: true,
|
||||
minConfidence: 0.8,
|
||||
detectMultiple: false
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($detectionConfig);
|
||||
|
||||
foreach ($documents as $document) {
|
||||
if (!file_exists($document)) {
|
||||
echo basename($document) . ": File not found\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
$result = $kreuzberg->extractFile($document);
|
||||
$languages = $result->detectedLanguages ?? [];
|
||||
|
||||
echo basename($document) . ": ";
|
||||
|
||||
if (!empty($languages)) {
|
||||
$primaryLang = $languages[0];
|
||||
echo getLanguageName($primaryLang) . " ($primaryLang)\n";
|
||||
} else {
|
||||
echo "Language not detected\n";
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Pick a processing queue name from a document's detected languages.
 *
 * Only the first entry of $detectedLanguages is consulted; $filePath is
 * accepted but not used by the routing decision.
 *
 * @param string $filePath          Path of the document being routed.
 * @param array  $detectedLanguages Detected language codes, primary first.
 *
 * @return string Queue name; 'default_queue' when nothing was detected and
 *                'multilingual_queue' for codes without a dedicated queue.
 */
function routeDocumentByLanguage(string $filePath, array $detectedLanguages): string
{
    if ($detectedLanguages === []) {
        return 'default_queue';
    }

    $lead = $detectedLanguages[0];

    $queueMembers = [
        'english_processing_queue' => ['en', 'eng'],
        'spanish_processing_queue' => ['es', 'spa'],
        'french_processing_queue' => ['fr', 'fra'],
        'german_processing_queue' => ['de', 'deu'],
        'cjk_processing_queue' => ['zh', 'zho', 'ja', 'jpn', 'ko', 'kor'],
        'rtl_processing_queue' => ['ar', 'ara', 'he', 'heb'],
    ];

    foreach ($queueMembers as $queue => $codes) {
        // Strict comparison mirrors match()'s identity semantics.
        if (in_array($lead, $codes, true)) {
            return $queue;
        }
    }

    return 'multilingual_queue';
}
|
||||
|
||||
echo "\n" . str_repeat('=', 60) . "\n";
|
||||
echo "Document Routing Based on Language:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
if (!empty($detectedLanguages)) {
|
||||
$queue = routeDocumentByLanguage('document.pdf', $detectedLanguages);
|
||||
echo "Document routed to: $queue\n";
|
||||
}
|
||||
```
|
||||
236
docs/snippets/php/utils/language_detection_multilingual.php
Normal file
236
docs/snippets/php/utils/language_detection_multilingual.php
Normal file
@@ -0,0 +1,236 @@
|
||||
```php title="language_detection_multilingual.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Multilingual Document Language Detection
|
||||
*
|
||||
* Detect multiple languages in documents that contain mixed-language content.
|
||||
* Useful for processing multilingual documents, translations, and international content.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\LanguageDetectionConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
languageDetection: new LanguageDetectionConfig(
|
||||
enabled: true,
|
||||
minConfidence: 0.7,
|
||||
detectMultiple: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('multilingual_document.pdf');
|
||||
|
||||
echo "Multilingual Language Detection:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Document: multilingual_document.pdf\n\n";
|
||||
|
||||
$detectedLanguages = $result->detectedLanguages ?? [];
|
||||
$languageCount = count($detectedLanguages);
|
||||
|
||||
echo "Detected $languageCount language(s): " . implode(', ', $detectedLanguages) . "\n\n";
|
||||
|
||||
if ($languageCount > 1) {
|
||||
echo "This is a multilingual document.\n";
|
||||
echo "Languages present:\n";
|
||||
|
||||
foreach ($detectedLanguages as $index => $language) {
|
||||
$label = $index === 0 ? 'Primary' : 'Secondary';
|
||||
echo " $label: $language\n";
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
} elseif ($languageCount === 1) {
|
||||
echo "This is a monolingual document.\n";
|
||||
echo "Language: {$detectedLanguages[0]}\n\n";
|
||||
} else {
|
||||
echo "No languages detected.\n\n";
|
||||
}
|
||||
|
||||
// Render each language's share of the document as a fixed-width text bar.
if (isset($result->metadata['language_distribution'])) {
    echo "Language Distribution:\n";
    echo str_repeat('-', 40) . "\n";

    $barWidth = 40;

    foreach ($result->metadata['language_distribution'] as $lang => $percentage) {
        // Clamp to the bar width: a share outside 0..1 would otherwise give
        // str_repeat() a negative count or overflow the brackets.
        $barLength = max(0, min($barWidth, (int) ($percentage * $barWidth)));

        // Pad by hand instead of using "%-40s": sprintf() measures width in
        // bytes, and the block character is multi-byte in UTF-8, so the
        // closing bracket would not line up.
        $bar = str_repeat('█', $barLength) . str_repeat(' ', $barWidth - $barLength);

        echo sprintf(
            " %-10s [%s] %5.1f%%\n",
            $lang,
            $bar,
            $percentage * 100
        );
    }

    echo "\n";
}
|
||||
|
||||
/**
 * Classify a document by how many languages were detected in it.
 *
 * @param array $languages Detected language codes.
 *
 * @return string 'unknown' (no languages), 'monolingual', 'multilingual'
 *                (three or more), a named pair such as
 *                'English-German bilingual', or plain 'bilingual' for pairs
 *                without a dedicated label.
 */
function categorizeMultilingualDocument(array $languages): string
{
    $count = count($languages);

    if ($count === 0) {
        return 'unknown';
    }

    if ($count === 1) {
        return 'monolingual';
    }

    if ($count > 2) {
        return 'multilingual';
    }

    // Sorting makes the lookup independent of detection order.
    sort($languages);
    $pair = implode('-', $languages);

    // Keys must be spelled in sorted order, because $pair is built from the
    // sorted codes: English+German sorts to 'de-en', never 'en-de'.
    $commonPairs = [
        'en-es' => 'English-Spanish bilingual',
        'en-fr' => 'English-French bilingual',
        'de-en' => 'English-German bilingual',
        'en-zh' => 'English-Chinese bilingual',
        // Three-letter forms of the same pairs.
        'eng-spa' => 'English-Spanish bilingual',
        'eng-fra' => 'English-French bilingual',
        'deu-eng' => 'English-German bilingual',
        'eng-zho' => 'English-Chinese bilingual',
    ];

    return $commonPairs[$pair] ?? 'bilingual';
}
|
||||
|
||||
$docType = categorizeMultilingualDocument($detectedLanguages);
|
||||
echo "Document type: $docType\n\n";
|
||||
|
||||
if ($languageCount > 1) {
|
||||
echo "Multilingual Processing Recommendations:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
echo "1. Consider splitting content by language\n";
|
||||
echo "2. Use language-specific OCR models if available\n";
|
||||
echo "3. Apply appropriate tokenization for each language\n";
|
||||
echo "4. Use multilingual embedding models for semantic search\n\n";
|
||||
}
|
||||
|
||||
/**
 * Group the non-blank lines of $content by language.
 *
 * No per-line detection happens here: every non-blank line is filed under the
 * first entry of $languages, or under 'unknown' when that list is empty.
 *
 * @param string $content   Text to split into lines.
 * @param array  $languages Detected language codes, primary first.
 *
 * @return array Map of language code => list of lines; empty when $content
 *               has no non-blank lines.
 */
function extractLanguageSections(string $content, array $languages): array
{
    $currentLang = $languages[0] ?? 'unknown';
    $sections = [];

    foreach (explode("\n", $content) as $line) {
        // Compare against '' instead of using empty(): empty('0') is true, so
        // a line holding just "0" would be discarded as if it were blank.
        if (trim($line) === '') {
            continue;
        }

        $sections[$currentLang][] = $line;
    }

    return $sections;
}
|
||||
|
||||
$testDocuments = [
|
||||
'english_only.pdf',
|
||||
'spanish_english.pdf',
|
||||
'multilingual_eu.pdf',
|
||||
];
|
||||
|
||||
echo "Batch Multilingual Analysis:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$multilingualConfig = new ExtractionConfig(
|
||||
languageDetection: new LanguageDetectionConfig(
|
||||
enabled: true,
|
||||
minConfidence: 0.6,
|
||||
detectMultiple: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($multilingualConfig);
|
||||
|
||||
$statistics = [
|
||||
'monolingual' => 0,
|
||||
'bilingual' => 0,
|
||||
'multilingual' => 0,
|
||||
];
|
||||
|
||||
// Report the detected languages for each test document and tally how many
// documents are monolingual, bilingual, or multilingual.
foreach ($testDocuments as $document) {
    if (file_exists($document)) {
        $result = $kreuzberg->extractFile($document);
        $languages = $result->detectedLanguages ?? [];
        $type = categorizeMultilingualDocument($languages);

        echo basename($document) . ":\n"
            . " Languages: " . implode(', ', $languages) . "\n"
            . " Type: $type\n\n";

        // Documents with no detected language are not counted in any bucket.
        $bucket = match (count($languages)) {
            0 => null,
            1 => 'monolingual',
            2 => 'bilingual',
            default => 'multilingual',
        };

        if ($bucket !== null) {
            $statistics[$bucket]++;
        }
    } else {
        echo basename($document) . ": File not found\n";
    }
}
|
||||
|
||||
echo "Statistics:\n";
|
||||
echo " Monolingual: {$statistics['monolingual']}\n";
|
||||
echo " Bilingual: {$statistics['bilingual']}\n";
|
||||
echo " Multilingual: {$statistics['multilingual']}\n\n";
|
||||
|
||||
/**
 * Count how often each language pair occurs across a set of documents.
 *
 * Paths that do not exist are skipped. For every document with at least two
 * detected languages, the codes are sorted and the first two form the pair
 * key ("xx-yy").
 *
 * @param array     $documents File paths to analyse.
 * @param Kreuzberg $kreuzberg Extractor used to process each file.
 *
 * @return array Map of pair key => occurrence count, highest count first.
 */
function analyzeLanguagePairs(array $documents, Kreuzberg $kreuzberg): array
{
    $tally = [];

    foreach ($documents as $path) {
        if (file_exists($path)) {
            $detected = $kreuzberg->extractFile($path)->detectedLanguages ?? [];

            if (count($detected) < 2) {
                continue;
            }

            sort($detected);
            $key = implode('-', array_slice($detected, 0, 2));

            $tally[$key] = ($tally[$key] ?? 0) + 1;
        }
    }

    arsort($tally);

    return $tally;
}
|
||||
|
||||
$translationPairs = [
|
||||
'en-es' => 'English ↔ Spanish',
|
||||
'en-fr' => 'English ↔ French',
|
||||
'en-de' => 'English ↔ German',
|
||||
'en-zh' => 'English ↔ Chinese',
|
||||
'en-ja' => 'English ↔ Japanese',
|
||||
];
|
||||
|
||||
echo "Common Translation Pairs:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
foreach ($translationPairs as $code => $name) {
|
||||
echo " $code: $name\n";
|
||||
}
|
||||
|
||||
echo "\nUse these configurations for translation document processing.\n";
|
||||
```
|
||||
203
docs/snippets/php/utils/quality_processing_example.php
Normal file
203
docs/snippets/php/utils/quality_processing_example.php
Normal file
@@ -0,0 +1,203 @@
|
||||
```php title="quality_processing_example.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Quality Processing Example
|
||||
*
|
||||
* Enable quality processing to assess and improve extraction quality.
|
||||
* Useful for detecting low-quality scans and suggesting improvements.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
enableQualityProcessing: true
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('scanned_document.pdf');
|
||||
|
||||
echo "Quality Processing Results:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Document: scanned_document.pdf\n";
|
||||
echo "Content length: " . strlen($result->content) . " characters\n\n";
|
||||
|
||||
$qualityScore = $result->qualityScore ?? null;
|
||||
|
||||
// Translate the numeric quality score into a rating and status line, and list
// remediation steps when the score falls in the lowest band.
if ($qualityScore === null) {
    echo "Quality score not available.\n";
    echo "Enable quality processing in configuration.\n\n";
} else {
    [$rating, $status] = match (true) {
        $qualityScore >= 0.8 => ['Excellent', 'Status: ✓ Ready for production use'],
        $qualityScore >= 0.6 => ['Good', 'Status: ✓ Acceptable quality'],
        $qualityScore >= 0.5 => ['Fair', 'Status: ⚠ May require review'],
        default => ['Poor', 'Status: ✗ Requires attention'],
    };

    echo "Quality Score: " . number_format($qualityScore, 2) . "\n";
    echo "Rating: " . $rating . "\n";
    echo $status . "\n";
    echo "\n";

    if ($qualityScore < 0.5) {
        $recommendations = [
            "Recommendations for Improvement:\n",
            str_repeat('-', 40) . "\n",
            "1. Re-scan with higher DPI (300+ recommended)\n",
            "2. Ensure original is clean and well-lit\n",
            "3. Adjust OCR preprocessing settings:\n",
            " - Enable denoising\n",
            " - Enable deskewing\n",
            " - Increase contrast enhancement\n",
            "4. Try different binarization methods\n",
            "5. Consider manual review and correction\n\n",
        ];

        echo implode('', $recommendations);
    }
}
|
||||
|
||||
if (isset($result->metadata['ocr_confidence'])) {
|
||||
$ocrConfidence = $result->metadata['ocr_confidence'];
|
||||
echo "OCR Confidence: " . number_format($ocrConfidence * 100, 1) . "%\n\n";
|
||||
|
||||
if ($ocrConfidence < 0.7) {
|
||||
echo "⚠ Low OCR confidence detected.\n";
|
||||
echo "The extracted text may contain errors.\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
// Print every entry of the quality_metrics metadata as "Label: value".
if (isset($result->metadata['quality_metrics'])) {
    echo "Detailed Quality Metrics:\n";
    echo str_repeat('-', 40) . "\n";

    $metrics = $result->metadata['quality_metrics'];

    foreach ($metrics as $metric => $value) {
        $formattedValue = match (true) {
            // is_numeric() also accepts numeric strings, which number_format()
            // rejects with a TypeError under strict_types, so cast first.
            is_numeric($value) => number_format((float) $value, 3),
            // Nested structures cannot be formatted with %s; show them as JSON.
            is_array($value), is_object($value) => json_encode($value),
            default => $value,
        };

        // Cast the key: list-style metadata has integer keys, and str_replace()
        // only accepts string subjects under strict_types.
        $label = ucwords(str_replace('_', ' ', (string) $metric));

        echo sprintf(" %-25s: %s\n", $label, $formattedValue);
    }

    echo "\n";
}
|
||||
|
||||
$documents = [
|
||||
'high_quality_scan.pdf',
|
||||
'medium_quality_scan.pdf',
|
||||
'low_quality_scan.pdf',
|
||||
];
|
||||
|
||||
echo "Batch Quality Analysis:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$qualityConfig = new ExtractionConfig(
|
||||
enableQualityProcessing: true
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($qualityConfig);
|
||||
$qualityResults = [];
|
||||
|
||||
foreach ($documents as $document) {
|
||||
if (!file_exists($document)) {
|
||||
echo basename($document) . ": File not found\n\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
$result = $kreuzberg->extractFile($document);
|
||||
$score = $result->qualityScore ?? 0.0;
|
||||
|
||||
$qualityResults[$document] = [
|
||||
'score' => $score,
|
||||
'content_length' => strlen($result->content),
|
||||
'result' => $result,
|
||||
];
|
||||
|
||||
echo basename($document) . ":\n";
|
||||
echo " Quality score: " . number_format($score, 2) . "\n";
|
||||
echo " Content length: " . strlen($result->content) . " chars\n";
|
||||
|
||||
$indicator = match(true) {
|
||||
$score >= 0.8 => '✓ Excellent',
|
||||
$score >= 0.6 => '✓ Good',
|
||||
$score >= 0.5 => '⚠ Fair',
|
||||
default => '✗ Poor',
|
||||
};
|
||||
|
||||
echo " Status: $indicator\n\n";
|
||||
}
|
||||
|
||||
if (!empty($qualityResults)) {
|
||||
$scores = array_column($qualityResults, 'score');
|
||||
$avgScore = array_sum($scores) / count($scores);
|
||||
$maxScore = max($scores);
|
||||
$minScore = min($scores);
|
||||
|
||||
echo "Quality Statistics:\n";
|
||||
echo str_repeat('-', 40) . "\n";
|
||||
echo " Average: " . number_format($avgScore, 2) . "\n";
|
||||
echo " Highest: " . number_format($maxScore, 2) . "\n";
|
||||
echo " Lowest: " . number_format($minScore, 2) . "\n\n";
|
||||
|
||||
$lowQualityDocs = array_filter(
|
||||
$qualityResults,
|
||||
fn($result) => $result['score'] < 0.5
|
||||
);
|
||||
|
||||
if (!empty($lowQualityDocs)) {
|
||||
echo "Documents Requiring Attention:\n";
|
||||
echo str_repeat('-', 40) . "\n";
|
||||
|
||||
foreach ($lowQualityDocs as $doc => $data) {
|
||||
echo " - " . basename($doc) . " (score: " . number_format($data['score'], 2) . ")\n";
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Decide whether a document should be run through extraction again.
 *
 * @param float $qualityScore  Quality score of the extraction.
 * @param int   $contentLength Length of the extracted content.
 *
 * @return bool True when the score is below 0.5 or fewer than 100 units of
 *              content were extracted.
 */
function needsReprocessing(float $qualityScore, int $contentLength): bool
{
    if ($qualityScore < 0.5) {
        return true;
    }

    return $contentLength < 100;
}
|
||||
|
||||
/**
 * Choose a review queue from a document's quality score.
 *
 * Thresholds are checked from highest to lowest; $filePath is accepted but
 * does not influence the result.
 *
 * @param string $filePath     Path of the document being routed.
 * @param float  $qualityScore Quality score of the extraction.
 *
 * @return string Name of the queue the document belongs in.
 */
function routeDocumentByQuality(string $filePath, float $qualityScore): string
{
    if ($qualityScore >= 0.8) {
        return 'auto_processing_queue';
    }

    if ($qualityScore >= 0.6) {
        return 'standard_review_queue';
    }

    if ($qualityScore >= 0.5) {
        return 'detailed_review_queue';
    }

    return 'manual_review_queue';
}
|
||||
|
||||
echo "Document Routing Based on Quality:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
foreach ($qualityResults as $doc => $data) {
|
||||
$queue = routeDocumentByQuality($doc, $data['score']);
|
||||
$reprocess = needsReprocessing($data['score'], $data['content_length']);
|
||||
|
||||
echo basename($doc) . ":\n";
|
||||
echo " Route to: $queue\n";
|
||||
|
||||
if ($reprocess) {
|
||||
echo " Action: Reprocess with enhanced settings\n";
|
||||
} else {
|
||||
echo " Action: Continue standard workflow\n";
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
}
|
||||
```
|
||||
20
docs/snippets/php/utils/standalone_embed.md
Normal file
20
docs/snippets/php/utils/standalone_embed.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```php
|
||||
<?php
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
use Kreuzberg\Config\EmbeddingModelType;
|
||||
|
||||
$kreuzberg = new Kreuzberg();
|
||||
|
||||
// Embed with default config (balanced preset)
|
||||
$embeddings = $kreuzberg->embed(["Hello world", "How are you?"]);
|
||||
|
||||
// Embed with specific preset
|
||||
$config = new EmbeddingConfig(model: EmbeddingModelType::preset("fast"));
|
||||
$embeddings = $kreuzberg->embed(["Hello world"], $config);
|
||||
|
||||
// Each embedding is a float array
|
||||
foreach ($embeddings as $i => $vector) {
|
||||
echo "Text $i: " . count($vector) . " dimensions\n";
|
||||
}
|
||||
```
|
||||
237
docs/snippets/php/utils/tables.php
Normal file
237
docs/snippets/php/utils/tables.php
Normal file
@@ -0,0 +1,237 @@
|
||||
```php title="tables.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Table Extraction and Processing
|
||||
*
|
||||
* Extract tables from documents and convert them to various formats.
|
||||
* Demonstrates table processing, formatting, and export capabilities.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Result\ExtractedTable;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
extractTables: true
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
echo "Table Extraction Results:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Tables found: " . count($result->tables) . "\n\n";
|
||||
|
||||
foreach ($result->tables as $tableIndex => $table) {
|
||||
echo "Table " . ($tableIndex + 1) . ":\n";
|
||||
echo str_repeat('-', 40) . "\n";
|
||||
|
||||
$rowCount = count($table->cells);
|
||||
$colCount = !empty($table->cells) ? count($table->cells[0]) : 0;
|
||||
|
||||
echo " Dimensions: $rowCount rows × $colCount columns\n";
|
||||
|
||||
if (isset($table->pageNumber)) {
|
||||
echo " Page: {$table->pageNumber}\n";
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo " Markdown representation:\n";
|
||||
echo str_repeat('-', 40) . "\n";
|
||||
echo $table->markdown . "\n\n";
|
||||
|
||||
echo " Raw data preview:\n";
|
||||
echo str_repeat('-', 40) . "\n";
|
||||
|
||||
$previewRows = array_slice($table->cells, 0, 3);
|
||||
foreach ($previewRows as $rowIndex => $row) {
|
||||
echo " Row " . ($rowIndex + 1) . ": [" . implode(' | ', $row) . "]\n";
|
||||
}
|
||||
|
||||
if ($rowCount > 3) {
|
||||
echo " ... and " . ($rowCount - 3) . " more rows\n";
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
}
|
||||
|
||||
echo "Exporting Tables to CSV:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$outputDir = './exported_tables';
|
||||
if (!is_dir($outputDir)) {
|
||||
mkdir($outputDir, 0755, true);
|
||||
}
|
||||
|
||||
foreach ($result->tables as $index => $table) {
|
||||
$filename = sprintf('table_%d.csv', $index + 1);
|
||||
$filepath = $outputDir . '/' . $filename;
|
||||
|
||||
$fp = fopen($filepath, 'w');
|
||||
|
||||
if ($fp !== false) {
|
||||
foreach ($table->cells as $row) {
|
||||
fputcsv($fp, $row);
|
||||
}
|
||||
|
||||
fclose($fp);
|
||||
echo "Saved: $filename\n";
|
||||
} else {
|
||||
echo "Error: Failed to create $filename\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Exporting Tables to JSON:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
// Write one JSON file per table, containing its position, dimensions, raw
// cells, and markdown rendering.
foreach ($result->tables as $index => $table) {
    $filename = sprintf('table_%d.json', $index + 1);
    $filepath = $outputDir . '/' . $filename;

    $tableData = [
        'index' => $index + 1,
        'page' => $table->pageNumber ?? null,
        'dimensions' => [
            'rows' => count($table->cells),
            'columns' => !empty($table->cells) ? count($table->cells[0]) : 0,
        ],
        'data' => $table->cells,
        'markdown' => $table->markdown,
    ];

    $json = json_encode($tableData, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);

    // json_encode() returns false on failure (e.g. malformed UTF-8 in a cell);
    // report it instead of writing an empty file.
    if ($json === false) {
        echo "Error: Failed to encode $filename (" . json_last_error_msg() . ")\n";
        continue;
    }

    // Only claim success when the write actually happened.
    if (file_put_contents($filepath, $json) === false) {
        echo "Error: Failed to create $filename\n";
        continue;
    }

    echo "Saved: $filename\n";
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
/**
 * Render an extracted table as an HTML <table> fragment.
 *
 * The first row is emitted with <th> cells and every other row with <td>
 * cells. Cell text is HTML-escaped.
 *
 * @param ExtractedTable $table Table whose `cells` rows are rendered.
 *
 * @return string HTML markup for the table, without a trailing newline.
 */
function tableToHtml(ExtractedTable $table): string
{
    $html = "<table>\n";

    foreach ($table->cells as $rowIndex => $row) {
        $tag = $rowIndex === 0 ? 'th' : 'td';

        $html .= " <tr>\n";

        foreach ($row as $cell) {
            // ENT_SUBSTITUTE: without it htmlspecialchars() returns an empty
            // string for input containing invalid UTF-8, silently blanking the
            // cell. The cast guards against non-string cells, which would be a
            // TypeError under strict_types.
            $escapedCell = htmlspecialchars((string) $cell, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            $html .= " <$tag>$escapedCell</$tag>\n";
        }

        $html .= " </tr>\n";
    }

    $html .= "</table>";

    return $html;
}
|
||||
|
||||
echo "Exporting Tables to HTML:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
// Write one standalone HTML page per table.
foreach ($result->tables as $index => $table) {
    $filename = sprintf('table_%d.html', $index + 1);
    $filepath = $outputDir . '/' . $filename;

    $html = "<!DOCTYPE html>\n";
    $html .= "<html>\n<head>\n";
    $html .= " <meta charset=\"UTF-8\">\n";
    $html .= " <title>Table " . ($index + 1) . "</title>\n";
    $html .= " <style>\n";
    $html .= " table { border-collapse: collapse; width: 100%; }\n";
    $html .= " th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }\n";
    $html .= " th { background-color: #f2f2f2; }\n";
    $html .= " </style>\n";
    $html .= "</head>\n<body>\n";
    $html .= " <h1>Table " . ($index + 1) . "</h1>\n";
    $html .= tableToHtml($table) . "\n";
    $html .= "</body>\n</html>";

    // Only claim success when the write actually happened.
    if (file_put_contents($filepath, $html) === false) {
        echo "Error: Failed to create $filename\n";
        continue;
    }

    echo "Saved: $filename\n";
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Table Analysis:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
// Summarise each table: total, empty, and numeric cell counts, plus a coarse
// classification based on the share of numeric cells.
foreach ($result->tables as $index => $table) {
    echo "Table " . ($index + 1) . " Analysis:\n";

    $cells = $table->cells;
    $totalCells = array_sum(array_map('count', $cells));
    $emptyCells = 0;
    $numericCells = 0;

    foreach ($cells as $row) {
        foreach ($row as $cell) {
            // Compare against '' instead of using empty(): empty('0') is true,
            // so a cell holding "0" would be counted as empty as well as
            // numeric.
            if (trim((string) $cell) === '') {
                $emptyCells++;
            }

            if (is_numeric($cell)) {
                $numericCells++;
            }
        }
    }

    // Guard the ratios against a table with no cells at all.
    $denominator = max($totalCells, 1);
    $emptyRatio = $emptyCells / $denominator;
    $numericRatio = $numericCells / $denominator;

    echo " Total cells: $totalCells\n";
    echo " Empty cells: $emptyCells (" . number_format($emptyRatio * 100, 1) . "%)\n";
    echo " Numeric cells: $numericCells (" . number_format($numericRatio * 100, 1) . "%)\n";

    $tableType = match (true) {
        $numericRatio > 0.5 => 'Data/Numeric Table',
        $numericRatio > 0.2 => 'Mixed Content Table',
        default => 'Text Table',
    };

    echo " Table type: $tableType\n\n";
}
|
||||
|
||||
/**
 * Convert a table into a list of records keyed by the header row.
 *
 * The first row of `cells` supplies the keys. Each following row becomes one
 * record; cells missing from a short row are filled with an empty string.
 *
 * @param ExtractedTable $table Table to convert; it is not modified.
 *
 * @return array List of header => value maps, empty when the table has no rows.
 */
function tableToAssociativeArray(ExtractedTable $table): array
{
    // Work on a local copy: array_shift() modifies its argument in place, so
    // applying it to $table->cells directly would strip the header row from
    // the caller's table object (and would fail if that property is readonly).
    $rows = $table->cells;

    if (empty($rows)) {
        return [];
    }

    $headers = array_shift($rows);
    $data = [];

    foreach ($rows as $row) {
        $rowData = [];

        foreach ($headers as $index => $header) {
            $rowData[$header] = $row[$index] ?? '';
        }

        $data[] = $rowData;
    }

    return $data;
}
|
||||
|
||||
if (!empty($result->tables)) {
|
||||
$firstTable = $result->tables[0];
|
||||
$associativeData = tableToAssociativeArray($firstTable);
|
||||
|
||||
echo "First Table as Associative Array:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo json_encode(array_slice($associativeData, 0, 3), JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE) . "\n";
|
||||
|
||||
if (count($associativeData) > 3) {
|
||||
echo "... and " . (count($associativeData) - 3) . " more records\n";
|
||||
}
|
||||
}
|
||||
```
|
||||
169
docs/snippets/php/utils/token_reduction.php
Normal file
169
docs/snippets/php/utils/token_reduction.php
Normal file
@@ -0,0 +1,169 @@
|
||||
```php title="token_reduction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Token Reduction Configuration
|
||||
*
|
||||
* Configure token reduction to compress extracted content while preserving meaning.
|
||||
* Useful for reducing token costs in LLM applications and staying within token limits.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\TokenReductionConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
tokenReduction: new TokenReductionConfig(
|
||||
mode: 'moderate',
|
||||
preserveImportantWords: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
echo "Token Reduction Results:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Content length: " . strlen($result->content) . " characters\n\n";
|
||||
|
||||
if (isset($result->metadata['original_token_count'])) {
|
||||
$originalTokens = $result->metadata['original_token_count'];
|
||||
$reducedTokens = $result->metadata['token_count'] ?? strlen($result->content);
|
||||
$reductionRatio = $result->metadata['token_reduction_ratio'] ?? 0;
|
||||
|
||||
echo "Token Reduction Statistics:\n";
|
||||
echo str_repeat('-', 40) . "\n";
|
||||
echo " Original tokens: " . number_format($originalTokens) . "\n";
|
||||
echo " Reduced tokens: " . number_format($reducedTokens) . "\n";
|
||||
echo " Reduction: " . number_format($reductionRatio * 100, 1) . "%\n";
|
||||
echo " Tokens saved: " . number_format($originalTokens - $reducedTokens) . "\n\n";
|
||||
}
|
||||
|
||||
$modes = [
|
||||
'light' => 'Light reduction - minimal changes',
|
||||
'moderate' => 'Moderate reduction - balanced',
|
||||
'aggressive' => 'Aggressive reduction - maximum compression',
|
||||
];
|
||||
|
||||
echo "Token Reduction Mode Comparison:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$comparisonResults = [];
|
||||
|
||||
foreach ($modes as $mode => $description) {
|
||||
$modeConfig = new ExtractionConfig(
|
||||
tokenReduction: new TokenReductionConfig(
|
||||
mode: $mode,
|
||||
preserveImportantWords: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($modeConfig);
|
||||
$result = $kreuzberg->extractFile('sample.pdf');
|
||||
|
||||
$contentLength = strlen($result->content);
|
||||
$tokenCount = $result->metadata['token_count'] ?? $contentLength;
|
||||
|
||||
$comparisonResults[$mode] = [
|
||||
'length' => $contentLength,
|
||||
'tokens' => $tokenCount,
|
||||
'content' => substr($result->content, 0, 100),
|
||||
];
|
||||
|
||||
echo "$mode mode:\n";
|
||||
echo " Description: $description\n";
|
||||
echo " Content length: " . number_format($contentLength) . " characters\n";
|
||||
echo " Estimated tokens: " . number_format($tokenCount) . "\n";
|
||||
echo " Preview: " . substr($result->content, 0, 80) . "...\n\n";
|
||||
}
|
||||
|
||||
if (count($comparisonResults) > 1) {
|
||||
$lightLength = $comparisonResults['light']['length'] ?? 0;
|
||||
$aggressiveLength = $comparisonResults['aggressive']['length'] ?? 0;
|
||||
|
||||
if ($lightLength > 0) {
|
||||
$savings = (($lightLength - $aggressiveLength) / $lightLength) * 100;
|
||||
|
||||
echo "Comparison Summary:\n";
|
||||
echo str_repeat('-', 40) . "\n";
|
||||
echo "Aggressive vs Light mode saves: " . number_format($savings, 1) . "%\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
$advancedConfig = new ExtractionConfig(
|
||||
tokenReduction: new TokenReductionConfig(
|
||||
mode: 'moderate',
|
||||
preserveImportantWords: true,
|
||||
preserveMarkdown: true,
|
||||
preserveNumbers: true,
|
||||
removeStopWords: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($advancedConfig);
|
||||
$result = $kreuzberg->extractFile('verbose_document.pdf');
|
||||
|
||||
echo "Advanced Token Reduction:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Configuration:\n";
|
||||
echo " - Preserve important words: Yes\n";
|
||||
echo " - Preserve markdown: Yes\n";
|
||||
echo " - Preserve numbers: Yes\n";
|
||||
echo " - Remove stop words: Yes\n\n";
|
||||
|
||||
echo "Result:\n";
|
||||
echo " Content length: " . strlen($result->content) . " characters\n";
|
||||
|
||||
if (isset($result->metadata['token_reduction_ratio'])) {
|
||||
echo " Reduction ratio: " . number_format($result->metadata['token_reduction_ratio'] * 100, 1) . "%\n";
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
/**
 * Estimate the price of processing a number of tokens.
 *
 * @param int   $tokens          Token count to price.
 * @param float $pricePerMillion Price charged per one million tokens.
 *
 * @return float Estimated cost, in the same currency unit as $pricePerMillion.
 */
function estimateTokenCost(int $tokens, float $pricePerMillion = 0.50): float
{
    $millions = $tokens / 1_000_000;

    return $millions * $pricePerMillion;
}
|
||||
|
||||
// Print the estimated cost for every reduction mode collected in $comparisonResults.
echo "Cost Estimation (based on reduction):\n", str_repeat('=', 60), "\n";

foreach ($comparisonResults as $mode => $data) {
    $tokens = $data['tokens'];
    $cost = estimateTokenCost($tokens);

    printf(
        "%s mode:\n Tokens: %s\n Estimated cost: \$%s\n\n",
        ucfirst($mode),
        number_format($tokens),
        number_format($cost, 4)
    );
}
|
||||
|
||||
/**
 * Pick a token-reduction mode from how far a document overshoots a token limit.
 *
 * The overshoot ratio ($estimatedTokens / $maxTokens) maps to:
 *   <= 1.0 => 'none', <= 1.3 => 'light', <= 1.7 => 'moderate', otherwise 'aggressive'.
 *
 * @param int $maxTokens       Token budget; must be greater than zero.
 * @param int $estimatedTokens Estimated token count of the document.
 *
 * @return string One of 'none', 'light', 'moderate', 'aggressive'.
 *
 * @throws InvalidArgumentException If $maxTokens is not positive.
 */
function chooseReductionMode(int $maxTokens, int $estimatedTokens): string
{
    // A zero limit would divide by zero and a negative one would yield a
    // negative ratio that wrongly maps to 'none'; reject both explicitly.
    if ($maxTokens <= 0) {
        throw new InvalidArgumentException('maxTokens must be greater than zero.');
    }

    $ratio = $estimatedTokens / $maxTokens;

    return match (true) {
        $ratio <= 1.0 => 'none',
        $ratio <= 1.3 => 'light',
        $ratio <= 1.7 => 'moderate',
        default => 'aggressive',
    };
}
|
||||
|
||||
// Example figures: a 12,000-token document against an 8,000-token limit.
$maxTokenLimit = 8000;
$documentTokens = 12000;

$recommendedMode = chooseReductionMode($maxTokenLimit, $documentTokens);

// Explain the recommendation in terms of the overshoot, if any.
if ($documentTokens > $maxTokenLimit) {
    $reason = 'Document exceeds limit by ' . number_format($documentTokens - $maxTokenLimit) . ' tokens';
} else {
    $reason = 'Document within limits';
}

echo "Reduction Mode Recommendation:\n", str_repeat('=', 60), "\n";
printf("Document tokens: %s\n", number_format($documentTokens));
printf("Token limit: %s\n", number_format($maxTokenLimit));
echo 'Recommended mode: ' . $recommendedMode . "\n";
echo 'Reason: ' . $reason . "\n";
|
||||
```
|
||||
222
docs/snippets/php/utils/token_reduction_example.php
Normal file
222
docs/snippets/php/utils/token_reduction_example.php
Normal file
@@ -0,0 +1,222 @@
|
||||
```php title="token_reduction_example.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Token Reduction Example
|
||||
*
|
||||
* Practical example of using token reduction to fit documents within token limits.
|
||||
* Demonstrates tracking reduction statistics and optimizing for LLM usage.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\TokenReductionConfig;
|
||||
|
||||
// Extract a single document with 'moderate' reduction, keeping markdown intact.
$config = new ExtractionConfig(
    tokenReduction: new TokenReductionConfig(
        mode: 'moderate',
        preserveMarkdown: true
    )
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('verbose_document.pdf');

echo "Token Reduction Example:\n", str_repeat('=', 60), "\n", "Document: verbose_document.pdf\n\n";

// Statistics are shown only when the metadata reports the original token count.
if (isset($result->metadata['original_token_count'])) {
    $originalTokens = $result->metadata['original_token_count'];
    $reducedTokens = $result->metadata['token_count'];
    $reductionRatio = $result->metadata['token_reduction_ratio'];

    echo "Token Reduction Statistics:\n", str_repeat('-', 40), "\n";
    printf(" Before: %s tokens\n", number_format($originalTokens));
    printf(" After: %s tokens\n", number_format($reducedTokens));
    printf(" Reduction: %.1f%%\n", $reductionRatio * 100);
    printf(" Saved: %s tokens\n\n", number_format($originalTokens - $reducedTokens));

    // One bar glyph per full hundred tokens.
    $beforeBar = str_repeat('█', (int) ($originalTokens / 100));
    $afterBar = str_repeat('█', (int) ($reducedTokens / 100));

    echo "Visual comparison (each █ = ~100 tokens):\n";
    echo ' Before: ' . $beforeBar . "\n";
    echo ' After: ' . $afterBar . "\n\n";
}

echo "Content Analysis:\n", str_repeat('-', 40), "\n";
printf(" Content length: %d characters\n", strlen($result->content));
echo ' First 200 chars: ' . substr($result->content, 0, 200) . "...\n\n";
|
||||
|
||||
// Run the same reduction profile over several documents and total the token counts.
$documents = ['long_article.pdf', 'research_paper.pdf', 'technical_doc.pdf'];

echo "Batch Token Reduction:\n", str_repeat('=', 60), "\n";

$batchConfig = new ExtractionConfig(
    tokenReduction: new TokenReductionConfig(
        mode: 'moderate',
        preserveImportantWords: true,
        preserveMarkdown: true
    )
);

$kreuzberg = new Kreuzberg($batchConfig);
$totalOriginal = 0;
$totalReduced = 0;

foreach ($documents as $document) {
    // Missing files are reported and skipped.
    if (!file_exists($document)) {
        echo basename($document) . ": File not found\n\n";
        continue;
    }

    $result = $kreuzberg->extractFile($document);

    // Absent metadata entries count as zero.
    $originalTokens = $result->metadata['original_token_count'] ?? 0;
    $reducedTokens = $result->metadata['token_count'] ?? 0;
    $reductionRatio = $result->metadata['token_reduction_ratio'] ?? 0;

    $totalOriginal += $originalTokens;
    $totalReduced += $reducedTokens;

    echo basename($document) . ":\n";
    printf(" Original: %s tokens\n", number_format($originalTokens));
    printf(" Reduced: %s tokens\n", number_format($reducedTokens));
    printf(" Saved: %.1f%%\n\n", $reductionRatio * 100);
}

// Overall figures only make sense when at least one document reported tokens.
if ($totalOriginal > 0) {
    $overallReduction = (($totalOriginal - $totalReduced) / $totalOriginal) * 100;

    echo "Overall Statistics:\n", str_repeat('-', 40), "\n";
    printf(" Total original: %s tokens\n", number_format($totalOriginal));
    printf(" Total reduced: %s tokens\n", number_format($totalReduced));
    printf(" Overall saving: %.1f%%\n\n", $overallReduction);
}
|
||||
|
||||
/**
 * Find the mildest reduction mode that brings a document under a token limit.
 *
 * Tries 'light', 'moderate' and 'aggressive' in that order, extracting the file
 * once per mode, and returns as soon as the token count fits. If no mode fits,
 * the 'aggressive' attempt is returned with 'fits' => false. That last attempt
 * is reused rather than extracting the file a second time in 'aggressive' mode.
 *
 * @param string    $filePath  Path of the document to extract.
 * @param int       $maxTokens Token budget to fit within.
 * @param Kreuzberg $kreuzberg Unused; each attempt builds its own instance with a
 *                             mode-specific config. Kept for signature compatibility.
 *
 * @return array|null Array with keys 'mode' (string), 'tokens', 'result' and
 *                    'fits' (bool). Never null in practice; the nullable return
 *                    type is kept for compatibility.
 */
function fitWithinTokenLimit(
    string $filePath,
    int $maxTokens,
    Kreuzberg $kreuzberg
): ?array {
    $attempt = null;

    foreach (['light', 'moderate', 'aggressive'] as $mode) {
        $config = new ExtractionConfig(
            tokenReduction: new TokenReductionConfig(
                mode: $mode,
                preserveImportantWords: true
            )
        );

        $result = (new Kreuzberg($config))->extractFile($filePath);

        // NOTE(review): the fallback is the content's byte length, not a real
        // token count; it only applies when the metadata has no 'token_count'.
        $tokens = $result->metadata['token_count'] ?? strlen($result->content);

        $attempt = [
            'mode' => $mode,
            'tokens' => $tokens,
            'result' => $result,
            'fits' => $tokens <= $maxTokens,
        ];

        if ($attempt['fits']) {
            return $attempt;
        }
    }

    // Nothing fit: $attempt holds the 'aggressive' run with 'fits' => false.
    return $attempt;
}
|
||||
|
||||
// Try to fit a large document into an 8,000-token budget and report the outcome.
echo "Fitting Document to Token Limit:\n", str_repeat('=', 60), "\n";

$tokenLimit = 8000;
$testFile = 'large_document.pdf';

if (file_exists($testFile)) {
    $fitResult = fitWithinTokenLimit($testFile, $tokenLimit, $kreuzberg);

    printf("Target limit: %s tokens\n", number_format($tokenLimit));
    echo 'Reduction mode used: ' . $fitResult['mode'] . "\n";
    printf("Final token count: %s\n", number_format($fitResult['tokens']));

    if (!$fitResult['fits']) {
        // Even the last attempted mode was over budget.
        echo "Status: ✗ Still exceeds limit\n";
        $excess = $fitResult['tokens'] - $tokenLimit;
        printf("Tokens over limit: %s\n", number_format($excess));
        echo "Suggestion: Consider chunking the document\n";
    } else {
        echo "Status: ✓ Successfully fits within limit\n";
        $remaining = $tokenLimit - $fitResult['tokens'];
        printf("Tokens remaining: %s\n", number_format($remaining));
    }

    echo "\n";
}
|
||||
|
||||
/**
 * Compute the cost difference between the original and the reduced token counts.
 *
 * @param int   $originalTokens  Token count before reduction.
 * @param int   $reducedTokens   Token count after reduction.
 * @param float $pricePerMillion Price charged per one million tokens.
 *
 * @return array{original_cost: float, reduced_cost: float, savings: float, savings_percent: float}
 *               Costs are in the currency unit of $pricePerMillion; 'savings_percent'
 *               is 0.0 when the original cost is not positive.
 */
function calculateCostSavings(
    int $originalTokens,
    int $reducedTokens,
    float $pricePerMillion = 0.50
): array {
    $originalCost = ($originalTokens / 1_000_000) * $pricePerMillion;
    $reducedCost = ($reducedTokens / 1_000_000) * $pricePerMillion;
    $savings = $originalCost - $reducedCost;

    // Divide by the real original cost. Clamping the divisor to a minimum of
    // 0.000001 skews the percentage for small costs (1 token -> 0 tokens would
    // report 50% instead of 100%); only the zero-cost case needs a guard.
    $savingsPercent = $originalCost > 0
        ? ($savings / $originalCost) * 100
        : 0.0;

    return [
        'original_cost' => $originalCost,
        'reduced_cost' => $reducedCost,
        'savings' => $savings,
        'savings_percent' => $savingsPercent,
    ];
}
|
||||
|
||||
// Turn the batch totals into a cost report and a yearly projection.
if ($totalOriginal > 0 && $totalReduced > 0) {
    $savings = calculateCostSavings($totalOriginal, $totalReduced);

    echo "Cost Analysis:\n", str_repeat('=', 60), "\n";
    echo 'Price: $0.50 per million tokens (example)' . "\n\n";
    printf(" Original cost: \$%.6f\n", $savings['original_cost']);
    printf(" Reduced cost: \$%.6f\n", $savings['reduced_cost']);
    printf(" Savings: \$%.6f (%.1f%%)\n\n", $savings['savings'], $savings['savings_percent']);

    // Projection: the batch saving multiplied by 100 documents a day for 365 days.
    $documentsPerDay = 100;
    $daysPerYear = 365;
    $annualSavings = $savings['savings'] * $documentsPerDay * $daysPerYear;

    echo "Projected Annual Savings:\n";
    echo ' Documents per day: ' . $documentsPerDay . "\n";
    printf(" Annual savings: \$%s\n", number_format($annualSavings, 2));
}
|
||||
```
|
||||
284
docs/snippets/php/utils/vector_database_integration.php
Normal file
284
docs/snippets/php/utils/vector_database_integration.php
Normal file
@@ -0,0 +1,284 @@
|
||||
```php title="vector_database_integration.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Vector Database Integration
|
||||
*
|
||||
* Extract documents with chunking and embeddings for vector database storage.
|
||||
* Demonstrates preparing data for semantic search and RAG applications.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
|
||||
// Chunk the document into pieces of up to 512 characters with a 50-character
// overlap, requesting normalized embeddings from the 'balanced' model.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChars: 512,
        maxOverlap: 50,
        embedding: new EmbeddingConfig(
            model: 'balanced',
            normalize: true
        )
    )
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

echo "Vector Database Integration:\n", str_repeat('=', 60), "\n", "Document: document.pdf\n";
printf("Total chunks: %d\n\n", count($result->chunks ?? []));
|
||||
|
||||
// Turn every chunk that carries an embedding into a record for a vector store.
$vectorRecords = [];

foreach ($result->chunks ?? [] as $index => $chunk) {
    // Chunks without an embedding are left out.
    if ($chunk->embedding !== null) {
        // ID = hash of the source file name plus the chunk's position.
        $chunkId = sprintf('doc_%s_chunk_%d', md5('document.pdf'), $index);

        $vectorRecords[] = [
            'id' => $chunkId,
            'content' => $chunk->content,
            'embedding' => $chunk->embedding,
            'metadata' => [
                'source_file' => 'document.pdf',
                'chunk_index' => $index,
                'chunk_length' => strlen($chunk->content),
                'embedding_model' => 'balanced',
                'created_at' => date('c'),
            ],
        ];
    }
}

printf("Prepared %d records for vector database\n\n", count($vectorRecords));
|
||||
|
||||
// Show the shape of the first prepared record, if there is one.
if (!empty($vectorRecords)) {
    $sample = $vectorRecords[0];

    echo "Sample Vector Record Structure:\n", str_repeat('-', 40), "\n";
    echo 'ID: ' . $sample['id'] . "\n";
    echo 'Content preview: ' . substr($sample['content'], 0, 100) . "...\n";
    printf("Embedding dimensions: %d\n", count($sample['embedding']));
    printf("Metadata keys: %s\n\n", implode(', ', array_keys($sample['metadata'])));
}
|
||||
|
||||
/**
 * Print a Pinecone-style upsert of the records in batches of 100.
 *
 * Nothing is sent anywhere: the function only echoes one line per batch and a
 * completion line.
 *
 * @param array  $records   Vector records to report on.
 * @param string $namespace Namespace name shown in the output.
 */
function insertIntoPinecone(array $records, string $namespace = 'default'): void
{
    echo "Inserting into Pinecone:\n", str_repeat('-', 40), "\n";

    foreach (array_chunk($records, 100) as $position => $group) {
        printf(
            "Batch %d: Upserting %d vectors to namespace '%s'...\n",
            $position + 1,
            count($group),
            $namespace
        );
    }

    printf("Completed inserting %d vectors\n\n", count($records));
}
|
||||
|
||||
/**
 * Print a Weaviate-style insert of the records, one object per record.
 *
 * Each record is mapped to an object array (class, properties, vector) that is
 * built but not sent anywhere; a progress line is echoed after every tenth record.
 *
 * @param array  $records   Records with 'content', 'embedding' and 'metadata'
 *                          ('source_file', 'chunk_index', 'created_at') entries.
 * @param string $className Class name placed on each object.
 */
function insertIntoWeaviate(array $records, string $className = 'Document'): void
{
    echo "Inserting into Weaviate:\n", str_repeat('-', 40), "\n";

    $total = count($records);

    foreach ($records as $index => $record) {
        $meta = $record['metadata'];

        // Object shape for the record; constructed only, never transmitted here.
        $object = [
            'class' => $className,
            'properties' => [
                'content' => $record['content'],
                'sourceFile' => $meta['source_file'],
                'chunkIndex' => $meta['chunk_index'],
                'createdAt' => $meta['created_at'],
            ],
            'vector' => $record['embedding'],
        ];

        $done = $index + 1;
        if ($done % 10 === 0) {
            printf("Inserted %d/%d objects\n", $done, $total);
        }
    }

    printf("Completed inserting %d objects\n\n", $total);
}
|
||||
|
||||
/**
 * Print a Qdrant-style upsert of the records as points.
 *
 * Every record is mapped to a point (id, vector, payload); the points are only
 * counted for the output and not sent anywhere.
 *
 * @param array  $records        Records with 'id', 'embedding', 'content' and 'metadata' entries.
 * @param string $collectionName Collection name shown in the output.
 */
function insertIntoQdrant(
    array $records,
    string $collectionName = 'documents'
): void {
    echo "Inserting into Qdrant:\n", str_repeat('-', 40), "\n";

    $points = array_map(
        static fn (array $record): array => [
            'id' => $record['id'],
            'vector' => $record['embedding'],
            'payload' => [
                'content' => $record['content'],
                'metadata' => $record['metadata'],
            ],
        ],
        $records
    );

    printf("Upserting %d points to collection '%s'...\n", count($points), $collectionName);

    echo "Completed\n\n";
}
|
||||
|
||||
// Run the three vector-store examples on the records prepared above.
echo "Vector Database Integration Examples:\n", str_repeat('=', 60), "\n\n";

insertIntoPinecone($vectorRecords, namespace: 'documents');
insertIntoWeaviate($vectorRecords, className: 'DocumentChunk');
insertIntoQdrant($vectorRecords, collectionName: 'document_chunks');
|
||||
|
||||
// Build vector records for several documents with one shared chunking/embedding config.
$documents = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];

echo "Batch Processing for Vector Database:\n", str_repeat('=', 60), "\n";

$allVectorRecords = [];

$vectorConfig = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChars: 512,
        maxOverlap: 50,
        embedding: new EmbeddingConfig(
            model: 'balanced',
            normalize: true
        )
    )
);

$kreuzberg = new Kreuzberg($vectorConfig);

foreach ($documents as $document) {
    // Missing files are reported and skipped.
    if (!file_exists($document)) {
        echo basename($document) . ": File not found\n";
        continue;
    }

    $result = $kreuzberg->extractFile($document);

    echo basename($document) . ":\n";
    printf(" Chunks: %d\n", count($result->chunks ?? []));

    foreach ($result->chunks ?? [] as $index => $chunk) {
        // Only chunks that carry an embedding become records.
        if ($chunk->embedding !== null) {
            // ID = hash of the document path plus the chunk's position.
            $chunkId = sprintf('doc_%s_chunk_%d', md5($document), $index);

            $allVectorRecords[] = [
                'id' => $chunkId,
                'content' => $chunk->content,
                'embedding' => $chunk->embedding,
                'metadata' => [
                    'source_file' => basename($document),
                    'chunk_index' => $index,
                    'chunk_length' => strlen($chunk->content),
                    'embedding_model' => 'balanced',
                    'created_at' => date('c'),
                ],
            ];
        }
    }
}

printf("\nTotal records prepared: %d\n\n", count($allVectorRecords));
|
||||
|
||||
/**
 * Print a mock semantic search over the records.
 *
 * No similarity is computed: the first $topK records are taken as the hits and
 * given placeholder scores that start at 0.9 and drop by 0.05 per rank. The
 * query is only echoed.
 *
 * @param string $query   Query text shown in the output.
 * @param array  $records Records with 'content' and 'metadata' ('source_file') entries.
 * @param int    $topK    Maximum number of records to return.
 *
 * @return array The first $topK records.
 */
function simulateSemanticSearch(string $query, array $records, int $topK = 5): array
{
    echo "Simulating semantic search:\n";
    echo ' Query: "' . $query . "\"\n";
    printf(" Searching %d vectors...\n", count($records));
    echo ' Top ' . $topK . " results:\n\n";

    $hits = array_slice($records, 0, $topK);

    foreach ($hits as $rank => $hit) {
        $preview = substr($hit['content'], 0, 60) . '...';
        $score = 0.9 - ($rank * 0.05);

        printf(" %d. %s (score: %.3f)\n", $rank + 1, $preview, $score);
        printf(" Source: %s\n", $hit['metadata']['source_file']);
        echo "\n";
    }

    return $hits;
}
|
||||
|
||||
// Run the mock search only when the batch step produced records.
if (!empty($allVectorRecords)) {
    echo "Semantic Search Example:\n", str_repeat('=', 60), "\n";

    simulateSemanticSearch('How to configure document extraction?', $allVectorRecords, topK: 3);
}
|
||||
|
||||
/**
 * Write vector records to a pretty-printed JSON file and echo a confirmation.
 *
 * The file holds an envelope with 'version', 'count', 'generated_at' (ISO 8601)
 * and the 'records' themselves. Failures throw, so an empty or partial file is
 * never reported as a successful export.
 *
 * @param array  $records  Records to export.
 * @param string $filename Destination path.
 *
 * @throws JsonException    If the records cannot be encoded (e.g. NAN/INF floats
 *                          in an embedding, or invalid UTF-8 in the content).
 * @throws RuntimeException If the file cannot be written.
 */
function exportVectorRecordsToJson(array $records, string $filename): void
{
    $data = [
        'version' => '1.0',
        'count' => count($records),
        'generated_at' => date('c'),
        'records' => $records,
    ];

    // JSON_THROW_ON_ERROR: without it json_encode() returns false on failure
    // and an empty file would be written.
    $json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_THROW_ON_ERROR);

    if (file_put_contents($filename, $json) === false) {
        throw new RuntimeException("Failed to write vector records to: $filename");
    }

    echo "Exported " . count($records) . " vector records to: $filename\n";
}
|
||||
|
||||
// Persist the batch records to disk, but only when there is something to write.
if (!empty($allVectorRecords)) {
    exportVectorRecordsToJson(filename: 'vector_records.json', records: $allVectorRecords);
}
|
||||
```
|
||||
Reference in New Issue
Block a user