This commit is contained in:
41
docs/snippets/php/advanced/chunk_page_mapping.md
Normal file
41
docs/snippets/php/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\PageConfig;
|
||||
|
||||
// Chunk the document and keep page extraction enabled so every chunk can be
// mapped back to the page range it originated from.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50
    ),
    pages: new PageConfig(
        extractPages: true
    )
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Fetch the chunk list once. A null/empty list means there is nothing to print.
$chunks = $result->getChunks() ?: [];

foreach ($chunks as $chunk) {
    $metadata = $chunk->getMetadata();
    if (!$metadata) {
        continue;
    }

    $firstPage = $metadata->getFirstPage();
    $lastPage = $metadata->getLastPage();
    if ($firstPage === null || $lastPage === null) {
        // No page information recorded for this chunk.
        continue;
    }

    $pageRange = $firstPage === $lastPage
        ? "Page " . $firstPage
        : "Pages " . $firstPage . "-" . $lastPage;

    // mb_substr() cuts on character boundaries. substr() counts bytes and can
    // split a multi-byte UTF-8 character, producing invalid output.
    echo "Chunk: " . mb_substr($chunk->getContent(), 0, 50) . "... (" . $pageRange . ")\n";
}
|
||||
?>
|
||||
```
|
||||
79
docs/snippets/php/advanced/chunking_config.md
Normal file
79
docs/snippets/php/advanced/chunking_config.md
Normal file
@@ -0,0 +1,79 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
|
||||
// Basic chunking: split the extracted text into ~1000-character chunks that
// overlap by 200 characters.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 1000,
        overlap: 200
    )
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// NOTE(review): sibling snippets guard getChunks() with a truthiness check,
// which suggests it can be null — confirm. count(null) throws a TypeError on
// PHP 8, so normalise to an empty list first.
$chunks = $result->getChunks() ?? [];

echo "Number of chunks: " . count($chunks) . "\n";
foreach ($chunks as $chunk) {
    // mb_strlen() counts characters; strlen() would report bytes under a
    // label that says "characters".
    echo "Chunk size: " . mb_strlen($chunk->getContent()) . " characters\n";
}
|
||||
?>
|
||||
```
|
||||
|
||||
```php title="PHP - Semantic Chunking"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
|
||||
// Semantic chunking: chunk boundaries follow topic changes, controlled by
// topicThreshold, rather than a fixed character window alone.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
        chunkerType: 'semantic',
        topicThreshold: 0.75
    )
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// NOTE(review): other snippets treat getChunks() as possibly null — confirm.
// count(null) throws a TypeError on PHP 8, so fall back to an empty list.
$chunks = $result->getChunks() ?? [];

echo "Chunks with topic-based boundaries: " . count($chunks) . "\n";
|
||||
?>
|
||||
```
|
||||
|
||||
```php title="PHP - Prepend Heading Context"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
|
||||
// Markdown chunking with the heading hierarchy prepended to each chunk.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
        chunkerType: 'markdown',
        prependHeadingContext: true
    )
);

$result = Kreuzberg::extractFileSync('document.md', null, $config);

// NOTE(review): other snippets treat getChunks() as possibly null — confirm.
// Iterating null raises a warning, so normalise to an empty list.
$chunks = $result->getChunks() ?? [];

foreach ($chunks as $chunk) {
    $metadata = $chunk->getMetadata();

    // Resolve the heading context once instead of calling the getter twice.
    $headingContext = $metadata ? $metadata->getHeadingContext() : null;
    if ($headingContext) {
        foreach ($headingContext->getHeadings() as $heading) {
            echo "Heading L" . $heading->getLevel() . ": " . $heading->getText() . "\n";
        }
    }

    // mb_substr() keeps multi-byte characters intact; substr() cuts by byte.
    echo "Content: " . mb_substr($chunk->getContent(), 0, 100) . "...\n";
}
|
||||
?>
|
||||
```
|
||||
39
docs/snippets/php/advanced/chunking_rag.md
Normal file
39
docs/snippets/php/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\EmbeddingConfig;
|
||||
|
||||
// RAG preparation: chunk the document and compute a normalised embedding for
// every chunk in batches of 32.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
        embedding: new EmbeddingConfig(
            normalize: true,
            batchSize: 32
        )
    )
);

$result = Kreuzberg::extractFileSync('research_paper.pdf', null, $config);

// A null/empty chunk list means there is nothing to report.
$chunks = $result->getChunks() ?: [];

foreach ($chunks as $chunk) {
    $metadata = $chunk->getMetadata();

    if ($metadata) {
        // getChunkIndex() is shifted by one for display.
        echo "Chunk " . ($metadata->getChunkIndex() + 1) . "/" . $metadata->getTotalChunks() . "\n";
        echo "Position: " . $metadata->getByteStart() . "-" . $metadata->getByteEnd() . "\n";

        // mb_substr() truncates by character; substr() could split a
        // multi-byte UTF-8 sequence and print a broken character.
        echo "Content: " . mb_substr($chunk->getContent(), 0, 100) . "...\n";

        $embedding = $chunk->getEmbedding();
        if ($embedding) {
            echo "Embedding: " . count($embedding) . " dimensions\n";
        }
    }

    // Blank separator line after every chunk, with or without metadata.
    echo "\n";
}
|
||||
?>
|
||||
```
|
||||
39
docs/snippets/php/advanced/embedding_with_chunking.md
Normal file
39
docs/snippets/php/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\EmbeddingConfig;
|
||||
|
||||
// Chunk the document and embed every chunk; download progress output for the
// embedding model is disabled.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 1024,
        overlap: 100,
        embedding: new EmbeddingConfig(
            normalize: true,
            batchSize: 32,
            showDownloadProgress: false
        )
    )
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// A null/empty chunk list means there is nothing to print.
$chunks = $result->getChunks() ?: [];

foreach ($chunks as $chunk) {
    // mb_substr() truncates on character boundaries; substr() works on bytes
    // and may cut a multi-byte character in half.
    echo "Chunk content: " . mb_substr($chunk->getContent(), 0, 100) . "...\n";

    $embedding = $chunk->getEmbedding();
    if ($embedding) {
        echo "Embedding dimension: " . count($embedding) . "\n";
        echo "First 5 values: " . implode(", ", array_slice($embedding, 0, 5)) . "\n";
    }

    echo "\n";
}
|
||||
?>
|
||||
```
|
||||
313
docs/snippets/php/advanced/error_handling.php
Normal file
313
docs/snippets/php/advanced/error_handling.php
Normal file
@@ -0,0 +1,313 @@
|
||||
```php title="error_handling.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Error Handling
|
||||
*
|
||||
* Robust error handling for document extraction operations.
|
||||
* Handle failures gracefully and implement retry strategies.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Exceptions\KreuzbergException;
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
// Simplest pattern: wrap the call and report any library exception.
try {
    $result = extract_file('document.pdf');
    echo "Extraction successful!\n", "Content length: " . strlen($result->content) . "\n";
} catch (KreuzbergException $error) {
    $reason = $error->getMessage();

    // Show the failure to the user and record it in the PHP error log.
    echo "Error: " . $reason . "\n";
    echo "Code: " . $error->getCode() . "\n";
    error_log("Kreuzberg extraction failed: " . $reason);
}
|
||||
|
||||
/**
 * Extract a file's text content without ever throwing.
 *
 * Missing files, unreadable files and extraction errors are written to the
 * PHP error log and reported to the caller as null.
 *
 * @param string $filePath Path of the document to extract.
 * @return string|null Extracted content, or null on any failure.
 */
function safeExtract(string $filePath): ?string
{
    // Pre-flight checks run in order; the first failing one wins.
    $problem = match (true) {
        !file_exists($filePath) => "File not found: $filePath",
        !is_readable($filePath) => "File not readable: $filePath",
        default => null,
    };

    if ($problem !== null) {
        error_log($problem);
        return null;
    }

    try {
        return extract_file($filePath)->content;
    } catch (KreuzbergException $failure) {
        error_log("Extraction error for $filePath: " . $failure->getMessage());
    }

    return null;
}
|
||||
|
||||
// A null return means the failure has already been logged by safeExtract().
$content = safeExtract('document.pdf');
echo $content !== null
    ? "Successfully extracted document\n"
    : "Failed to extract document\n";
|
||||
|
||||
/**
 * Extract a file, retrying with exponential backoff when extraction throws.
 *
 * @param string $filePath     Path of the document to extract.
 * @param int    $maxRetries   Total number of attempts; values <= 0 make no attempt.
 * @param int    $initialDelay Delay before the second attempt, in milliseconds.
 *                             It doubles after every failed attempt.
 * @return string|null Extracted content, or null once all attempts have failed.
 */
function extractWithRetry(
    string $filePath,
    int $maxRetries = 3,
    int $initialDelay = 1000
): ?string {
    $delay = $initialDelay;

    for ($attempt = 1; $attempt <= $maxRetries; $attempt++) {
        try {
            return extract_file($filePath)->content;
        } catch (KreuzbergException $e) {
            // Last attempt: give up and log the final error.
            if ($attempt === $maxRetries) {
                error_log("Max retries exceeded for $filePath: " . $e->getMessage());
                return null;
            }

            echo "Attempt $attempt failed, retrying in {$delay}ms...\n";
            usleep($delay * 1000); // usleep() takes microseconds
            $delay *= 2;
        }
    }

    return null;
}
|
||||
|
||||
// Retry with the default budget: 3 attempts, starting at a 1000 ms delay.
$content = extractWithRetry('potentially_corrupt.pdf');
if (is_string($content)) {
    echo "Document extracted after retry\n";
}
|
||||
|
||||
/**
 * Extract a file and sanity-check the result.
 *
 * The result is rejected when extraction throws, when the content is empty,
 * when it is shorter than 100 bytes, or when more than half of its
 * characters are non-printable. Every rejection is written to the error log.
 *
 * Non-printable detection is Unicode-aware: for valid UTF-8 content only
 * control/format/unassigned/private-use code points count, so legitimate
 * non-English text is not mistaken for garbage. Content that is not valid
 * UTF-8 falls back to a byte-wise printable-ASCII check.
 *
 * @param string $filePath Path of the document to validate.
 * @return bool True when the extraction looks usable.
 */
function validateExtractionResult(string $filePath): bool
{
    $minExpectedChars = 100;
    $maxNonPrintableRatio = 0.5;

    try {
        $content = extract_file($filePath)->content;
    } catch (KreuzbergException $e) {
        error_log("Validation failed for $filePath: " . $e->getMessage());
        return false;
    }

    // Strict comparison: empty() would also reject the valid content "0".
    if ($content === '') {
        error_log("Empty content extracted from $filePath");
        return false;
    }

    $length = strlen($content);
    if ($length < $minExpectedChars) {
        error_log("Content too short from $filePath: " . $length . " chars");
        return false;
    }

    if (preg_match('//u', $content) === 1) {
        // Valid UTF-8: count code points, and treat only category C characters
        // other than newline, carriage return and tab as non-printable.
        $total = preg_match_all('/./su', $content);
        $nonPrintable = preg_match_all('/[^\P{C}\n\r\t]/u', $content);
    } else {
        // Not UTF-8: fall back to bytes outside printable ASCII and common whitespace.
        $total = $length;
        $nonPrintable = preg_match_all('/[^\x20-\x7E\x09\x0A\x0D]/', $content);
    }

    // preg_match_all() returns false on a PCRE error; never divide by that.
    if ($total === false || $nonPrintable === false || $total === 0) {
        error_log("Could not analyse content of $filePath");
        return false;
    }

    if ($nonPrintable / $total > $maxNonPrintableRatio) {
        error_log("High non-printable character ratio in $filePath");
        return false;
    }

    return true;
}
|
||||
|
||||
// Report the outcome of the sanity checks for a single document.
$isValid = validateExtractionResult('document.pdf');
echo $isValid
    ? "Extraction result validated successfully\n"
    : "Extraction result validation failed\n";
|
||||
|
||||
// Batch pattern: keep going after individual failures and report at the end.

// glob() returns false on error, and on some systems also when nothing
// matches. Normalise to an empty list so the loop never iterates false.
$files = glob('documents/*.pdf') ?: [];
$successful = [];
$failed = [];

foreach ($files as $file) {
    try {
        $result = extract_file($file);
        $successful[] = [
            'file' => $file,
            'content_length' => strlen($result->content),
            'tables' => count($result->tables),
        ];
    } catch (KreuzbergException $e) {
        // Record the failure and continue with the next file.
        $failed[] = [
            'file' => $file,
            'error' => $e->getMessage(),
            'code' => $e->getCode(),
        ];
    }
}

echo "\nBatch Processing Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Successful: " . count($successful) . "\n";
echo "Failed: " . count($failed) . "\n\n";

if ($failed !== []) {
    echo "Failed files:\n";
    foreach ($failed as $failure) {
        echo " - {$failure['file']}: {$failure['error']}\n";
    }
}
|
||||
|
||||
/**
 * Extract a file using progressively cruder strategies.
 *
 * Order of attempts:
 *   1. default extraction,
 *   2. extraction with Tesseract OCR (English),
 *   3. reading the raw file bytes as plain text.
 *
 * The first strategy that yields non-empty content wins.
 *
 * @param string $filePath Path of the document to extract.
 * @return string|null Content from the first successful strategy, or null
 *                     when every strategy failed or produced nothing.
 */
function extractWithFallback(string $filePath): ?string
{
    // Strategy 1: default extraction.
    try {
        $result = extract_file($filePath);
        // Strict comparison: empty() would discard the valid content "0".
        if ($result->content !== '') {
            return $result->content;
        }
    } catch (KreuzbergException $e) {
        // Keep the reason in the log instead of swallowing it.
        error_log("Normal extraction failed for $filePath: " . $e->getMessage());
        echo "Normal extraction failed, trying fallback strategies...\n";
    }

    // Strategy 2: force OCR.
    try {
        $config = new ExtractionConfig(
            ocr: new \Kreuzberg\Config\OcrConfig(
                backend: 'tesseract',
                language: 'eng'
            )
        );
        $kreuzberg = new Kreuzberg($config);
        $result = $kreuzberg->extractFile($filePath);
        if ($result->content !== '') {
            echo "Fallback: OCR extraction succeeded\n";
            return $result->content;
        }
    } catch (KreuzbergException $e) {
        echo "OCR fallback failed: " . $e->getMessage() . "\n";
    }

    // Strategy 3: raw bytes. file_get_contents() signals failure by returning
    // false and raising a warning, not by throwing, so a try/catch around it
    // can never fire. Check readability and the return value explicitly.
    if (!is_file($filePath) || !is_readable($filePath)) {
        echo "Plain text fallback failed: file is not readable\n";
        return null;
    }

    $content = file_get_contents($filePath);
    if ($content === false) {
        echo "Plain text fallback failed: could not read file\n";
        return null;
    }

    if ($content !== '') {
        echo "Fallback: Reading as plain text\n";
        return $content;
    }

    return null;
}
|
||||
|
||||
// Try every strategy in turn; null means all of them failed.
$content = extractWithFallback('problematic_file.pdf');
if (is_string($content)) {
    echo "Successfully extracted with fallback\n";
}
|
||||
|
||||
/**
 * Extract a file under a temporary PHP execution-time limit.
 *
 * The limit is applied with set_time_limit(). If PHP itself enforces it, the
 * script ends with a fatal error rather than an exception, so the null return
 * for "too slow" only covers extractions that finished but took longer than
 * the budget in wall-clock time.
 *
 * @param string $filePath       Path of the document to extract.
 * @param int    $timeoutSeconds Time budget in seconds.
 * @return string|null Extracted content, or null on error or overrun.
 */
function extractWithTimeout(string $filePath, int $timeoutSeconds = 30): ?string
{
    // Remember the configured limit so it can be restored afterwards.
    // Resetting to 0 would remove the limit for the rest of the script.
    $previousLimit = (int) ini_get('max_execution_time');

    // hrtime() is monotonic and nanosecond-resolution; time() only has whole
    // seconds and follows wall-clock adjustments.
    $startedAt = hrtime(true);

    try {
        set_time_limit($timeoutSeconds);

        $result = extract_file($filePath);
        $elapsedSeconds = (hrtime(true) - $startedAt) / 1e9;

        if ($elapsedSeconds > $timeoutSeconds) {
            error_log("Extraction exceeded timeout for $filePath");
            return null;
        }

        return $result->content;
    } catch (KreuzbergException $e) {
        error_log("Extraction error: " . $e->getMessage());
        return null;
    } finally {
        // set_time_limit() restarts the counter, so the script gets a fresh
        // budget equal to its originally configured limit.
        set_time_limit($previousLimit);
    }
}
|
||||
|
||||
/**
 * Application-level exception that carries the failing file's path and,
 * when known, its MIME type.
 */
class DocumentExtractionException extends \Exception
{
    /** Path of the document whose extraction failed. */
    public readonly string $filePath;

    /** MIME type of the document, or null when it was not supplied. */
    public readonly ?string $mimeType;

    /**
     * @param string          $message  Human-readable failure description.
     * @param string          $filePath Path of the failing document.
     * @param string|null     $mimeType MIME type, if known.
     * @param \Throwable|null $previous Underlying cause, if any.
     */
    public function __construct(
        string $message,
        string $filePath,
        ?string $mimeType = null,
        ?\Throwable $previous = null
    ) {
        // The exception code is always 0.
        parent::__construct($message, previous: $previous);

        $this->filePath = $filePath;
        $this->mimeType = $mimeType;
    }
}
|
||||
|
||||
/**
 * Extract a file's content or throw a DocumentExtractionException.
 *
 * Library failures are wrapped, with the original exception kept as
 * `previous`. An empty extraction result is also reported as an exception,
 * carrying the detected MIME type.
 *
 * @param string $filePath Path of the document to extract.
 * @return string The non-empty extracted content.
 * @throws DocumentExtractionException When extraction fails or yields nothing.
 */
function extractOrThrow(string $filePath): string
{
    try {
        $result = extract_file($filePath);
    } catch (KreuzbergException $e) {
        throw new DocumentExtractionException(
            "Extraction failed: " . $e->getMessage(),
            $filePath,
            previous: $e
        );
    }

    // Strict comparison: empty() would also reject a document whose whole
    // content is the string "0".
    if ($result->content === '') {
        throw new DocumentExtractionException(
            "No content extracted",
            $filePath,
            $result->mimeType
        );
    }

    return $result->content;
}
|
||||
|
||||
// Consume the throwing variant and surface the extra context carried by the
// custom exception.
try {
    $content = extractOrThrow('document.pdf');

    // mb_substr() truncates by character, so the preview never ends in half
    // of a multi-byte UTF-8 sequence as it could with substr().
    echo "Content: " . mb_substr($content, 0, 100) . "...\n";
} catch (DocumentExtractionException $e) {
    echo "Failed to extract {$e->filePath}\n";
    echo "Reason: {$e->getMessage()}\n";
    if ($e->mimeType) {
        echo "MIME type: {$e->mimeType}\n";
    }
}
|
||||
|
||||
/**
 * Decorator around Kreuzberg that logs the start, outcome and duration of
 * every file extraction through a PSR-3 logger.
 */
class LoggingKreuzberg
{
    public function __construct(
        private Kreuzberg $kreuzberg,
        private \Psr\Log\LoggerInterface $logger
    ) {}

    /**
     * Extract a file and log what happened.
     *
     * @param string      $filePath Path of the document to extract.
     * @param string|null $mimeType Optional MIME type passed through to the extractor.
     * @return \Kreuzberg\Types\ExtractionResult|null The result, or null when
     *         the extractor threw a KreuzbergException (logged at error level).
     */
    public function extractFile(string $filePath, ?string $mimeType = null): ?\Kreuzberg\Types\ExtractionResult
    {
        $this->logger->info("Starting extraction", ['file' => $filePath]);
        $begunAt = microtime(true);

        try {
            $extraction = $this->kreuzberg->extractFile($filePath, $mimeType);

            $successContext = [
                'file' => $filePath,
                'duration' => microtime(true) - $begunAt,
                'content_length' => strlen($extraction->content),
                'tables' => count($extraction->tables),
            ];
            $this->logger->info("Extraction successful", $successContext);

            return $extraction;
        } catch (KreuzbergException $failure) {
            $failureContext = [
                'file' => $filePath,
                'duration' => microtime(true) - $begunAt,
                'error' => $failure->getMessage(),
                'code' => $failure->getCode(),
            ];
            $this->logger->error("Extraction failed", $failureContext);

            return null;
        }
    }
}
|
||||
```
|
||||
26
docs/snippets/php/advanced/keyword_extraction_config.md
Normal file
26
docs/snippets/php/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\KeywordConfig;
|
||||
|
||||
// Keyword extraction settings: YAKE algorithm, at most 10 keywords, minimum
// score 0.1, English text.
$keywordSettings = new KeywordConfig(
    algorithm: 'yake',
    maxKeywords: 10,
    minScore: 0.1,
    language: 'en'
);
$config = new ExtractionConfig(keywords: $keywordSettings);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Print one keyword per line; nothing is printed when none were extracted.
if ($result->getKeywords()) {
    foreach ($result->getKeywords() as $keyword) {
        echo $keyword, "\n";
    }
}
|
||||
?>
|
||||
```
|
||||
29
docs/snippets/php/advanced/keyword_extraction_example.md
Normal file
29
docs/snippets/php/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\KeywordConfig;
|
||||
|
||||
// Same keyword setup as the config snippet, with a stricter 0.3 score floor.
$keywordSettings = new KeywordConfig(
    algorithm: 'yake',
    maxKeywords: 10,
    minScore: 0.3,
    language: 'en'
);
$config = new ExtractionConfig(keywords: $keywordSettings);

$result = Kreuzberg::extractFileSync('research_paper.pdf', null, $config);

if (!$result->getKeywords()) {
    echo "No keywords extracted.\n";
} else {
    echo "Extracted Keywords:\n";
    // Number the list starting at 1.
    foreach ($result->getKeywords() as $index => $keyword) {
        echo ($index + 1) . ". " . $keyword . "\n";
    }
}
|
||||
?>
|
||||
```
|
||||
22
docs/snippets/php/advanced/language_detection_config.md
Normal file
22
docs/snippets/php/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\LanguageDetectionConfig;
|
||||
|
||||
// Detect a single language and accept it only at a confidence of 0.8 or more.
$detectionSettings = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false
);
$config = new ExtractionConfig(languageDetection: $detectionSettings);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

echo "Detected language: " . $result->getLanguage() . "\n",
    "Confidence: " . $result->getLanguageConfidence() . "\n";
|
||||
?>
|
||||
```
|
||||
@@ -0,0 +1,30 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\LanguageDetectionConfig;
|
||||
|
||||
// Detect every language present in the document (detectMultiple: true).
$detectionSettings = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: true
);
$config = new ExtractionConfig(languageDetection: $detectionSettings);

$result = Kreuzberg::extractFileSync('multilingual_document.pdf', null, $config);

echo "Detected languages: ";
$languages = $result->getDetectedLanguages();
// Comma-separated list, or "None" when nothing was detected.
echo $languages ? implode(", ", $languages) . "\n" : "None\n";

echo "Primary language: " . $result->getLanguage() . "\n",
    "Confidence: " . $result->getLanguageConfidence() . "\n";
|
||||
?>
|
||||
```
|
||||
281
docs/snippets/php/advanced/performance_tuning.php
Normal file
281
docs/snippets/php/advanced/performance_tuning.php
Normal file
@@ -0,0 +1,281 @@
|
||||
```php title="performance_tuning.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Performance Tuning and Optimization
|
||||
*
|
||||
* Optimize document extraction for speed and resource usage.
|
||||
* Tips and techniques for processing large volumes of documents.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use function Kreuzberg\extract_file;
|
||||
use function Kreuzberg\batch_extract_files;
|
||||
|
||||
/**
 * Run a callable once and print its wall-clock time, the change in memory
 * usage across the call, and the process peak memory.
 *
 * @param callable $fn    Work to measure; its return value is held until the
 *                        measurements are taken and then discarded.
 * @param string   $label Heading printed above the figures.
 */
function benchmark(callable $fn, string $label): void
{
    $toMegabytes = static fn (int $bytes): string => number_format($bytes / 1024 / 1024, 2);

    $clockBefore = microtime(true);
    $memoryBefore = memory_get_usage();

    // Keep the result referenced so its memory is still counted below.
    $result = $fn();

    $seconds = microtime(true) - $clockBefore;
    $memoryDelta = memory_get_usage() - $memoryBefore;

    echo "$label:\n",
        " Time: " . number_format($seconds, 4) . "s\n",
        " Memory: " . $toMegabytes($memoryDelta) . " MB\n",
        " Peak memory: " . $toMegabytes(memory_get_peak_usage()) . " MB\n\n";
}
|
||||
|
||||
// Keep only the sample documents that actually exist on disk.
$files = array_filter(
    ['doc1.pdf', 'doc2.pdf', 'doc3.pdf', 'doc4.pdf', 'doc5.pdf'],
    static fn (string $path): bool => file_exists($path)
);

if ($files !== []) {
    echo "Performance Comparison:\n";
    echo str_repeat('=', 60) . "\n\n";

    // One extract_file() call per document.
    benchmark(
        fn () => array_map(static fn ($file) => extract_file($file), array_values($files)),
        "Single file processing"
    );

    // One batch call for all documents.
    benchmark(
        fn () => batch_extract_files($files),
        "Batch processing (parallel)"
    );
}
|
||||
|
||||
// Two configurations with the same three feature flags, all off versus all on.
$fastConfig = new ExtractionConfig(
    extractImages: false,
    extractTables: false,
    preserveFormatting: false
);

$standardConfig = new ExtractionConfig(
    extractImages: true,
    extractTables: true,
    preserveFormatting: true
);

$testFile = 'large_document.pdf';
if (file_exists($testFile)) {
    echo "Configuration Impact:\n";
    echo str_repeat('=', 60) . "\n\n";

    $scenarios = [
        "Fast config (minimal features)" => $fastConfig,
        "Standard config (all features)" => $standardConfig,
    ];

    // Benchmark the same file once per configuration, in declaration order.
    foreach ($scenarios as $scenarioLabel => $scenarioConfig) {
        benchmark(
            fn () => (new Kreuzberg($scenarioConfig))->extractFile($testFile),
            $scenarioLabel
        );
    }
}
|
||||
|
||||
/**
 * Extract a large document with per-page output enabled and image extraction
 * disabled, then walk its pages one at a time.
 *
 * The per-page body is a placeholder: it reads the page content, drops it
 * immediately, and prints the page number.
 *
 * NOTE(review): this call uses the named argument `page:`, while another
 * snippet in this changeset uses `pages:`. Confirm which name
 * ExtractionConfig actually accepts.
 *
 * @param string $filePath Path of the document to process.
 */
function processLargeDocumentEfficiently(string $filePath): void
{
    $pageSettings = new \Kreuzberg\Config\PageConfig(extractPages: true);
    $extractor = new Kreuzberg(
        new ExtractionConfig(
            page: $pageSettings,
            extractImages: false
        )
    );

    $result = $extractor->extractFile($filePath);

    echo "Processing large document page by page:\n";

    foreach ($result->pages ?? [] as $page) {
        // Placeholder for real per-page work; the copy is released at once.
        $pageContent = $page->content;
        unset($pageContent);

        echo " Processed page {$page->pageNumber}\n";
    }

    // Drop the result and collect reference cycles before returning.
    unset($result);
    gc_collect_cycles();
}
|
||||
|
||||
// Only run the page-by-page demo when the sample document is present.
$hugeDocument = 'huge_document.pdf';
if (file_exists($hugeDocument)) {
    processLargeDocumentEfficiently($hugeDocument);
}
|
||||
|
||||
/**
 * Measure batch-extraction throughput for several batch sizes and return the
 * size with the highest files-per-second figure.
 *
 * Every candidate size re-extracts the full file list, so the total work is
 * five passes over `$files`. Ties go to the smaller size, because arsort()
 * is stable on PHP 8.
 *
 * @param array $files Paths of the documents used for the measurement.
 * @return int The best-performing batch size (one of 1, 5, 10, 20, 50).
 */
function findOptimalBatchSize(array $files): int
{
    $batchSizes = [1, 5, 10, 20, 50];
    $fileCount = count($files);
    $throughputBySize = [];

    foreach ($batchSizes as $size) {
        $batches = array_chunk($files, $size);

        // hrtime() is monotonic, unlike microtime().
        $startedAt = hrtime(true);

        foreach ($batches as $batch) {
            batch_extract_files($batch);
        }

        // Clamp to at least 1 ns so a very fast run (for example an empty
        // file list) cannot trigger a DivisionByZeroError.
        $elapsed = max(hrtime(true) - $startedAt, 1) / 1e9;
        $throughput = $fileCount / $elapsed;

        $throughputBySize[$size] = $throughput;

        echo "Batch size $size: " . number_format($throughput, 2) . " files/sec\n";
    }

    // Highest throughput first; the key of the first entry is the winner.
    arsort($throughputBySize);

    return array_key_first($throughputBySize);
}
|
||||
|
||||
// The search is only meaningful with at least five sample files.
if (count($files) >= 5) {
    echo "\nFinding optimal batch size:\n", str_repeat('=', 60) . "\n";

    $optimalSize = findOptimalBatchSize($files);
    echo "\nOptimal batch size: $optimalSize\n\n";
}
|
||||
|
||||
/**
 * Records labelled time/memory checkpoints relative to the moment the
 * monitor was created, and prints them as a table.
 */
class ResourceMonitor
{
    /** Timestamp from microtime(true) taken at construction. */
    private float $startTime;

    /** memory_get_usage() reading taken at construction. */
    private int $startMemory;

    /** @var list<array{label: string, time: float, memory: int, peak: int}> */
    private array $checkpoints = [];

    public function __construct()
    {
        $this->startTime = microtime(true);
        $this->startMemory = memory_get_usage();
    }

    /**
     * Store elapsed time, memory delta and peak memory under the given label.
     */
    public function checkpoint(string $label): void
    {
        $time = microtime(true) - $this->startTime;
        $memory = memory_get_usage() - $this->startMemory;
        $peak = memory_get_peak_usage();

        $this->checkpoints[] = compact('label', 'time', 'memory', 'peak');
    }

    /**
     * Print one row per checkpoint, followed by the current peak memory.
     */
    public function report(): void
    {
        echo "Resource Monitor Report:\n";
        echo str_repeat('=', 60) . "\n";

        foreach ($this->checkpoints as $entry) {
            echo sprintf(
                "%-30s | Time: %6.3fs | Mem: %6.2f MB\n",
                $entry['label'],
                $entry['time'],
                $entry['memory'] / 1024 / 1024
            );
        }

        $peakMegabytes = number_format(memory_get_peak_usage() / 1024 / 1024, 2);
        echo "\nPeak memory: " . $peakMegabytes . " MB\n";
    }
}
|
||||
|
||||
// Walk through a typical extraction and record a checkpoint after each stage.
$monitor = new ResourceMonitor();

// Stage 1: construct the extractor.
$kreuzberg = new Kreuzberg();
$monitor->checkpoint("Kreuzberg initialized");

// Stage 2: extract the document.
$documentPath = 'document.pdf';
$result = $kreuzberg->extractFile($documentPath);
$monitor->checkpoint("Document extracted");

// Stage 3: some post-processing work on the content.
$words = str_word_count($result->content);
$monitor->checkpoint("Word count completed");

// Stage 4: release the result and collect cycles before the last reading.
unset($result);
gc_collect_cycles();
$monitor->checkpoint("Memory freed");

$monitor->report();
|
||||
|
||||
/**
 * Split a file list into roughly `$workers` equal chunks and run each chunk
 * through batch_extract_files(), returning all results as one array.
 *
 * Despite the name, the chunks are processed one after another here. Any
 * parallelism presumably comes from batch_extract_files() itself, which the
 * benchmark label elsewhere in this file describes as parallel — confirm.
 *
 * @param array $files   Paths of the documents to extract.
 * @param int   $workers Desired number of chunks; values below 1 are treated as 1.
 * @return array Results of every batch call, merged in chunk order.
 */
function processConcurrently(array $files, int $workers = 4): array
{
    // array_chunk() rejects a length of 0, so short-circuit an empty input.
    if ($files === []) {
        return [];
    }

    // Guard against division by zero and negative chunk sizes.
    $workers = max(1, $workers);

    // Integer ceiling division. ceil() returns a float, which array_chunk()
    // refuses under strict_types.
    $chunkSize = intdiv(count($files) + $workers - 1, $workers);

    $resultsPerChunk = [];
    foreach (array_chunk($files, $chunkSize) as $chunk) {
        $resultsPerChunk[] = batch_extract_files($chunk);
    }

    // Merge once at the end instead of re-copying the accumulator per chunk.
    return array_merge(...$resultsPerChunk);
}
|
||||
|
||||
/**
 * In-memory, per-process cache in front of Kreuzberg::extractFile().
 *
 * Entries are keyed by file path plus modification time, so a changed file is
 * extracted again. When the cache is full, the oldest inserted entry is
 * evicted first (FIFO).
 */
class CachedKreuzberg
{
    /** @var array<string, \Kreuzberg\Types\ExtractionResult> Results in insertion order. */
    private array $cache = [];

    /** Maximum number of results kept before the oldest is evicted. */
    private int $maxCacheSize;

    /**
     * @param Kreuzberg $kreuzberg    Extractor that performs the real work.
     * @param int       $maxCacheSize Maximum number of cached results.
     */
    public function __construct(
        private Kreuzberg $kreuzberg,
        int $maxCacheSize = 100
    ) {
        $this->maxCacheSize = $maxCacheSize;
    }

    /**
     * Return the cached result for an unchanged file, extracting on a miss.
     *
     * @param string $filePath Path of the document to extract.
     */
    public function extractFile(string $filePath): \Kreuzberg\Types\ExtractionResult
    {
        // filemtime() is answered from PHP's stat cache. Without clearing it,
        // a file modified during this process would keep its old mtime and
        // the stale cached result would be served.
        clearstatcache(true, $filePath);

        $modifiedAt = is_file($filePath) ? filemtime($filePath) : false;
        if ($modifiedAt === false) {
            // The file cannot be fingerprinted. Skip the cache and let the
            // extractor report the problem itself.
            return $this->kreuzberg->extractFile($filePath);
        }

        // The separator keeps path and timestamp from running together.
        $cacheKey = md5($filePath . '|' . $modifiedAt);

        if (isset($this->cache[$cacheKey])) {
            return $this->cache[$cacheKey];
        }

        $result = $this->kreuzberg->extractFile($filePath);

        // Evict the oldest entry once the limit is reached.
        if (count($this->cache) >= $this->maxCacheSize) {
            $oldestKey = array_key_first($this->cache);
            if ($oldestKey !== null) {
                unset($this->cache[$oldestKey]);
            }
        }

        $this->cache[$cacheKey] = $result;

        return $result;
    }

    /**
     * Drop every cached result.
     */
    public function clearCache(): void
    {
        $this->cache = [];
    }
}
|
||||
|
||||
// Demonstrate the cache: the second extraction of the same file should be
// served from memory.
$cachedKreuzberg = new CachedKreuzberg(new Kreuzberg(), maxCacheSize: 50);

echo "\nCached extraction performance:\n", str_repeat('=', 60) . "\n";

$file = 'document.pdf';
if (file_exists($file)) {
    $runLabels = ["First extraction (uncached)", "Second extraction (cached)"];
    foreach ($runLabels as $runLabel) {
        benchmark(fn () => $cachedKreuzberg->extractFile($file), $runLabel);
    }
}

echo "\nPerformance Tips:\n", str_repeat('=', 60) . "\n";

$tips = [
    "1. Use batch processing for multiple files\n",
    "2. Disable features you don't need (images, tables, OCR)\n",
    "3. Process pages individually for very large documents\n",
    "4. Use appropriate batch sizes (test to find optimal)\n",
    "5. Implement caching for frequently accessed documents\n",
    "6. Monitor memory usage and clear results when done\n",
    "7. Consider using worker processes for high throughput\n",
    "8. Increase PHP memory_limit for large documents\n",
];
foreach ($tips as $tip) {
    echo $tip;
}
|
||||
```
|
||||
23
docs/snippets/php/advanced/quality_processing_config.md
Normal file
23
docs/snippets/php/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// Enable quality processing together with the extraction cache.
$config = new ExtractionConfig(
    enableQualityProcessing: true,
    useCache: true
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Both figures are optional; print only the ones that are present.
$qualityScore = $result->getQualityScore();
if ($qualityScore !== null) {
    echo "Quality score: " . $qualityScore . "\n";
}

$processingTime = $result->getProcessingTime();
if ($processingTime !== null) {
    echo "Processing time: " . $processingTime . "ms\n";
}
|
||||
?>
|
||||
```
|
||||
27
docs/snippets/php/advanced/quality_processing_example.md
Normal file
27
docs/snippets/php/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// Extract a scanned document with quality processing and flag weak results.
$config = new ExtractionConfig(enableQualityProcessing: true);

$result = Kreuzberg::extractFileSync('scanned_document.pdf', null, $config);

if ($result->getQualityScore() === null) {
    echo "Quality score not available.\n";
} else {
    $score = $result->getQualityScore();
    $rounded = round($score, 2);

    // Scores below 0.5 are reported as a warning.
    echo $score < 0.5
        ? "Warning: Low quality extraction (" . $rounded . ")\n"
        : "Quality score: " . $rounded . "\n";
}

echo "Extracted text length: " . strlen($result->getContent()) . " characters\n";
|
||||
?>
|
||||
```
|
||||
20
docs/snippets/php/advanced/token_reduction_config.md
Normal file
20
docs/snippets/php/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\TokenReductionOptions;
|
||||
|
||||
// Moderate token reduction that keeps important words.
$config = new ExtractionConfig(
    tokenReduction: new TokenReductionOptions(
        mode: 'moderate',
        preserveImportantWords: true
    )
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// mb_substr() cuts the preview on a character boundary; substr() counts
// bytes and may split a multi-byte UTF-8 character.
echo "Reduced content: " . mb_substr($result->getContent(), 0, 100) . "...\n";
|
||||
?>
|
||||
```
|
||||
26
docs/snippets/php/advanced/token_reduction_example.md
Normal file
26
docs/snippets/php/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\TokenReductionOptions;
|
||||
|
||||
// Moderate token reduction that keeps important words.
$config = new ExtractionConfig(
    tokenReduction: new TokenReductionOptions(
        mode: 'moderate',
        preserveImportantWords: true
    )
);

$result = Kreuzberg::extractFileSync('verbose_document.pdf', null, $config);

$tokenCount = $result->getTokenCount();
if ($tokenCount !== null) {
    echo "Original token count: " . $tokenCount . "\n";
}

// Access the reduced content. The mb_* functions count and cut by character,
// so the reported length matches its "characters" label and the preview
// never ends inside a multi-byte UTF-8 sequence.
$reducedContent = $result->getContent();
echo "Reduced content length: " . mb_strlen($reducedContent) . " characters\n";
echo "Content preview: " . mb_substr($reducedContent, 0, 100) . "...\n";
|
||||
?>
|
||||
```
|
||||
70
docs/snippets/php/advanced/vector_database_integration.md
Normal file
70
docs/snippets/php/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,70 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\EmbeddingConfig;
|
||||
|
||||
/**
 * One row destined for a vector database: an identifier, the chunk text,
 * its embedding vector and free-form metadata.
 */
class VectorRecord
{
    /** Unique identifier of the record. */
    public string $id;

    /** Text of the chunk the embedding was computed from. */
    public string $content;

    /** Embedding vector of the chunk. */
    public array $embedding;

    /** Additional key/value information stored alongside the vector. */
    public array $metadata;

    public function __construct(string $id, string $content, array $embedding, array $metadata)
    {
        $this->id = $id;
        $this->content = $content;
        $this->embedding = $embedding;
        $this->metadata = $metadata;
    }
}
|
||||
|
||||
/**
 * Extract a document, chunk it, embed every chunk and turn each embedded
 * chunk into a VectorRecord.
 *
 * Chunks without an embedding are skipped. Record ids have the form
 * "<documentId>_chunk_<position in the chunk list>".
 *
 * @param string $documentPath Path of the document to process.
 * @param string $documentId   Identifier used as the prefix of every record id.
 * @return array List of VectorRecord instances, possibly empty.
 */
function extractAndVectorize(
    string $documentPath,
    string $documentId
): array {
    $embeddingSettings = new EmbeddingConfig(
        normalize: true,
        batchSize: 32
    );
    $chunkingSettings = new ChunkingConfig(
        maxCharacters: 512,
        overlap: 50,
        embedding: $embeddingSettings
    );
    $config = new ExtractionConfig(chunking: $chunkingSettings);

    $result = Kreuzberg::extractFileSync($documentPath, null, $config);

    $records = [];
    if (!$result->getChunks()) {
        return $records;
    }

    foreach ($result->getChunks() as $index => $chunk) {
        $embedding = $chunk->getEmbedding();
        if (!$embedding) {
            continue; // Nothing to store without a vector.
        }

        // Metadata values are stored as strings.
        $records[] = new VectorRecord(
            id: "{$documentId}_chunk_{$index}",
            content: $chunk->getContent(),
            embedding: $embedding,
            metadata: [
                'document_id' => $documentId,
                'chunk_index' => (string)$index,
                'content_length' => (string)strlen($chunk->getContent()),
            ]
        );
    }

    return $records;
}
|
||||
|
||||
// Usage: build the records for one document and print a short summary of each.
$records = extractAndVectorize('research_paper.pdf', 'doc_123');

foreach ($records as $record) {
    echo "Vector ID: " . $record->id . "\n",
        "Content length: " . strlen($record->content) . " characters\n",
        "Embedding dimension: " . count($record->embedding) . "\n",
        "---\n";
}
|
||||
?>
|
||||
```
|
||||
Reference in New Issue
Block a user