```php title="performance_tuning.php"
// NOTE(review): the beginning of this snippet is missing from the visible text. The
// fragment on the next line is the tail of a benchmark() call; benchmark(), $testFile,
// $standardConfig and $files are defined in the part that is not shown here.
extractFile($testFile); }, "Fast config (minimal features)");

    benchmark(function () use ($testFile, $standardConfig) {
        $kreuzberg = new Kreuzberg($standardConfig);

        return $kreuzberg->extractFile($testFile);
    }, "Standard config (all features)");
}

/**
 * Extracts a document with page extraction enabled and image extraction disabled,
 * then walks the resulting pages one at a time and frees the result afterwards.
 *
 * @param string $filePath Path of the document to extract.
 */
function processLargeDocumentEfficiently(string $filePath): void
{
    $config = new ExtractionConfig(
        page: new \Kreuzberg\Config\PageConfig(
            extractPages: true
        ),
        extractImages: false
    );

    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile($filePath);

    echo "Processing large document page by page:\n";
    foreach ($result->pages ?? [] as $page) {
        // The page content is read and released straight away; this is where real
        // per-page handling would go.
        $pageContent = $page->content;
        unset($pageContent);

        echo "  Processed page {$page->pageNumber}\n";
    }

    // Drop the extraction result and collect any reference cycles before returning.
    unset($result);
    gc_collect_cycles();
}

if (file_exists('huge_document.pdf')) {
    processLargeDocumentEfficiently('huge_document.pdf');
}

/**
 * Runs batch_extract_files() over the same input for several batch sizes and returns
 * the batch size that achieved the highest measured throughput.
 *
 * @param array $files Items passed to batch_extract_files() in chunks (presumably
 *                     file paths - confirm against batch_extract_files()).
 *
 * @return int One of 1, 5, 10, 20 or 50: the size with the best files/sec figure.
 */
function findOptimalBatchSize(array $files): int
{
    $batchSizes = [1, 5, 10, 20, 50];
    $fileCount = count($files);
    $results = [];

    foreach ($batchSizes as $size) {
        $batches = array_chunk($files, $size);

        $startTime = microtime(true);
        foreach ($batches as $batch) {
            batch_extract_files($batch);
        }

        // Two microtime() readings can be identical for very short runs (for example an
        // empty $files); clamp the interval so the division below cannot throw
        // DivisionByZeroError.
        $elapsed = max(microtime(true) - $startTime, PHP_FLOAT_EPSILON);

        $throughput = $fileCount / $elapsed;
        $results[$size] = $throughput;

        echo "Batch size $size: " . number_format($throughput, 2) . " files/sec\n";
    }

    // Highest throughput first; keys (the batch sizes) are preserved by arsort().
    arsort($results);

    return array_key_first($results);
}

if (!empty($files) && count($files) >= 5) {
    echo "\nFinding optimal batch size:\n";
    echo str_repeat('=', 60) . "\n";

    $optimalSize = findOptimalBatchSize($files);
    echo "\nOptimal batch size: $optimalSize\n\n";
}

/**
 * Records elapsed time and memory deltas at named checkpoints, measured from the
 * moment the monitor is constructed, and prints them as a table.
 */
class ResourceMonitor
{
    private float $startTime;
    private int $startMemory;

    /** @var list<array{label: string, time: float, memory: int, peak: int}> */
    private array $checkpoints = [];

    public function __construct()
    {
        $this->startTime = microtime(true);
        $this->startMemory = memory_get_usage();
    }

    /**
     * Stores a checkpoint with time and memory usage relative to construction.
     *
     * @param string $label Name shown for this checkpoint in the report.
     */
    public function checkpoint(string $label): void
    {
        $this->checkpoints[] = [
            'label' => $label,
            'time' => microtime(true) - $this->startTime,
            'memory' => memory_get_usage() - $this->startMemory,
            'peak' => memory_get_peak_usage(),
        ];
    }

    /**
     * Prints one line per checkpoint followed by the peak memory usage at the time
     * report() is called.
     */
    public function report(): void
    {
        echo "Resource Monitor Report:\n";
        echo str_repeat('=', 60) . "\n";

        foreach ($this->checkpoints as $checkpoint) {
            printf(
                "%-30s | Time: %6.3fs | Mem: %6.2f MB\n",
                $checkpoint['label'],
                $checkpoint['time'],
                $checkpoint['memory'] / 1024 / 1024
            );
        }

        echo "\nPeak memory: " . number_format(
            memory_get_peak_usage() / 1024 / 1024,
            2
        ) . " MB\n";
    }
}

$monitor = new ResourceMonitor();

$kreuzberg = new Kreuzberg();
$monitor->checkpoint("Kreuzberg initialized");

$result = $kreuzberg->extractFile('document.pdf');
$monitor->checkpoint("Document extracted");

// The count itself is not used afterwards; it gives the monitor some work to measure.
$words = str_word_count($result->content);
$monitor->checkpoint("Word count completed");

unset($result);
gc_collect_cycles();
$monitor->checkpoint("Memory freed");

$monitor->report();

/**
 * Splits $files into at most $workers chunks and passes each chunk to
 * batch_extract_files(), returning all results merged into one array.
 *
 * The chunks are submitted one after another from this loop; any parallelism would
 * have to come from batch_extract_files() itself.
 *
 * @param array $files   Items to extract; an empty array yields an empty result.
 * @param int   $workers Desired number of chunks; values below 1 are treated as 1.
 *
 * @return array Results of every batch_extract_files() call, merged in chunk order.
 */
function processConcurrently(array $files, int $workers = 4): array
{
    // array_chunk() rejects a chunk length below 1, so an empty input is answered
    // directly instead of computing a zero-sized chunk.
    if ($files === []) {
        return [];
    }

    // A worker count below 1 would divide by zero or produce a negative chunk size.
    $workerCount = max(1, $workers);
    $chunkSize = max(1, (int) ceil(count($files) / $workerCount));

    $chunkResults = [];
    foreach (array_chunk($files, $chunkSize) as $chunk) {
        $chunkResults[] = batch_extract_files($chunk);
    }

    // One merge at the end gives the same result as merging inside the loop without
    // re-copying the accumulated array for every chunk.
    return array_merge(...$chunkResults);
}

/**
 * Wraps a Kreuzberg instance with a bounded in-memory cache of extraction results,
 * keyed by file path and modification time. When the cache is full the oldest
 * inserted entry is evicted.
 */
class CachedKreuzberg
{
    /** @var array<string, \Kreuzberg\Types\ExtractionResult> */
    private array $cache = [];

    /**
     * @param Kreuzberg $kreuzberg    Extractor the calls are delegated to.
     * @param int       $maxCacheSize Maximum number of cached results; values below 1
     *                                disable caching.
     */
    public function __construct(
        private Kreuzberg $kreuzberg,
        private int $maxCacheSize = 100
    ) {
    }

    /**
     * Returns the cached result for the file if its path and modification time match
     * a cached entry, otherwise extracts it and stores the result.
     *
     * @param string $filePath Path of the file to extract.
     */
    public function extractFile(string $filePath): \Kreuzberg\Types\ExtractionResult
    {
        // filemtime() results are kept in PHP's stat cache; clear the entry for this
        // path so a file rewritten during the same process is detected.
        clearstatcache(true, $filePath);
        $modifiedAt = is_file($filePath) ? filemtime($filePath) : false;

        // Without a modification time no reliable key can be built, and a cache size
        // below 1 means nothing may be stored: delegate directly in both cases and
        // let the wrapped extractor report any error.
        if ($modifiedAt === false || $this->maxCacheSize < 1) {
            return $this->kreuzberg->extractFile($filePath);
        }

        // The NUL separator keeps "path + mtime" pairs from colliding when a path
        // itself ends in digits.
        $cacheKey = md5($filePath . "\0" . $modifiedAt);

        if (isset($this->cache[$cacheKey])) {
            return $this->cache[$cacheKey];
        }

        $result = $this->kreuzberg->extractFile($filePath);

        if (count($this->cache) >= $this->maxCacheSize) {
            // Evict the oldest inserted entry (PHP arrays keep insertion order).
            unset($this->cache[array_key_first($this->cache)]);
        }

        $this->cache[$cacheKey] = $result;

        return $result;
    }

    /**
     * Removes every cached result.
     */
    public function clearCache(): void
    {
        $this->cache = [];
    }
}

$cachedKreuzberg = new CachedKreuzberg(new Kreuzberg(), maxCacheSize: 50);

echo "\nCached extraction performance:\n";
echo str_repeat('=', 60) . "\n";

$file = 'document.pdf';
if (file_exists($file)) {
    benchmark(function () use ($cachedKreuzberg, $file) {
        return $cachedKreuzberg->extractFile($file);
    }, "First extraction (uncached)");

    benchmark(function () use ($cachedKreuzberg, $file) {
        return $cachedKreuzberg->extractFile($file);
    }, "Second extraction (cached)");
}

echo "\nPerformance Tips:\n";
echo str_repeat('=', 60) . "\n";
echo "1. Use batch processing for multiple files\n";
echo "2. Disable features you don't need (images, tables, OCR)\n";
echo "3. Process pages individually for very large documents\n";
echo "4. Use appropriate batch sizes (test to find optimal)\n";
echo "5. Implement caching for frequently accessed documents\n";
echo "6. Monitor memory usage and clear results when done\n";
echo "7. Consider using worker processes for high throughput\n";
echo "8. Increase PHP memory_limit for large documents\n";
```