Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

229
docs/snippets/php/cache/disk_cache.php vendored Normal file
View File

@@ -0,0 +1,229 @@
```php title="disk_cache.php"
<?php
declare(strict_types=1);
/**
* Disk Cache for Document Extraction
*
* Implement file-based caching to avoid re-processing the same documents.
* Significantly improves performance for repeated extractions.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Types\ExtractionResult;
/**
 * File-based cache for extraction results.
 *
 * Each entry is a serialized ExtractionResult stored as "<key>.cache" inside
 * the cache directory. The key combines the file path, an MD5 of the file
 * contents and an MD5 of the JSON-encoded extraction config, so a change to
 * either the document or the config yields a different entry. Entries whose
 * modification time is older than the TTL are treated as misses and removed.
 */
class DiskCache
{
    private string $cacheDir;
    private int $ttl;

    /**
     * @param string|null $cacheDir Directory for cache files; defaults to
     *                              "<system temp dir>/kreuzberg_cache".
     * @param int         $ttl      Maximum entry age in seconds (default: 7 days).
     *
     * @throws \RuntimeException If the cache directory does not exist and cannot be created.
     */
    public function __construct(?string $cacheDir = null, int $ttl = 7 * 86400)
    {
        $this->cacheDir = $cacheDir ?? sys_get_temp_dir() . '/kreuzberg_cache';
        $this->ttl = $ttl;

        // The trailing is_dir() covers the race where another process creates
        // the directory between our first check and our mkdir() call.
        if (!is_dir($this->cacheDir) && !mkdir($this->cacheDir, 0755, true) && !is_dir($this->cacheDir)) {
            throw new \RuntimeException("Unable to create cache directory: {$this->cacheDir}");
        }
    }

    /**
     * Build the cache key for a file/config pair.
     *
     * @return string|null The key, or null when the file cannot be hashed or
     *                     the config cannot be JSON-encoded (i.e. the pair is
     *                     not cacheable).
     */
    private function getCacheKey(string $filePath, ExtractionConfig $config): ?string
    {
        if (!is_file($filePath) || !is_readable($filePath)) {
            return null;
        }

        $fileHash = md5_file($filePath);
        $configJson = json_encode($config->toArray());

        // Both calls report failure by returning false. Without this guard the
        // content hash would silently drop out of the key, and md5(false)
        // would raise a TypeError under strict_types.
        if ($fileHash === false || $configJson === false) {
            return null;
        }

        return md5($filePath . $fileHash . md5($configJson));
    }

    /**
     * Map a cache key to the path of its entry file.
     */
    private function getCachePath(string $key): string
    {
        return $this->cacheDir . '/' . $key . '.cache';
    }

    /**
     * List the entry files currently present in the cache directory.
     *
     * @return list<string>
     */
    private function listCacheFiles(): array
    {
        $files = glob($this->cacheDir . '/*.cache');

        // glob() returns false on error; treat that as "no entries".
        return $files === false ? [] : $files;
    }

    /**
     * Delete a file if it is still there (another process may have removed it).
     */
    private function removeFile(string $path): bool
    {
        return is_file($path) && unlink($path);
    }

    /**
     * Look up a cached result.
     *
     * @return ExtractionResult|null The cached result, or null on a miss
     *                               (no entry, expired entry, unreadable or
     *                               invalid entry, or uncacheable input).
     */
    public function get(string $filePath, ExtractionConfig $config): ?ExtractionResult
    {
        $key = $this->getCacheKey($filePath, $config);
        if ($key === null) {
            return null;
        }

        $cachePath = $this->getCachePath($key);
        if (!is_file($cachePath)) {
            return null;
        }

        $modifiedAt = filemtime($cachePath);
        if ($modifiedAt === false) {
            return null;
        }

        if (time() - $modifiedAt > $this->ttl) {
            $this->removeFile($cachePath);
            return null;
        }

        $data = file_get_contents($cachePath);
        if ($data === false) {
            return null;
        }

        // NOTE(security): unserialize() instantiates whatever classes the
        // payload names. The default cache directory lives under the shared
        // system temp dir, so point $cacheDir at a private location if other
        // local users are not trusted. 'allowed_classes' is not restricted
        // here because the classes nested inside ExtractionResult are not
        // known from this file — TODO confirm and whitelist them.
        $cached = unserialize($data);

        return $cached instanceof ExtractionResult ? $cached : null;
    }

    /**
     * Store a result for a file/config pair. Does nothing when no cache key
     * can be derived for the pair.
     */
    public function set(string $filePath, ExtractionConfig $config, ExtractionResult $result): void
    {
        $key = $this->getCacheKey($filePath, $config);
        if ($key === null) {
            return;
        }

        $cachePath = $this->getCachePath($key);

        // Write to a sibling temp file and rename it into place so that a
        // concurrent get() never observes a half-written entry. The ".tmp"
        // suffix keeps the temp file out of the "*.cache" listings.
        $tempPath = $cachePath . '.' . uniqid('', true) . '.tmp';
        $written = file_put_contents($tempPath, serialize($result));
        if ($written === false || !rename($tempPath, $cachePath)) {
            $this->removeFile($tempPath);
        }
    }

    /**
     * Remove every entry from the cache directory.
     */
    public function clear(): void
    {
        foreach ($this->listCacheFiles() as $file) {
            $this->removeFile($file);
        }
    }

    /**
     * Summarize the cache contents.
     *
     * @return array{total_entries: int, cache_size_bytes: int, cache_dir: string}
     */
    public function getStats(): array
    {
        $files = $this->listCacheFiles();
        $totalSize = 0;
        foreach ($files as $file) {
            $size = filesize($file);
            // filesize() returns false if the file vanished mid-scan.
            if ($size !== false) {
                $totalSize += $size;
            }
        }

        return [
            'total_entries' => count($files),
            'cache_size_bytes' => $totalSize,
            'cache_dir' => $this->cacheDir,
        ];
    }
}
// Demo: run the same extraction twice. The first pass is expected to miss
// and populate the cache; the second pass should be served from it.
$cache = new DiskCache();
$kreuzberg = new Kreuzberg();
$config = new ExtractionConfig();
$file = 'document.pdf';

$passHeadings = [
    "First extraction (will be cached)...\n",
    "Second extraction (from cache)...\n",
];

foreach ($passHeadings as $heading) {
    echo $heading;
    $start = microtime(true);

    $result = $cache->get($file, $config);
    $statusLine = " Status: Retrieved from cache\n";
    if ($result === null) {
        // Cache miss: extract for real, then store the result for next time.
        $result = $kreuzberg->extractFile($file, config: $config);
        $cache->set($file, $config, $result);
        $statusLine = " Status: Extracted and cached\n";
    }
    echo $statusLine;

    $elapsed = microtime(true) - $start;
    echo " Time: ", number_format($elapsed, 4), "s\n";
    echo " Content length: ", strlen($result->content), " chars\n\n";
}

// Report what the cache now holds.
$stats = $cache->getStats();
$cacheSizeMb = number_format($stats['cache_size_bytes'] / 1024 / 1024, 2);
echo "Cache Statistics:\n", str_repeat('=', 60), "\n";
echo "Total entries: {$stats['total_entries']}\n";
echo "Cache size: ", $cacheSizeMb, " MB\n";
echo "Cache directory: {$stats['cache_dir']}\n\n";
/**
 * Wrapper around Kreuzberg that consults a DiskCache before extracting.
 *
 * A cache hit is returned directly; on a miss the wrapped Kreuzberg instance
 * performs the extraction and the result is stored before being returned.
 */
class CachedKreuzberg
{
    public function __construct(
        private Kreuzberg $kreuzberg,
        private DiskCache $cache
    ) {}

    /**
     * Extract a file, reusing a cached result when one is available.
     *
     * NOTE(review): $mimeType is forwarded to Kreuzberg but is not passed to
     * the cache, so the same file/config with a different MIME override maps
     * to the same entry — confirm this is intended.
     *
     * @param string                $filePath Path of the document to extract.
     * @param string|null           $mimeType Optional MIME type forwarded to Kreuzberg.
     * @param ExtractionConfig|null $config   Extraction config; a default one is created when null.
     */
    public function extractFile(
        string $filePath,
        ?string $mimeType = null,
        ?ExtractionConfig $config = null
    ): ExtractionResult {
        $config ??= new ExtractionConfig();

        $hit = $this->cache->get($filePath, $config);
        if ($hit !== null) {
            return $hit;
        }

        $extracted = $this->kreuzberg->extractFile($filePath, $mimeType, $config);
        $this->cache->set($filePath, $config, $extracted);

        return $extracted;
    }

    /**
     * Remove all entries from the underlying cache.
     */
    public function clearCache(): void
    {
        $this->cache->clear();
    }

    /**
     * Return the underlying cache's statistics array unchanged.
     */
    public function getCacheStats(): array
    {
        return $this->cache->getStats();
    }
}
// Demo: use the caching wrapper on a batch of documents, timing each call.
$cachedKreuzberg = new CachedKreuzberg(new Kreuzberg(), new DiskCache());

echo "Using CachedKreuzberg wrapper:\n", str_repeat('=', 60), "\n";

$files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
foreach ($files as $file) {
    // Only documents that actually exist on disk are processed.
    if (file_exists($file)) {
        $start = microtime(true);
        $result = $cachedKreuzberg->extractFile($file);
        $elapsed = microtime(true) - $start;
        echo "$file: ", number_format($elapsed, 4), "s\n";
    }
}

echo "\nCache stats:\n";
$stats = $cachedKreuzberg->getCacheStats();
print_r($stats);
/**
 * Delete cache entries whose modification time is older than $maxAge.
 *
 * The cache directory is taken from the 'cache_dir' field of
 * $cache->getStats(); only files matching "*.cache" in it are considered.
 *
 * @param DiskCache $cache  Cache whose directory should be swept.
 * @param int       $maxAge Maximum allowed entry age in seconds (default: 7 days).
 *
 * @return int Number of files that were actually removed.
 */
function cleanupCache(DiskCache $cache, int $maxAge = 7 * 86400): int
{
    $cacheDir = $cache->getStats()['cache_dir'];

    $files = glob($cacheDir . '/*.cache');
    if ($files === false) {
        // glob() signals an error with false; there is nothing to sweep.
        return 0;
    }

    // One reference time for the whole sweep so every entry is judged
    // against the same cutoff.
    $now = time();
    $deleted = 0;

    foreach ($files as $file) {
        $modifiedAt = filemtime($file);

        // A false mtime means the file could not be stat'ed (e.g. it was
        // removed concurrently); skip it rather than treating it as epoch 0.
        if ($modifiedAt === false || $now - $modifiedAt <= $maxAge) {
            continue;
        }

        // Count only deletions that really happened.
        if (unlink($file)) {
            $deleted++;
        }
    }

    return $deleted;
}
// Sweep entries older than one week and report how many were removed.
$oneWeekInSeconds = 7 * 86400;
$deleted = cleanupCache($cache, $oneWeekInSeconds);
echo "\nCleaned up $deleted old cache entries\n";
```