Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,41 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
use Kreuzberg\PageConfig;
// Configure chunking (maxCharacters 500, overlap 50) together with page
// extraction, then print a short preview of every chunk that reports both a
// first and a last page.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(maxCharacters: 500, overlap: 50),
    pages: new PageConfig(extractPages: true),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

foreach (($result->getChunks() ?: []) as $chunk) {
    $metadata = $chunk->getMetadata();
    if (!$metadata) {
        continue;
    }

    $firstPage = $metadata->getFirstPage();
    $lastPage = $metadata->getLastPage();
    if ($firstPage === null || $lastPage === null) {
        // Without both bounds there is no page range to print.
        continue;
    }

    // A chunk confined to one page is labelled "Page N", otherwise "Pages A-B".
    $pageRange = $firstPage === $lastPage
        ? "Page " . $firstPage
        : "Pages " . $firstPage . "-" . $lastPage;

    echo "Chunk: " . substr($chunk->getContent(), 0, 50) . "... (" . $pageRange . ")\n";
}
?>
```

View File

@@ -0,0 +1,79 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
// Basic chunking: maxCharacters 1000 with an overlap of 200.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 1000,
        overlap: 200,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Fall back to an empty list so count() and foreach never receive null when
// the extraction produced no chunks.
$chunks = $result->getChunks() ?? [];

echo "Number of chunks: " . count($chunks) . "\n";
foreach ($chunks as $chunk) {
    // NOTE(review): strlen() counts bytes, not characters; for multibyte text
    // the printed size can exceed maxCharacters — confirm which unit is wanted.
    echo "Chunk size: " . strlen($chunk->getContent()) . " characters\n";
}
?>
```
```php title="PHP - Semantic Chunking"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
// Semantic chunking: chunkerType 'semantic' with a topicThreshold of 0.75.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
        chunkerType: 'semantic',
        topicThreshold: 0.75,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// count() throws a TypeError on null, so normalise a missing chunk list to [].
$chunks = $result->getChunks() ?? [];
echo "Chunks with topic-based boundaries: " . count($chunks) . "\n";
?>
```
```php title="PHP - Prepend Heading Context"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
// Markdown chunking with prependHeadingContext enabled; for every chunk,
// print the headings attached to its metadata followed by a content preview.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
        chunkerType: 'markdown',
        prependHeadingContext: true,
    ),
);

$result = Kreuzberg::extractFileSync('document.md', null, $config);

// Normalise a missing chunk list to [] so foreach never iterates over null.
foreach ($result->getChunks() ?? [] as $chunk) {
    // Fetch the heading context once; the nullsafe call yields null when the
    // chunk carries no metadata.
    $headingContext = $chunk->getMetadata()?->getHeadingContext();
    if ($headingContext) {
        foreach ($headingContext->getHeadings() as $heading) {
            echo "Heading L" . $heading->getLevel() . ": " . $heading->getText() . "\n";
        }
    }

    echo "Content: " . substr($chunk->getContent(), 0, 100) . "...\n";
}
?>
```

View File

@@ -0,0 +1,39 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
use Kreuzberg\EmbeddingConfig;
// Chunking with an embedding configuration attached; prints the index,
// byte range, content preview and embedding size of every chunk.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
        embedding: new EmbeddingConfig(normalize: true, batchSize: 32),
    ),
);

$result = Kreuzberg::extractFileSync('research_paper.pdf', null, $config);

foreach (($result->getChunks() ?: []) as $chunk) {
    $metadata = $chunk->getMetadata();
    if ($metadata) {
        // The chunk index is shifted by one for display.
        $position = $metadata->getChunkIndex() + 1;
        echo "Chunk {$position}/{$metadata->getTotalChunks()}\n";
        echo "Position: {$metadata->getByteStart()}-{$metadata->getByteEnd()}\n";
        echo "Content: " . substr($chunk->getContent(), 0, 100) . "...\n";

        $embedding = $chunk->getEmbedding();
        if ($embedding) {
            echo "Embedding: " . count($embedding) . " dimensions\n";
        }
    }

    // A blank separator line is printed for every chunk, with or without metadata.
    echo "\n";
}
?>
```

View File

@@ -0,0 +1,39 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
use Kreuzberg\EmbeddingConfig;
// Chunk the document with embeddings enabled and, for each chunk, print a
// content preview plus the embedding size and its first five values.
$embeddingConfig = new EmbeddingConfig(
    normalize: true,
    batchSize: 32,
    showDownloadProgress: false,
);
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 1024,
        overlap: 100,
        embedding: $embeddingConfig,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

foreach (($result->getChunks() ?: []) as $chunk) {
    echo "Chunk content: " . substr($chunk->getContent(), 0, 100) . "...\n";

    $embedding = $chunk->getEmbedding();
    if ($embedding) {
        echo "Embedding dimension: " . count($embedding) . "\n";
        echo "First 5 values: " . implode(", ", array_slice($embedding, 0, 5)) . "\n";
    }

    echo "\n";
}
?>
```

View File

@@ -0,0 +1,313 @@
```php title="error_handling.php"
<?php
declare(strict_types=1);
/**
* Error Handling
*
* Robust error handling for document extraction operations.
* Handle failures gracefully and implement retry strategies.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Exceptions\KreuzbergException;
use function Kreuzberg\extract_file;
// Simplest form of error handling: one extraction wrapped in try/catch.
// On failure the message and code are printed and the message is also logged.
try {
    $result = extract_file('document.pdf');
    echo "Extraction successful!\n", "Content length: " . strlen($result->content) . "\n";
} catch (KreuzbergException $e) {
    echo "Error: " . $e->getMessage() . "\n", "Code: " . $e->getCode() . "\n";
    error_log("Kreuzberg extraction failed: " . $e->getMessage());
}
/**
 * Extract a file's content, returning null instead of failing.
 *
 * The path is checked for existence and readability before extraction is
 * attempted; every failure (pre-flight or extraction) is written to the
 * error log.
 *
 * @param string $filePath Path of the document to extract.
 *
 * @return string|null The extracted content, or null on any failure.
 */
function safeExtract(string $filePath): ?string
{
    // Pre-flight checks, evaluated in order; the first failing one wins.
    $problem = match (true) {
        !file_exists($filePath) => 'File not found',
        !is_readable($filePath) => 'File not readable',
        default => null,
    };

    if ($problem !== null) {
        error_log("$problem: $filePath");
        return null;
    }

    try {
        return extract_file($filePath)->content;
    } catch (KreuzbergException $e) {
        error_log("Extraction error for $filePath: " . $e->getMessage());
        return null;
    }
}
// Report whether the guarded extraction produced any content.
$content = safeExtract('document.pdf');
echo $content !== null
    ? "Successfully extracted document\n"
    : "Failed to extract document\n";
/**
 * Extract a file, retrying on KreuzbergException with a doubling delay.
 *
 * @param string $filePath     Path of the document to extract.
 * @param int    $maxRetries   Total number of attempts before giving up.
 * @param int    $initialDelay Delay before the first retry, in milliseconds;
 *                             doubled after every failed attempt.
 *
 * @return string|null The extracted content, or null once all attempts failed
 *                     (or when $maxRetries is not positive).
 */
function extractWithRetry(
    string $filePath,
    int $maxRetries = 3,
    int $initialDelay = 1000
): ?string {
    $delay = $initialDelay;

    for ($attempt = 1; $attempt <= $maxRetries; $attempt++) {
        try {
            return extract_file($filePath)->content;
        } catch (KreuzbergException $e) {
            if ($attempt >= $maxRetries) {
                error_log("Max retries exceeded for $filePath: " . $e->getMessage());
                return null;
            }

            echo "Attempt $attempt failed, retrying in {$delay}ms...\n";
            // usleep() takes microseconds, hence the conversion from milliseconds.
            usleep($delay * 1000);
            $delay *= 2;
        }
    }

    return null;
}
// Run the retrying extractor with its default attempt count and delay.
$content = extractWithRetry('potentially_corrupt.pdf');
if (null !== $content) {
    echo "Document extracted after retry\n";
}
/**
 * Share of characters in $content that are control or otherwise non-printable.
 *
 * The text is inspected as UTF-8 so accented, Cyrillic, CJK and similar
 * characters count as printable; only the Unicode "Other" category (\p{C}),
 * minus tab, line feed and carriage return, counts against the content.
 * When the content is not valid UTF-8 the check falls back to a byte-wise
 * printable-ASCII test.
 *
 * @param string $content Text to inspect.
 *
 * @return float Ratio in the range 0.0–1.0; 0.0 for empty input.
 */
function nonPrintableCharacterRatio(string $content): float
{
    $total = preg_match_all('/./su', $content);
    $nonPrintable = preg_match_all('/[^\P{C}\t\n\r]/u', $content);

    if ($total === false || $nonPrintable === false) {
        // preg_* with the /u modifier fails on invalid UTF-8; degrade to bytes.
        $total = strlen($content);
        $nonPrintable = preg_match_all('/[^\x20-\x7E\x09\x0A\x0D]/', $content);
    }

    if (!$total || $nonPrintable === false) {
        return 0.0;
    }

    return $nonPrintable / $total;
}

/**
 * Extract a file and sanity-check the result.
 *
 * The result is rejected when the content is empty, shorter than 100 bytes,
 * or made up of more than 50% non-printable characters. Every rejection and
 * every extraction error is written to the error log.
 *
 * @param string $filePath Path of the document to extract and validate.
 *
 * @return bool True when the extracted content passes all checks.
 */
function validateExtractionResult(string $filePath): bool
{
    try {
        $result = extract_file($filePath);

        if (empty($result->content)) {
            error_log("Empty content extracted from $filePath");
            return false;
        }

        $minExpectedChars = 100;
        $length = strlen($result->content);
        if ($length < $minExpectedChars) {
            error_log("Content too short from $filePath: " . $length . " chars");
            return false;
        }

        // Unicode-aware check: a byte-wise ASCII test would flag every
        // multibyte character and reject perfectly valid non-English text.
        if (nonPrintableCharacterRatio($result->content) > 0.5) {
            error_log("High non-printable character ratio in $filePath");
            return false;
        }

        return true;
    } catch (KreuzbergException $e) {
        error_log("Validation failed for $filePath: " . $e->getMessage());
        return false;
    }
}
// Print the outcome of validating a single document.
echo validateExtractionResult('document.pdf')
    ? "Extraction result validated successfully\n"
    : "Extraction result validation failed\n";
// Batch processing with per-file error isolation: one failing document is
// recorded and the loop moves on to the next file.
// glob() returns false on error; normalise to an empty list so foreach never
// receives a non-iterable value.
$files = glob('documents/*.pdf') ?: [];
$successful = [];
$failed = [];

foreach ($files as $file) {
    try {
        $result = extract_file($file);
        $successful[] = [
            'file' => $file,
            'content_length' => strlen($result->content),
            'tables' => count($result->tables),
        ];
    } catch (KreuzbergException $e) {
        $failed[] = [
            'file' => $file,
            'error' => $e->getMessage(),
            'code' => $e->getCode(),
        ];
    }
}

echo "\nBatch Processing Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Successful: " . count($successful) . "\n";
echo "Failed: " . count($failed) . "\n\n";

if ($failed !== []) {
    echo "Failed files:\n";
    foreach ($failed as $failure) {
        echo " - {$failure['file']}: {$failure['error']}\n";
    }
}
/**
 * Extract a file, trying progressively cruder strategies until one yields
 * content.
 *
 * Order of attempts:
 *  1. default extraction via extract_file();
 *  2. extraction with an OCR configuration (tesseract backend, language 'eng');
 *  3. reading the raw file bytes as plain text.
 *
 * @param string $filePath Path of the document to extract.
 *
 * @return string|null Content from the first strategy that produced any,
 *                     or null when every strategy failed.
 */
function extractWithFallback(string $filePath): ?string
{
    try {
        $result = extract_file($filePath);
        if (!empty($result->content)) {
            return $result->content;
        }
    } catch (KreuzbergException $e) {
        // Keep the original failure in the log instead of discarding it.
        error_log("Normal extraction failed for $filePath: " . $e->getMessage());
        echo "Normal extraction failed, trying fallback strategies...\n";
    }

    try {
        $config = new ExtractionConfig(
            ocr: new \Kreuzberg\Config\OcrConfig(
                backend: 'tesseract',
                language: 'eng'
            )
        );
        $kreuzberg = new Kreuzberg($config);
        $result = $kreuzberg->extractFile($filePath);
        if (!empty($result->content)) {
            echo "Fallback: OCR extraction succeeded\n";
            return $result->content;
        }
    } catch (KreuzbergException $e) {
        echo "OCR fallback failed: " . $e->getMessage() . "\n";
    }

    // file_get_contents() signals failure by returning false (plus a warning),
    // not by throwing, so check readability first and test the return value.
    if (!is_file($filePath) || !is_readable($filePath)) {
        echo "Plain text fallback failed: file is missing or not readable\n";
        return null;
    }

    $content = file_get_contents($filePath);
    if ($content === false) {
        echo "Plain text fallback failed: could not read file\n";
        return null;
    }

    if ($content !== '') {
        echo "Fallback: Reading as plain text\n";
        return $content;
    }

    return null;
}
// Run the multi-strategy extractor and report when any strategy succeeded.
$content = extractWithFallback('problematic_file.pdf');
if (null !== $content) {
    echo "Successfully extracted with fallback\n";
}
/**
 * Extract a file while a temporary script time limit is in force.
 *
 * The previous max_execution_time is restored afterwards instead of being
 * replaced by "unlimited". A result that took longer than the timeout is
 * discarded.
 *
 * NOTE(review): per the PHP manual, exceeding set_time_limit() ends the script
 * with a fatal error rather than an exception, and on some platforms only
 * script execution time (not time spent in system calls) is counted — confirm
 * this is the intended enforcement mechanism.
 *
 * @param string $filePath       Path of the document to extract.
 * @param int    $timeoutSeconds Time budget in seconds.
 *
 * @return string|null The extracted content, or null on error or timeout.
 */
function extractWithTimeout(string $filePath, int $timeoutSeconds = 30): ?string
{
    // Remember the configured limit so it can be restored (0 means unlimited).
    $previousLimit = (int) ini_get('max_execution_time');
    // Sub-second clock: time() would round the elapsed time to whole seconds.
    $startTime = microtime(true);

    try {
        set_time_limit($timeoutSeconds);
        $result = extract_file($filePath);

        $elapsed = microtime(true) - $startTime;
        if ($elapsed > $timeoutSeconds) {
            error_log("Extraction exceeded timeout for $filePath");
            return null;
        }

        return $result->content;
    } catch (KreuzbergException $e) {
        error_log("Extraction error: " . $e->getMessage());
        return null;
    } finally {
        set_time_limit($previousLimit);
    }
}
/**
 * Exception carrying the path (and optionally the MIME type) of the document
 * whose extraction failed. The exception code is always 0.
 */
class DocumentExtractionException extends \Exception
{
    /** Path of the document that could not be extracted. */
    public readonly string $filePath;

    /** MIME type of the document, when known. */
    public readonly ?string $mimeType;

    /**
     * @param string          $message  Human-readable failure description.
     * @param string          $filePath Path of the affected document.
     * @param string|null     $mimeType MIME type of the document, if available.
     * @param \Throwable|null $previous Underlying cause, if any.
     */
    public function __construct(
        string $message,
        string $filePath,
        ?string $mimeType = null,
        ?\Throwable $previous = null
    ) {
        parent::__construct($message, 0, $previous);

        $this->filePath = $filePath;
        $this->mimeType = $mimeType;
    }
}
/**
 * Extract a file and return its content, or throw.
 *
 * @param string $filePath Path of the document to extract.
 *
 * @return string The non-empty extracted content.
 *
 * @throws DocumentExtractionException When extraction raises a
 *         KreuzbergException (attached as the previous exception) or when
 *         the extracted content is empty.
 */
function extractOrThrow(string $filePath): string
{
    try {
        $result = extract_file($filePath);

        if (!empty($result->content)) {
            return $result->content;
        }

        throw new DocumentExtractionException("No content extracted", $filePath, $result->mimeType);
    } catch (KreuzbergException $cause) {
        $message = "Extraction failed: " . $cause->getMessage();

        throw new DocumentExtractionException($message, $filePath, previous: $cause);
    }
}
// Demonstrates catching the domain exception and reading its extra context.
try {
    $content = extractOrThrow('document.pdf');
    echo "Content: " . substr($content, 0, 100) . "...\n";
} catch (DocumentExtractionException $e) {
    echo "Failed to extract " . $e->filePath . "\n";
    echo "Reason: " . $e->getMessage() . "\n";

    // The MIME type is only printed when the exception carries one.
    if ($e->mimeType) {
        echo "MIME type: " . $e->mimeType . "\n";
    }
}
/**
 * Wrapper around a Kreuzberg instance that logs the start, the success
 * (with duration, content length and table count) and the failure (with
 * duration, error message and code) of every file extraction.
 */
class LoggingKreuzberg
{
    private Kreuzberg $kreuzberg;
    private \Psr\Log\LoggerInterface $logger;

    /**
     * @param Kreuzberg                $kreuzberg Extractor that does the actual work.
     * @param \Psr\Log\LoggerInterface $logger    Destination for log records.
     */
    public function __construct(Kreuzberg $kreuzberg, \Psr\Log\LoggerInterface $logger)
    {
        $this->kreuzberg = $kreuzberg;
        $this->logger = $logger;
    }

    /**
     * Extract a file through the wrapped instance and log the outcome.
     *
     * @param string      $filePath Path of the document to extract.
     * @param string|null $mimeType Optional MIME type passed through unchanged.
     *
     * @return \Kreuzberg\Types\ExtractionResult|null The result, or null when
     *         a KreuzbergException was raised (the error is logged, not rethrown).
     */
    public function extractFile(string $filePath, ?string $mimeType = null): ?\Kreuzberg\Types\ExtractionResult
    {
        $this->logger->info("Starting extraction", ['file' => $filePath]);
        $startedAt = microtime(true);

        try {
            $extraction = $this->kreuzberg->extractFile($filePath, $mimeType);
            $duration = microtime(true) - $startedAt;

            $successContext = [
                'file' => $filePath,
                'duration' => $duration,
                'content_length' => strlen($extraction->content),
                'tables' => count($extraction->tables),
            ];
            $this->logger->info("Extraction successful", $successContext);

            return $extraction;
        } catch (KreuzbergException $failure) {
            $duration = microtime(true) - $startedAt;

            $failureContext = [
                'file' => $filePath,
                'duration' => $duration,
                'error' => $failure->getMessage(),
                'code' => $failure->getCode(),
            ];
            $this->logger->error("Extraction failed", $failureContext);

            return null;
        }
    }
}
```

View File

@@ -0,0 +1,26 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\KeywordConfig;
// Keyword extraction with the 'yake' algorithm: at most 10 keywords,
// minScore 0.1, language 'en'. Each keyword is printed on its own line.
$keywordConfig = new KeywordConfig(
    algorithm: 'yake',
    maxKeywords: 10,
    minScore: 0.1,
    language: 'en',
);
$config = new ExtractionConfig(keywords: $keywordConfig);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

foreach (($result->getKeywords() ?: []) as $keyword) {
    echo $keyword . "\n";
}
?>
```

View File

@@ -0,0 +1,29 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\KeywordConfig;
// Keyword extraction ('yake', up to 10 keywords, minScore 0.3) printed as a
// numbered list, with a fallback message when nothing was extracted.
$config = new ExtractionConfig(
    keywords: new KeywordConfig(algorithm: 'yake', maxKeywords: 10, minScore: 0.3, language: 'en'),
);

$result = Kreuzberg::extractFileSync('research_paper.pdf', null, $config);

$keywords = $result->getKeywords();
if (!$keywords) {
    echo "No keywords extracted.\n";
} else {
    echo "Extracted Keywords:\n";
    foreach ($keywords as $index => $keyword) {
        // Keys are shifted by one for display numbering.
        echo ($index + 1) . ". " . $keyword . "\n";
    }
}
?>
```

View File

@@ -0,0 +1,22 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\LanguageDetectionConfig;
// Single-language detection: enabled, minConfidence 0.8, detectMultiple off.
$detection = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false,
);
$config = new ExtractionConfig(languageDetection: $detection);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

echo "Detected language: {$result->getLanguage()}\n";
echo "Confidence: {$result->getLanguageConfidence()}\n";
?>
```

View File

@@ -0,0 +1,30 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\LanguageDetectionConfig;
// Multi-language detection: enabled, minConfidence 0.8, detectMultiple on.
$config = new ExtractionConfig(
    languageDetection: new LanguageDetectionConfig(enabled: true, minConfidence: 0.8, detectMultiple: true),
);

$result = Kreuzberg::extractFileSync('multilingual_document.pdf', null, $config);

echo "Detected languages: ";
$languages = $result->getDetectedLanguages();
// Either the comma-separated list or the literal "None" completes the line.
echo $languages ? implode(", ", $languages) . "\n" : "None\n";

echo "Primary language: {$result->getLanguage()}\n";
echo "Confidence: {$result->getLanguageConfidence()}\n";
?>
```

View File

@@ -0,0 +1,281 @@
```php title="performance_tuning.php"
<?php
declare(strict_types=1);
/**
* Performance Tuning and Optimization
*
* Optimize document extraction for speed and resource usage.
* Tips and techniques for processing large volumes of documents.
*/
require_once __DIR__ . '/vendor/autoload.php';
use Kreuzberg\Kreuzberg;
use Kreuzberg\Config\ExtractionConfig;
use function Kreuzberg\extract_file;
use function Kreuzberg\batch_extract_files;
/**
 * Run a callable once and print how long it took and how much memory it used.
 *
 * Prints the label, the elapsed wall-clock time, the change in
 * memory_get_usage() across the call, and the current peak memory usage.
 *
 * @param callable $fn    Work to measure; called without arguments.
 * @param string   $label Heading printed above the measurements.
 */
function benchmark(callable $fn, string $label): void
{
    $memoryBefore = memory_get_usage();
    $startedAt = microtime(true);

    // The return value stays referenced until the function ends, so whatever
    // it holds is included in the memory delta below.
    $held = $fn();

    $seconds = microtime(true) - $startedAt;
    $memoryDelta = memory_get_usage() - $memoryBefore;

    $toMegabytes = static fn ($bytes): string => number_format($bytes / 1024 / 1024, 2);

    echo "$label:\n";
    echo " Time: " . number_format($seconds, 4) . "s\n";
    echo " Memory: " . $toMegabytes($memoryDelta) . " MB\n";
    echo " Peak memory: " . $toMegabytes(memory_get_peak_usage()) . " MB\n\n";
}
// Keep only the sample documents that actually exist, then compare
// one-by-one extraction with a single batch call over the same files.
$files = array_filter(
    ['doc1.pdf', 'doc2.pdf', 'doc3.pdf', 'doc4.pdf', 'doc5.pdf'],
    static fn (string $path): bool => file_exists($path)
);

if ($files !== []) {
    echo "Performance Comparison:\n", str_repeat('=', 60), "\n\n";

    benchmark(
        static function () use ($files) {
            $extracted = [];
            foreach ($files as $path) {
                $extracted[] = extract_file($path);
            }

            return $extracted;
        },
        "Single file processing"
    );

    benchmark(
        static fn () => batch_extract_files($files),
        "Batch processing (parallel)"
    );
}
// Two configurations that differ only in whether images, tables and
// formatting are enabled, benchmarked against the same document.
$fastConfig = new ExtractionConfig(extractImages: false, extractTables: false, preserveFormatting: false);
$standardConfig = new ExtractionConfig(extractImages: true, extractTables: true, preserveFormatting: true);

$testFile = 'large_document.pdf';
if (file_exists($testFile)) {
    echo "Configuration Impact:\n", str_repeat('=', 60), "\n\n";

    // Each run builds its own Kreuzberg instance inside the measured callable.
    benchmark(
        static fn () => (new Kreuzberg($fastConfig))->extractFile($testFile),
        "Fast config (minimal features)"
    );
    benchmark(
        static fn () => (new Kreuzberg($standardConfig))->extractFile($testFile),
        "Standard config (all features)"
    );
}
/**
 * Extract a document with page extraction enabled and images disabled, then
 * walk its pages one at a time, releasing each page's content right after
 * reading it and forcing a garbage-collection cycle at the end.
 *
 * @param string $filePath Path of the document to process.
 */
function processLargeDocumentEfficiently(string $filePath): void
{
    $pageConfig = new \Kreuzberg\Config\PageConfig(extractPages: true);
    $extractor = new Kreuzberg(
        new ExtractionConfig(page: $pageConfig, extractImages: false)
    );

    $result = $extractor->extractFile($filePath);

    echo "Processing large document page by page:\n";
    // A result without pages is treated as an empty page list.
    foreach ($result->pages ?? [] as $page) {
        // Placeholder for real per-page work: the content is read and
        // immediately dropped again.
        $text = $page->content;
        unset($text);

        echo " Processed page {$page->pageNumber}\n";
    }

    unset($result);
    gc_collect_cycles();
}
// Only run the page-by-page demo when the sample document is present.
$hugeDocumentPath = 'huge_document.pdf';
if (file_exists($hugeDocumentPath)) {
    processLargeDocumentEfficiently($hugeDocumentPath);
}
/**
 * Time batch extraction of the given files at several batch sizes and return
 * the size with the highest throughput.
 *
 * The candidate sizes are 1, 5, 10, 20 and 50. For each size the files are
 * split into batches, every batch is passed to batch_extract_files(), and the
 * resulting files-per-second figure is printed. Ties go to the smaller size.
 *
 * @param array $files Paths of the documents to extract.
 *
 * @return int The batch size that achieved the highest throughput.
 */
function findOptimalBatchSize(array $files): int
{
    $batchSizes = [1, 5, 10, 20, 50];
    $fileCount = count($files);

    $bestSize = $batchSizes[0];
    $bestThroughput = null;

    foreach ($batchSizes as $size) {
        $startTime = microtime(true);
        foreach (array_chunk($files, $size) as $batch) {
            batch_extract_files($batch);
        }
        $elapsed = microtime(true) - $startTime;

        // A run can finish within the clock's resolution (e.g. an empty file
        // list); dividing by a zero duration would throw DivisionByZeroError.
        $throughput = $elapsed > 0 ? $fileCount / $elapsed : 0.0;

        echo "Batch size $size: " . number_format($throughput, 2) . " files/sec\n";

        // Strict ">" keeps the earliest (smallest) size on ties.
        if ($bestThroughput === null || $throughput > $bestThroughput) {
            $bestThroughput = $throughput;
            $bestSize = $size;
        }
    }

    return $bestSize;
}
// The batch-size search is only run when at least five sample files exist.
if (count($files) >= 5) {
    echo "\nFinding optimal batch size:\n", str_repeat('=', 60), "\n";

    $optimalSize = findOptimalBatchSize($files);

    echo "\nOptimal batch size: $optimalSize\n\n";
}
/**
 * Records labelled checkpoints of elapsed time and memory usage, both
 * relative to the moment the monitor was constructed, and prints them as a
 * table.
 */
class ResourceMonitor
{
    /** Timestamp taken at construction (microtime(true)). */
    private float $startTime;

    /** memory_get_usage() value taken at construction. */
    private int $startMemory;

    /** Recorded checkpoints, in the order they were taken. */
    private array $checkpoints = [];

    public function __construct()
    {
        $this->startTime = microtime(true);
        $this->startMemory = memory_get_usage();
    }

    /**
     * Record the time and memory consumed since construction, plus the
     * current peak memory usage.
     *
     * @param string $label Name shown for this checkpoint in the report.
     */
    public function checkpoint(string $label): void
    {
        $elapsed = microtime(true) - $this->startTime;
        $memoryDelta = memory_get_usage() - $this->startMemory;
        $peak = memory_get_peak_usage();

        $this->checkpoints[] = [
            'label' => $label,
            'time' => $elapsed,
            'memory' => $memoryDelta,
            'peak' => $peak,
        ];
    }

    /**
     * Print one row per checkpoint followed by the current peak memory usage.
     */
    public function report(): void
    {
        echo "Resource Monitor Report:\n", str_repeat('=', 60), "\n";

        foreach ($this->checkpoints as ['label' => $label, 'time' => $seconds, 'memory' => $bytes]) {
            printf("%-30s | Time: %6.3fs | Mem: %6.2f MB\n", $label, $seconds, $bytes / 1024 / 1024);
        }

        $peakMegabytes = number_format(memory_get_peak_usage() / 1024 / 1024, 2);
        echo "\nPeak memory: " . $peakMegabytes . " MB\n";
    }
}
// Demonstrates ResourceMonitor: a checkpoint is taken after each step, so the
// order of the statements below determines what every checkpoint measures.
$monitor = new ResourceMonitor();
$kreuzberg = new Kreuzberg();
$monitor->checkpoint("Kreuzberg initialized");
// NOTE(review): unlike the other demos in this script, this extraction is not
// guarded by file_exists() — confirm 'document.pdf' is expected to be present.
$result = $kreuzberg->extractFile('document.pdf');
$monitor->checkpoint("Document extracted");
// Stand-in for post-processing work on the extracted content.
$words = str_word_count($result->content);
$monitor->checkpoint("Word count completed");
// Drop the result and force a collection cycle before the final checkpoint.
unset($result);
gc_collect_cycles();
$monitor->checkpoint("Memory freed");
$monitor->report();
/**
 * Split the files into roughly $workers equal groups and extract each group
 * with batch_extract_files(), returning all results merged into one array.
 *
 * NOTE(review): the groups are handed to batch_extract_files() one after the
 * other in this function; any actual parallelism would have to come from
 * batch_extract_files() itself — confirm.
 *
 * @param array $files   Paths of the documents to extract.
 * @param int   $workers Desired number of groups; values below 1 are
 *                       treated as 1.
 *
 * @return array Merged results of every batch call; empty for empty input.
 */
function processConcurrently(array $files, int $workers = 4): array
{
    if ($files === []) {
        // array_chunk() rejects a length of 0, which an empty list would produce.
        return [];
    }

    // ceil() returns a float; array_chunk() requires an int under strict_types.
    $chunkSize = (int) ceil(count($files) / max(1, $workers));

    $batches = [];
    foreach (array_chunk($files, $chunkSize) as $chunk) {
        $batches[] = batch_extract_files($chunk);
    }

    // One merge at the end instead of re-copying the accumulator per batch.
    return array_merge(...$batches);
}
/**
 * In-memory cache in front of a Kreuzberg instance.
 *
 * Results are keyed by file path plus modification time, so a file that
 * changes on disk is extracted again. When the cache is full the oldest
 * entry is evicted (first in, first out).
 */
class CachedKreuzberg
{
    /** Cached results keyed by an md5 of path and modification time. */
    private array $cache = [];

    /** Maximum number of cached results; 0 or less disables caching. */
    private int $maxCacheSize;

    /**
     * @param Kreuzberg $kreuzberg    Extractor used on cache misses.
     * @param int       $maxCacheSize Maximum number of results kept in memory.
     */
    public function __construct(
        private Kreuzberg $kreuzberg,
        int $maxCacheSize = 100
    ) {
        $this->maxCacheSize = $maxCacheSize;
    }

    /**
     * Return the cached result for the file, extracting it on a cache miss.
     *
     * @param string $filePath Path of the document to extract.
     *
     * @return \Kreuzberg\Types\ExtractionResult
     */
    public function extractFile(string $filePath): \Kreuzberg\Types\ExtractionResult
    {
        // PHP caches stat() results per path; clear the entry so a file
        // modified during this process is seen with its new mtime.
        clearstatcache(true, $filePath);
        $modifiedAt = is_file($filePath) ? filemtime($filePath) : false;

        // The separator keeps "a1" + "23" and "a" + "123" from colliding, and
        // an unreadable mtime gets an explicit marker instead of an empty string.
        $cacheKey = md5($filePath . '|' . ($modifiedAt === false ? 'unknown' : $modifiedAt));

        if (isset($this->cache[$cacheKey])) {
            return $this->cache[$cacheKey];
        }

        $result = $this->kreuzberg->extractFile($filePath);

        if ($this->maxCacheSize <= 0) {
            // Caching disabled: never store anything.
            return $result;
        }

        while (count($this->cache) >= $this->maxCacheSize) {
            // Evict the oldest entry without touching the remaining keys.
            unset($this->cache[array_key_first($this->cache)]);
        }

        $this->cache[$cacheKey] = $result;

        return $result;
    }

    /**
     * Drop every cached result.
     */
    public function clearCache(): void
    {
        $this->cache = [];
    }
}
// Benchmark the same extraction twice through the caching wrapper; the
// second run is expected to be served from the cache.
$cachedKreuzberg = new CachedKreuzberg(new Kreuzberg(), maxCacheSize: 50);

echo "\nCached extraction performance:\n", str_repeat('=', 60), "\n";

$file = 'document.pdf';
if (file_exists($file)) {
    $extractThroughCache = static fn () => $cachedKreuzberg->extractFile($file);

    benchmark($extractThroughCache, "First extraction (uncached)");
    benchmark($extractThroughCache, "Second extraction (cached)");
}

// Closing summary of the techniques shown in this script.
echo "\nPerformance Tips:\n", str_repeat('=', 60), "\n";
echo <<<'TIPS'
1. Use batch processing for multiple files
2. Disable features you don't need (images, tables, OCR)
3. Process pages individually for very large documents
4. Use appropriate batch sizes (test to find optimal)
5. Implement caching for frequently accessed documents
6. Monitor memory usage and clear results when done
7. Consider using worker processes for high throughput
8. Increase PHP memory_limit for large documents

TIPS;
```

View File

@@ -0,0 +1,23 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
// Extraction with quality processing and caching switched on; the quality
// score and processing time are printed only when the result provides them.
$config = new ExtractionConfig(enableQualityProcessing: true, useCache: true);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

$qualityScore = $result->getQualityScore();
if ($qualityScore !== null) {
    echo "Quality score: " . $qualityScore . "\n";
}

$processingTime = $result->getProcessingTime();
if ($processingTime !== null) {
    echo "Processing time: " . $processingTime . "ms\n";
}
?>
```

View File

@@ -0,0 +1,27 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
// Extract with quality processing and warn when the score is below 0.5.
$config = new ExtractionConfig(enableQualityProcessing: true);

$result = Kreuzberg::extractFileSync('scanned_document.pdf', null, $config);

$score = $result->getQualityScore();
if ($score === null) {
    echo "Quality score not available.\n";
} elseif ($score < 0.5) {
    echo "Warning: Low quality extraction (" . round($score, 2) . ")\n";
} else {
    echo "Quality score: " . round($score, 2) . "\n";
}

// NOTE(review): strlen() reports bytes; the label says "characters".
echo "Extracted text length: " . strlen($result->getContent()) . " characters\n";
?>
```

View File

@@ -0,0 +1,20 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\TokenReductionOptions;
// Token reduction in 'moderate' mode with preserveImportantWords enabled;
// prints the first 100 bytes of the resulting content.
$reduction = new TokenReductionOptions(mode: 'moderate', preserveImportantWords: true);
$config = new ExtractionConfig(tokenReduction: $reduction);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

$preview = substr($result->getContent(), 0, 100);
echo "Reduced content: " . $preview . "...\n";
?>
```

View File

@@ -0,0 +1,26 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\TokenReductionOptions;
// Token reduction ('moderate', preserveImportantWords on) with a report of
// the token count (when available), the content length and a short preview.
$config = new ExtractionConfig(
    tokenReduction: new TokenReductionOptions(mode: 'moderate', preserveImportantWords: true),
);

$result = Kreuzberg::extractFileSync('verbose_document.pdf', null, $config);

$tokenCount = $result->getTokenCount();
if ($tokenCount !== null) {
    echo "Original token count: " . $tokenCount . "\n";
}

// Access the reduced content
$reduced = $result->getContent();
echo "Reduced content length: " . strlen($reduced) . " characters\n";
echo "Content preview: " . substr($reduced, 0, 100) . "...\n";
?>
```

View File

@@ -0,0 +1,70 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
use Kreuzberg\EmbeddingConfig;
/**
 * Plain data holder for one embedded chunk: an identifier, the chunk text,
 * its embedding and an arbitrary metadata map. All properties are public
 * and writable.
 */
class VectorRecord
{
    public string $id;
    public string $content;
    public array $embedding;
    public array $metadata;

    /**
     * @param string $id        Identifier of the record.
     * @param string $content   Text the embedding was computed from.
     * @param array  $embedding Embedding values.
     * @param array  $metadata  Additional key/value information.
     */
    public function __construct(string $id, string $content, array $embedding, array $metadata)
    {
        $this->id = $id;
        $this->content = $content;
        $this->embedding = $embedding;
        $this->metadata = $metadata;
    }
}
/**
 * Extract a document in chunks with embeddings and turn every chunk that
 * has an embedding into a VectorRecord.
 *
 * Chunks without an embedding are skipped. Record ids have the form
 * "<documentId>_chunk_<index>", and the metadata holds the document id, the
 * chunk index and the content length (in bytes), the latter two as strings.
 *
 * @param string $documentPath Path of the document to extract.
 * @param string $documentId   Identifier used in record ids and metadata.
 *
 * @return array List of VectorRecord instances.
 */
function extractAndVectorize(
    string $documentPath,
    string $documentId
): array {
    $embeddingConfig = new EmbeddingConfig(normalize: true, batchSize: 32);
    $chunkingConfig = new ChunkingConfig(
        maxCharacters: 512,
        overlap: 50,
        embedding: $embeddingConfig,
    );

    $result = Kreuzberg::extractFileSync(
        $documentPath,
        null,
        new ExtractionConfig(chunking: $chunkingConfig)
    );

    $records = [];
    foreach (($result->getChunks() ?: []) as $index => $chunk) {
        $embedding = $chunk->getEmbedding();
        if (!$embedding) {
            continue;
        }

        $text = $chunk->getContent();
        $records[] = new VectorRecord(
            id: "{$documentId}_chunk_{$index}",
            content: $text,
            embedding: $embedding,
            metadata: [
                'document_id' => $documentId,
                'chunk_index' => (string)$index,
                'content_length' => (string)strlen($text),
            ],
        );
    }

    return $records;
}
// Usage: vectorize one document and print a short summary per record.
$records = extractAndVectorize('research_paper.pdf', 'doc_123');

foreach ($records as $record) {
    echo "Vector ID: {$record->id}\n";
    echo "Content length: " . strlen($record->content) . " characters\n";
    echo "Embedding dimension: " . count($record->embedding) . "\n";
    echo "---\n";
}
?>
```