This commit is contained in:
80
docs/snippets/php/api/combining_all_features.md
Normal file
80
docs/snippets/php/api/combining_all_features.md
Normal file
@@ -0,0 +1,80 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\OcrConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\ChunkSizing;
|
||||
use Kreuzberg\ImageExtractionConfig;
|
||||
use Kreuzberg\OutputFormat;
|
||||
|
||||
// Base extraction config; the OCR, chunking, and image settings are attached
// to this object afterwards. Arguments are positional, so each value is
// labelled with the option it fills.
$config = new ExtractionConfig(
    /* caching */ null,
    /* force_ocr */ false,
    /* max_concurrent_extractions */ null,
    /* cache_dir */ null,
    /* output_format */ OutputFormat::Markdown,
    /* include_document_structure */ true,
    /* enable_quality_processing */ true,
    /* use_cache */ true,
    /* use_diffs */ null,
    /* keep_empty_chunks */ null,
);
|
||||
|
||||
// OCR: Tesseract backend with English as the language; every other option is
// passed as null. The OcrConfig is built inline since nothing else reads it.
$config->setOcr(new OcrConfig(
    /* backend */ 'tesseract',
    /* language */ 'eng',
    /* page_count_hint */ null,
    /* psm_mode */ null,
    /* use_gpu */ null,
    /* languages */ null,
    /* fast_mode */ null,
    /* fast_weight */ null,
    /* min_confidence */ null,
));
|
||||
|
||||
// Chunking: Markdown chunker, 800 max characters with a 100-character overlap,
// trimming and heading-context prepending switched on. Built inline because
// the ChunkingConfig is only needed for this one call.
$config->setChunking(new ChunkingConfig(
    /* max_characters */ 800,
    /* overlap */ 100,
    /* trim */ true,
    /* chunker_type */ 'Markdown',
    /* preset */ null,
    /* prepend_heading_context */ true,
    /* topic_threshold */ null,
));
|
||||
|
||||
// Image extraction: enabled, with the size, format, and compression options
// all passed as null. Built inline because nothing else reads the object.
$config->setImages(new ImageExtractionConfig(
    /* extract_images */ true,
    /* image_min_width */ null,
    /* image_min_height */ null,
    /* image_output_format */ null,
    /* image_compression_level */ null,
));
|
||||
|
||||
// Run a synchronous extraction of report.pdf using the combined configuration.
$result = Kreuzberg::extractFileSync('report.pdf', null, $config);

// Read the content once, then count and truncate by characters instead of
// bytes: strlen()/substr() are byte-wise, so the "chars" label would be wrong
// for non-ASCII text and the 200-unit preview could split a multi-byte
// sequence. NOTE(review): assumes getContent() returns UTF-8 text — confirm.
$content = $result->getContent();
echo "Content (" . mb_strlen($content, 'UTF-8') . " chars):\n";
echo mb_substr($content, 0, 200, 'UTF-8') . "\n\n";

// The chunk count is printed only when the getter returns a non-null value.
$chunks = $result->getChunks();
if ($chunks !== null) {
    echo "Chunks: " . count($chunks) . "\n";
}
echo "Tables: " . count($result->getTables()) . "\n";

// Detected languages are printed as a comma-separated list when present.
$detectedLanguages = $result->getDetectedLanguages();
if ($detectedLanguages !== null) {
    echo "Languages: " . implode(', ', $detectedLanguages) . "\n";
}

// The extraction method is printed only when the result reports one.
$extractionMethod = $result->getExtractionMethod();
if ($extractionMethod !== null) {
    echo "Extraction method: " . $extractionMethod . "\n";
}
|
||||
```
|
||||
Reference in New Issue
Block a user