This commit is contained in:
300
docs/snippets/php/README.md
Normal file
300
docs/snippets/php/README.md
Normal file
@@ -0,0 +1,300 @@
|
||||
# Kreuzberg PHP Snippets
|
||||
|
||||
Comprehensive code examples for the Kreuzberg PHP bindings. These snippets demonstrate all major features and use cases.
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```text
|
||||
php/
|
||||
├── installation/ # Getting started, setup, requirements
|
||||
├── quickstart/ # Basic usage examples
|
||||
├── configuration/ # Configuration classes and options
|
||||
├── extraction/ # Document extraction examples
|
||||
├── async/ # Async extraction with DeferredResult
|
||||
├── ocr/ # OCR and image preprocessing
|
||||
├── chunking/ # Text chunking for RAG
|
||||
├── embeddings/ # Vector embeddings and semantic search
|
||||
├── advanced/ # Error handling, performance tuning
|
||||
├── cache/ # Caching strategies
|
||||
├── cli/ # Command-line tools
|
||||
└── benchmarking/ # Performance testing
|
||||
```
|
||||
|
||||
## Installation (3 snippets)
|
||||
|
||||
### Composer_install.php
|
||||
|
||||
Installing Kreuzberg via Composer and verifying the extension is loaded.
|
||||
|
||||
### Extension_setup.php
|
||||
|
||||
Setting up the native PHP extension (kreuzberg.so/.dll) and checking for optional dependencies (Tesseract, ONNX Runtime).
|
||||
|
||||
### Requirements_check.php
|
||||
|
||||
Comprehensive system requirements verification script.
|
||||
|
||||
## Quickstart (4 snippets)
|
||||
|
||||
### basic_extraction_oop.php
|
||||
|
||||
Simple document extraction using the object-oriented API.
|
||||
|
||||
### Basic_extraction_procedural.php
|
||||
|
||||
Simple extraction using the procedural API for more concise code.
|
||||
|
||||
### Extract_from_bytes.php
|
||||
|
||||
Extract content from file data in memory (useful for uploaded files).
|
||||
|
||||
### Mime_type_detection.php
|
||||
|
||||
Automatic MIME type detection from file paths or content.
|
||||
|
||||
## Configuration (5 snippets)
|
||||
|
||||
### Extraction_config.php
|
||||
|
||||
Main ExtractionConfig class - controlling all aspects of extraction.
|
||||
|
||||
### Pdf_config.php
|
||||
|
||||
PDF-specific settings including image quality and extraction methods.
|
||||
|
||||
### Page_config.php
|
||||
|
||||
Per-page extraction and page markers for maintaining document structure.
|
||||
|
||||
### Language_detection_config.php
|
||||
|
||||
Automatic language detection for multilingual documents.
|
||||
|
||||
### Keyword_config.php
|
||||
|
||||
Automatic keyword extraction for document categorization.
|
||||
|
||||
## Extraction (7 snippets)
|
||||
|
||||
### Pdf_extraction.php
|
||||
|
||||
Extract text, tables, and images from PDF files with various configurations.
|
||||
|
||||
### Docx_extraction.php
|
||||
|
||||
Extract content from Microsoft Word documents including metadata and tables.
|
||||
|
||||
### Image_extraction.php
|
||||
|
||||
Extract embedded images from documents with optional OCR.
|
||||
|
||||
### Batch_processing.php
|
||||
|
||||
Process multiple documents in parallel for maximum performance.
|
||||
|
||||
### Table_extraction.php
|
||||
|
||||
Extract and process tables, export to CSV, JSON, and HTML formats.
|
||||
|
||||
### Metadata_extraction.php
|
||||
|
||||
Extract document metadata (title, author, dates, keywords).
|
||||
|
||||
### Multi_format.php
|
||||
|
||||
Handle various document formats with format-specific processing.
|
||||
|
||||
## OCR (3 snippets)
|
||||
|
||||
### Basic_ocr.php
|
||||
|
||||
Basic OCR with Tesseract for scanned documents and images.
|
||||
|
||||
### Advanced_ocr.php
|
||||
|
||||
Advanced OCR configuration with Tesseract PSM modes and table detection.
|
||||
|
||||
### Image_preprocessing.php
|
||||
|
||||
Image preprocessing for better OCR accuracy (denoising, deskewing, sharpening).
|
||||
|
||||
## Chunking (1 snippet)
|
||||
|
||||
### Basic_chunking.php
|
||||
|
||||
Split documents into chunks for RAG applications with various strategies.
|
||||
|
||||
## Embeddings (2 snippets)
|
||||
|
||||
### Basic_embeddings.php
|
||||
|
||||
Generate vector embeddings for semantic search and similarity matching.
|
||||
|
||||
### Semantic_search.php
|
||||
|
||||
Build a semantic search system using document embeddings.
|
||||
|
||||
## Advanced (2 snippets)
|
||||
|
||||
### error_handling.php
|
||||
|
||||
Robust error handling, retry strategies, and validation.
|
||||
|
||||
### performance_tuning.php
|
||||
|
||||
Performance optimization tips and techniques.
|
||||
|
||||
## Cache (1 snippet)
|
||||
|
||||
### Disk_cache.php
|
||||
|
||||
File-based caching to avoid re-processing documents.
|
||||
|
||||
## CLI (2 snippets)
|
||||
|
||||
### Basic_cli.php
|
||||
|
||||
Simple command-line interface for document extraction.
|
||||
|
||||
### Cli_with_config.php
|
||||
|
||||
Advanced CLI with support for various extraction options.
|
||||
|
||||
## Benchmarking (1 snippet)
|
||||
|
||||
### Simple_benchmark.php
|
||||
|
||||
Benchmark extraction performance across different configurations.
|
||||
|
||||
## Usage Patterns
|
||||
|
||||
### Basic Extraction
|
||||
|
||||
```php title="Basic Extraction"
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
$kreuzberg = new Kreuzberg();
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
echo $result->content;
|
||||
```
|
||||
|
||||
### With Configuration
|
||||
|
||||
```php title="With Configuration"
|
||||
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\OcrConfig;
use Kreuzberg\Kreuzberg;

// Enable OCR and table extraction through a single ExtractionConfig.
$config = new ExtractionConfig(
    ocr: new OcrConfig(backend: 'tesseract', language: 'eng'),
    extractTables: true,
);

// The Kreuzberg class is imported above so this snippet runs as pasted.
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned.pdf');
|
||||
```
|
||||
|
||||
### Procedural API
|
||||
|
||||
```php title="Procedural API"
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
$result = extract_file('document.pdf');
|
||||
echo $result->content;
|
||||
```
|
||||
|
||||
### Batch Processing
|
||||
|
||||
```php title="Batch Processing"
|
||||
use function Kreuzberg\batch_extract_files;
|
||||
|
||||
$files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
||||
$results = batch_extract_files($files);
|
||||
```
|
||||
|
||||
## Async Extraction (4 snippets)
|
||||
|
||||
### Async_extract_file.php
|
||||
|
||||
Async file extraction with DeferredResult polling and blocking patterns.
|
||||
|
||||
### Async_batch.php
|
||||
|
||||
Async batch extraction with timeout-based waiting.
|
||||
|
||||
### Async_amp_bridge.php
|
||||
|
||||
Integration with Amp v3+ framework using AmpBridge::toFuture().
|
||||
|
||||
### Async_react_bridge.php
|
||||
|
||||
Integration with ReactPHP framework using ReactBridge::toPromise().
|
||||
|
||||
## Key Features Demonstrated
|
||||
|
||||
- **90+ File Formats**: PDF, DOCX, XLSX, PPTX, images, HTML, and more
|
||||
- **Async Extraction**: Non-blocking extraction with DeferredResult pattern
|
||||
- **OCR Support**: Tesseract integration with preprocessing
|
||||
- **Table Extraction**: Extract structured tables with multiple export formats
|
||||
- **Metadata**: Rich metadata extraction for all formats
|
||||
- **Batch Processing**: Parallel processing for high throughput
|
||||
- **Text Chunking**: Intelligent segmentation for RAG applications
|
||||
- **Embeddings**: Vector embeddings for semantic search
|
||||
- **Type Safety**: Full PHP 8.1+ type hints and readonly properties
|
||||
- **Error Handling**: Comprehensive error handling patterns
|
||||
- **Performance**: Optimization techniques and benchmarking
|
||||
|
||||
## Requirements
|
||||
|
||||
- PHP 8.1.0 or higher
|
||||
- Kreuzberg PHP extension (kreuzberg.so/.dll)
|
||||
- Composer package: kreuzberg/kreuzberg
|
||||
- Optional: Tesseract OCR (for OCR functionality)
|
||||
- Optional: ONNX Runtime (for embeddings)
|
||||
|
||||
## Testing Snippets
|
||||
|
||||
Each snippet is designed to be self-contained and runnable. To test:
|
||||
|
||||
1. Install dependencies:
|
||||
|
||||
```bash
|
||||
composer require kreuzberg/kreuzberg
|
||||
```
|
||||
|
||||
2. Ensure the extension is loaded:
|
||||
|
||||
```bash
|
||||
php -m | grep kreuzberg
|
||||
```
|
||||
|
||||
3. Run any snippet:
|
||||
|
||||
```bash
|
||||
php docs/snippets/php/quickstart/basic_extraction_oop.php
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Use batch processing** for multiple files
|
||||
2. **Disable unnecessary features** (OCR, embeddings) if not needed
|
||||
3. **Implement caching** for frequently accessed documents
|
||||
4. **Handle errors gracefully** with try-catch blocks
|
||||
5. **Monitor memory usage** for large documents
|
||||
6. **Use type hints** for better IDE support and safety
|
||||
|
||||
## Contributing
|
||||
|
||||
These snippets follow these conventions:
|
||||
|
||||
- All files use `declare(strict_types=1)`
|
||||
- Code is wrapped in ```` ```php ```` markdown code blocks
|
||||
- Clear comments explain what each snippet demonstrates
|
||||
- Both OOP and procedural APIs are shown where applicable
|
||||
- Examples are realistic and production-ready
|
||||
|
||||
## Links
|
||||
|
||||
- **Documentation**: <https://kreuzberg.dev>
|
||||
- **GitHub**: <https://github.com/kreuzberg-dev/kreuzberg>
|
||||
- **Issues**: <https://github.com/kreuzberg-dev/kreuzberg/issues>
|
||||
- **Package**: <https://packagist.org/packages/kreuzberg/kreuzberg>
|
||||
41
docs/snippets/php/advanced/chunk_page_mapping.md
Normal file
41
docs/snippets/php/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\PageConfig;

// Chunk the document and extract pages in the same run; the loop below reads
// the first/last page recorded in each chunk's metadata.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
    ),
    pages: new PageConfig(
        extractPages: true,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Fetch the chunk list once; a null or empty result is treated as "no chunks".
foreach ($result->getChunks() ?: [] as $chunk) {
    $metadata = $chunk->getMetadata();
    if (!$metadata) {
        continue;
    }

    $firstPage = $metadata->getFirstPage();
    $lastPage = $metadata->getLastPage();
    if ($firstPage === null || $lastPage === null) {
        continue;
    }

    $pageRange = $firstPage === $lastPage
        ? "Page {$firstPage}"
        : "Pages {$firstPage}-{$lastPage}";

    // mb_substr() cuts on character boundaries; substr() works on bytes and
    // could split a multi-byte UTF-8 character in the preview.
    echo "Chunk: " . mb_substr($chunk->getContent(), 0, 50) . "... ({$pageRange})\n";
}
|
||||
```
|
||||
79
docs/snippets/php/advanced/chunking_config.md
Normal file
79
docs/snippets/php/advanced/chunking_config.md
Normal file
@@ -0,0 +1,79 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Basic chunking
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 1000,
        overlap: 200,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// NOTE(review): getChunks() presumably returns null when nothing was chunked
// (confirm against the API); normalise so count()/foreach never see null.
$chunks = $result->getChunks() ?? [];

echo "Number of chunks: " . count($chunks) . "\n";
foreach ($chunks as $chunk) {
    // mb_strlen() counts characters, matching the label; strlen() counts bytes.
    echo "Chunk size: " . mb_strlen($chunk->getContent()) . " characters\n";
}
|
||||
```
|
||||
|
||||
```php title="PHP - Semantic Chunking"
|
||||
<?php
|
||||
declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Semantic chunker with a topic threshold.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
        chunkerType: 'semantic',
        topicThreshold: 0.75,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// NOTE(review): getChunks() presumably may be null (confirm against the API);
// count(null) is a TypeError, so fall back to an empty list.
$chunks = $result->getChunks() ?? [];

echo "Chunks with topic-based boundaries: " . count($chunks) . "\n";
|
||||
```
|
||||
|
||||
```php title="PHP - Prepend Heading Context"
|
||||
<?php
|
||||
declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Markdown chunker that prepends heading context to each chunk.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
        chunkerType: 'markdown',
        prependHeadingContext: true,
    ),
);

$result = Kreuzberg::extractFileSync('document.md', null, $config);

// NOTE(review): getChunks() presumably may be null (confirm against the API);
// fall back to an empty list so foreach never receives null.
foreach ($result->getChunks() ?? [] as $chunk) {
    // Fetch the heading context once; it is absent when the chunk has no
    // metadata or no headings were recorded for it.
    $headingContext = $chunk->getMetadata()?->getHeadingContext();
    if ($headingContext) {
        foreach ($headingContext->getHeadings() as $heading) {
            echo "Heading L" . $heading->getLevel() . ": " . $heading->getText() . "\n";
        }
    }

    // mb_substr() truncates on character boundaries rather than bytes.
    echo "Content: " . mb_substr($chunk->getContent(), 0, 100) . "...\n";
}
|
||||
```
|
||||
39
docs/snippets/php/advanced/chunking_rag.md
Normal file
39
docs/snippets/php/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\EmbeddingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Chunking with embeddings configured on the chunker.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
        embedding: new EmbeddingConfig(
            normalize: true,
            batchSize: 32,
        ),
    ),
);

$result = Kreuzberg::extractFileSync('research_paper.pdf', null, $config);

// Fetch the chunk list once; a null or empty result is treated as "no chunks".
foreach ($result->getChunks() ?: [] as $chunk) {
    $metadata = $chunk->getMetadata();
    if ($metadata) {
        // getChunkIndex() is shown one-based here (index + 1).
        echo "Chunk " . ($metadata->getChunkIndex() + 1) . "/" . $metadata->getTotalChunks() . "\n";
        echo "Position: " . $metadata->getByteStart() . "-" . $metadata->getByteEnd() . "\n";
        // mb_substr() keeps the preview from ending inside a multi-byte character.
        echo "Content: " . mb_substr($chunk->getContent(), 0, 100) . "...\n";

        $embedding = $chunk->getEmbedding();
        if ($embedding) {
            echo "Embedding: " . count($embedding) . " dimensions\n";
        }
    }

    // Blank separator line after every chunk, with or without metadata.
    echo "\n";
}
|
||||
```
|
||||
39
docs/snippets/php/advanced/embedding_with_chunking.md
Normal file
39
docs/snippets/php/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\EmbeddingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Chunking with embeddings; the model download progress display is disabled.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 1024,
        overlap: 100,
        embedding: new EmbeddingConfig(
            normalize: true,
            batchSize: 32,
            showDownloadProgress: false,
        ),
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Fetch the chunk list once; a null or empty result is treated as "no chunks".
foreach ($result->getChunks() ?: [] as $chunk) {
    // mb_substr() truncates on character boundaries; substr() works on bytes
    // and could cut a multi-byte UTF-8 character in half.
    echo "Chunk content: " . mb_substr($chunk->getContent(), 0, 100) . "...\n";

    $embedding = $chunk->getEmbedding();
    if ($embedding) {
        echo "Embedding dimension: " . count($embedding) . "\n";
        echo "First 5 values: ";
        echo implode(", ", array_slice($embedding, 0, 5));
        echo "\n";
    }

    echo "\n";
}
|
||||
```
|
||||
313
docs/snippets/php/advanced/error_handling.php
Normal file
313
docs/snippets/php/advanced/error_handling.php
Normal file
@@ -0,0 +1,313 @@
|
||||
```php title="error_handling.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Error Handling
|
||||
*
|
||||
* Robust error handling for document extraction operations.
|
||||
* Handle failures gracefully and implement retry strategies.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Exceptions\KreuzbergException;
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
try {
|
||||
$result = extract_file('document.pdf');
|
||||
echo "Extraction successful!\n";
|
||||
echo "Content length: " . strlen($result->content) . "\n";
|
||||
} catch (KreuzbergException $e) {
|
||||
echo "Error: " . $e->getMessage() . "\n";
|
||||
echo "Code: " . $e->getCode() . "\n";
|
||||
error_log("Kreuzberg extraction failed: " . $e->getMessage());
|
||||
}
|
||||
|
||||
/**
 * Extract a document's text, returning null instead of throwing.
 *
 * The path is checked for existence and readability before extraction is
 * attempted. Every failure is written to the PHP error log.
 *
 * @param string $filePath Path of the document to extract.
 * @return string|null Extracted content, or null when the file is missing,
 *                     unreadable, or extraction raised a KreuzbergException.
 */
function safeExtract(string $filePath): ?string
{
    // Arms are evaluated in order, so is_readable() only runs for existing paths.
    $problem = match (true) {
        !file_exists($filePath) => "File not found: $filePath",
        !is_readable($filePath) => "File not readable: $filePath",
        default => null,
    };

    if ($problem !== null) {
        error_log($problem);

        return null;
    }

    try {
        return extract_file($filePath)->content;
    } catch (KreuzbergException $failure) {
        error_log("Extraction error for $filePath: " . $failure->getMessage());
    }

    return null;
}
|
||||
|
||||
$content = safeExtract('document.pdf');
|
||||
if ($content !== null) {
|
||||
echo "Successfully extracted document\n";
|
||||
} else {
|
||||
echo "Failed to extract document\n";
|
||||
}
|
||||
|
||||
/**
 * Extract a document, retrying with exponential back-off on failure.
 *
 * @param string $filePath     Path of the document to extract.
 * @param int    $maxRetries   Maximum number of extraction attempts; values
 *                             below 1 mean no attempt is made.
 * @param int    $initialDelay Wait before the second attempt, in milliseconds;
 *                             doubled after every further failure.
 * @return string|null Extracted content, or null once all attempts failed.
 */
function extractWithRetry(
    string $filePath,
    int $maxRetries = 3,
    int $initialDelay = 1000
): ?string {
    $waitMs = $initialDelay;

    for ($failures = 0; $failures < $maxRetries;) {
        try {
            return extract_file($filePath)->content;
        } catch (KreuzbergException $error) {
            ++$failures;

            if ($failures < $maxRetries) {
                echo "Attempt $failures failed, retrying in {$waitMs}ms...\n";
                // usleep() takes microseconds; the delay is kept in milliseconds.
                usleep($waitMs * 1000);
                $waitMs *= 2;

                continue;
            }

            error_log("Max retries exceeded for $filePath: " . $error->getMessage());

            return null;
        }
    }

    return null;
}
|
||||
|
||||
$content = extractWithRetry('potentially_corrupt.pdf');
|
||||
if ($content !== null) {
|
||||
echo "Document extracted after retry\n";
|
||||
}
|
||||
|
||||
/**
 * Sanity-check the text extracted from a document.
 *
 * The result is rejected when extraction throws, the content is empty, it has
 * fewer than 100 characters, it is not valid UTF-8, or more than half of its
 * characters are control/unassigned code points. Each rejection is logged.
 *
 * Length and the non-printable ratio are measured in Unicode characters, so
 * valid non-ASCII text (accented Latin, Cyrillic, CJK, ...) is not mistaken
 * for binary noise, and tabs/newlines count as printable.
 *
 * @param string $filePath Path of the document to extract and validate.
 * @return bool True when the extracted content passes every check.
 */
function validateExtractionResult(string $filePath): bool
{
    $minExpectedChars = 100;
    $maxNonPrintableRatio = 0.5;

    try {
        $content = (string) extract_file($filePath)->content;
    } catch (KreuzbergException $e) {
        error_log("Validation failed for $filePath: " . $e->getMessage());

        return false;
    }

    if ($content === '') {
        error_log("Empty content extracted from $filePath");

        return false;
    }

    $charCount = mb_strlen($content, 'UTF-8');
    if ($charCount < $minExpectedChars) {
        error_log("Content too short from $filePath: " . $charCount . " chars");

        return false;
    }

    // \p{C} is the Unicode "Other" category (control, format, unassigned,
    // private use). The double negation excludes tab, LF and CR from the match.
    $nonPrintable = preg_match_all('/[^\P{C}\t\n\r]/u', $content);
    if ($nonPrintable === false) {
        // With the /u modifier PCRE fails on malformed UTF-8 input.
        error_log("Content is not valid UTF-8 in $filePath");

        return false;
    }

    if ($nonPrintable / $charCount > $maxNonPrintableRatio) {
        error_log("High non-printable character ratio in $filePath");

        return false;
    }

    return true;
}
|
||||
|
||||
if (validateExtractionResult('document.pdf')) {
|
||||
echo "Extraction result validated successfully\n";
|
||||
} else {
|
||||
echo "Extraction result validation failed\n";
|
||||
}
|
||||
|
||||
// Extract every PDF in documents/, collecting successes and failures
// separately so one bad file does not abort the whole batch.
// glob() returns false on error; fall back to an empty list so the loop below
// never iterates over a boolean.
$files = glob('documents/*.pdf') ?: [];
$successful = [];
$failed = [];

foreach ($files as $file) {
    try {
        $result = extract_file($file);
        $successful[] = [
            'file' => $file,
            'content_length' => strlen($result->content),
            'tables' => count($result->tables),
        ];
    } catch (KreuzbergException $e) {
        $failed[] = [
            'file' => $file,
            'error' => $e->getMessage(),
            'code' => $e->getCode(),
        ];
    }
}

echo "\nBatch Processing Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Successful: " . count($successful) . "\n";
echo "Failed: " . count($failed) . "\n\n";

if ($failed !== []) {
    echo "Failed files:\n";
    foreach ($failed as $failure) {
        echo " - {$failure['file']}: {$failure['error']}\n";
    }
}
|
||||
|
||||
/**
 * Extract text from a file, falling back to progressively cruder strategies.
 *
 * Order of attempts: default extraction, extraction with Tesseract OCR
 * configured, then the raw file bytes. The first attempt that yields
 * non-empty content wins. Progress and failures are echoed.
 *
 * @param string $filePath Path of the document to extract.
 * @return string|null Extracted content, or null when every strategy failed.
 */
function extractWithFallback(string $filePath): ?string
{
    try {
        // Compare against '' explicitly: empty() would also reject the valid text "0".
        $content = extract_file($filePath)->content ?? '';
        if ($content !== '') {
            return $content;
        }
    } catch (KreuzbergException $e) {
        echo "Normal extraction failed, trying fallback strategies...\n";
    }

    try {
        $config = new ExtractionConfig(
            ocr: new \Kreuzberg\Config\OcrConfig(
                backend: 'tesseract',
                language: 'eng'
            )
        );
        $kreuzberg = new Kreuzberg($config);
        $content = $kreuzberg->extractFile($filePath)->content ?? '';
        if ($content !== '') {
            echo "Fallback: OCR extraction succeeded\n";

            return $content;
        }
    } catch (KreuzbergException $e) {
        echo "OCR fallback failed: " . $e->getMessage() . "\n";
    }

    // file_get_contents() reports failure by returning false (plus a warning),
    // not by throwing, so check the path first and test the return value.
    if (!is_file($filePath) || !is_readable($filePath)) {
        echo "Plain text fallback failed: file is missing or not readable\n";

        return null;
    }

    $raw = file_get_contents($filePath);
    if ($raw === false) {
        echo "Plain text fallback failed: could not read file\n";

        return null;
    }

    if ($raw !== '') {
        echo "Fallback: Reading as plain text\n";

        return $raw;
    }

    return null;
}
|
||||
|
||||
$content = extractWithFallback('problematic_file.pdf');
|
||||
if ($content !== null) {
|
||||
echo "Successfully extracted with fallback\n";
|
||||
}
|
||||
|
||||
/**
 * Extract a document under a script execution time limit.
 *
 * The limit is applied with set_time_limit() and the configured
 * max_execution_time is restored afterwards (the original reset it to 0,
 * leaving the rest of the script with no limit at all).
 *
 * Note: when PHP's execution limit is actually hit the script ends with a
 * fatal error, which cannot be caught here. The elapsed-time check below only
 * covers extractions that finish but took longer than the budget in
 * wall-clock time.
 *
 * @param string $filePath       Path of the document to extract.
 * @param int    $timeoutSeconds Time budget in seconds.
 * @return string|null Extracted content, or null on extraction failure or
 *                     when the budget was exceeded.
 */
function extractWithTimeout(string $filePath, int $timeoutSeconds = 30): ?string
{
    // ini_get() returns a string (or false); the cast yields 0 = unlimited.
    $previousLimit = (int) ini_get('max_execution_time');
    // microtime(true) gives sub-second resolution; time() only whole seconds.
    $startTime = microtime(true);

    try {
        set_time_limit($timeoutSeconds);

        $result = extract_file($filePath);

        if (microtime(true) - $startTime > $timeoutSeconds) {
            error_log("Extraction exceeded timeout for $filePath");

            return null;
        }

        return $result->content;
    } catch (KreuzbergException $e) {
        error_log("Extraction error: " . $e->getMessage());

        return null;
    } finally {
        // set_time_limit() restarts the timer, so the caller gets a fresh
        // window of its original length.
        set_time_limit($previousLimit);
    }
}
|
||||
|
||||
/**
 * Exception carrying the path (and, when known, the MIME type) of the
 * document whose extraction failed.
 */
class DocumentExtractionException extends \Exception
{
    /** Path of the document that could not be extracted. */
    public readonly string $filePath;

    /** MIME type of the document, or null when it was not supplied. */
    public readonly ?string $mimeType;

    /**
     * @param string          $message  Human-readable failure description.
     * @param string          $filePath Path of the affected document.
     * @param string|null     $mimeType MIME type of the document, if known.
     * @param \Throwable|null $previous Underlying cause, if any.
     */
    public function __construct(
        string $message,
        string $filePath,
        ?string $mimeType = null,
        ?\Throwable $previous = null
    ) {
        // The exception code is always 0.
        parent::__construct($message, 0, $previous);

        $this->filePath = $filePath;
        $this->mimeType = $mimeType;
    }
}
|
||||
|
||||
/**
 * Extract a document's text or throw a DocumentExtractionException.
 *
 * @param string $filePath Path of the document to extract.
 * @return string The non-empty extracted content.
 * @throws DocumentExtractionException When extraction raises a
 *         KreuzbergException (kept as the previous exception) or yields no
 *         content.
 */
function extractOrThrow(string $filePath): string
{
    try {
        $result = extract_file($filePath);
    } catch (KreuzbergException $e) {
        throw new DocumentExtractionException(
            "Extraction failed: " . $e->getMessage(),
            $filePath,
            previous: $e
        );
    }

    // Compare against '' explicitly: empty() would also reject a document
    // whose entire text is "0".
    $content = $result->content ?? '';
    if ($content === '') {
        throw new DocumentExtractionException(
            "No content extracted",
            $filePath,
            $result->mimeType
        );
    }

    return $content;
}
|
||||
|
||||
try {
|
||||
$content = extractOrThrow('document.pdf');
|
||||
echo "Content: " . substr($content, 0, 100) . "...\n";
|
||||
} catch (DocumentExtractionException $e) {
|
||||
echo "Failed to extract {$e->filePath}\n";
|
||||
echo "Reason: {$e->getMessage()}\n";
|
||||
if ($e->mimeType) {
|
||||
echo "MIME type: {$e->mimeType}\n";
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Wraps a Kreuzberg instance and logs every file extraction to a PSR-3
 * logger: start, success (with duration and sizes) and failure.
 */
class LoggingKreuzberg
{
    public function __construct(
        private Kreuzberg $kreuzberg,
        private \Psr\Log\LoggerInterface $logger
    ) {}

    /**
     * Extract a file through the wrapped instance, logging the outcome.
     *
     * @param string      $filePath Path of the document to extract.
     * @param string|null $mimeType Optional MIME type passed to the wrapped extractor.
     * @return \Kreuzberg\Types\ExtractionResult|null The result, or null when
     *         a KreuzbergException was raised (it is logged, not rethrown).
     */
    public function extractFile(string $filePath, ?string $mimeType = null): ?\Kreuzberg\Types\ExtractionResult
    {
        $this->logger->info("Starting extraction", ['file' => $filePath]);
        $begunAt = microtime(true);

        try {
            $extraction = $this->kreuzberg->extractFile($filePath, $mimeType);
            $seconds = microtime(true) - $begunAt;

            $successContext = [
                'file' => $filePath,
                'duration' => $seconds,
                'content_length' => strlen($extraction->content),
                'tables' => count($extraction->tables),
            ];
            $this->logger->info("Extraction successful", $successContext);

            return $extraction;
        } catch (KreuzbergException $error) {
            $seconds = microtime(true) - $begunAt;

            $failureContext = [
                'file' => $filePath,
                'duration' => $seconds,
                'error' => $error->getMessage(),
                'code' => $error->getCode(),
            ];
            $this->logger->error("Extraction failed", $failureContext);

            return null;
        }
    }
}
|
||||
```
|
||||
26
docs/snippets/php/advanced/keyword_extraction_config.md
Normal file
26
docs/snippets/php/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\KeywordConfig;
use Kreuzberg\Kreuzberg;

// Keyword extraction settings, built separately and then attached to the
// extraction config.
$keywordSettings = new KeywordConfig(
    algorithm: 'yake',
    maxKeywords: 10,
    minScore: 0.1,
    language: 'en'
);
$config = new ExtractionConfig(keywords: $keywordSettings);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Print one keyword per line; nothing is printed when none were returned.
$keywords = $result->getKeywords();
if ($keywords) {
    foreach ($keywords as $keyword) {
        echo "{$keyword}\n";
    }
}
|
||||
```
|
||||
29
docs/snippets/php/advanced/keyword_extraction_example.md
Normal file
29
docs/snippets/php/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\KeywordConfig;
use Kreuzberg\Kreuzberg;

// Keyword extraction settings, built separately and then attached to the
// extraction config.
$keywordSettings = new KeywordConfig(
    algorithm: 'yake',
    maxKeywords: 10,
    minScore: 0.3,
    language: 'en'
);
$config = new ExtractionConfig(keywords: $keywordSettings);

$result = Kreuzberg::extractFileSync('research_paper.pdf', null, $config);

// Print a numbered list, or a notice when no keywords came back.
$keywords = $result->getKeywords();
if (!$keywords) {
    echo "No keywords extracted.\n";
} else {
    echo "Extracted Keywords:\n";
    foreach ($keywords as $position => $keyword) {
        $rank = $position + 1;
        echo "{$rank}. {$keyword}\n";
    }
}
|
||||
```
|
||||
22
docs/snippets/php/advanced/language_detection_config.md
Normal file
22
docs/snippets/php/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\LanguageDetectionConfig;

// Language detection settings with multi-language detection switched off.
$detection = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false
);
$config = new ExtractionConfig(languageDetection: $detection);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Report the detected language and the confidence reported with it.
$language = $result->getLanguage();
echo "Detected language: {$language}\n";

$confidence = $result->getLanguageConfidence();
echo "Confidence: {$confidence}\n";
|
||||
```
|
||||
@@ -0,0 +1,30 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\LanguageDetectionConfig;

// Language detection settings with multi-language detection switched on.
$detection = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: true
);
$config = new ExtractionConfig(languageDetection: $detection);

$result = Kreuzberg::extractFileSync('multilingual_document.pdf', null, $config);

// List every detected language, or "None" when the list is empty/absent.
echo "Detected languages: ";
$languages = $result->getDetectedLanguages();
echo ($languages ? implode(", ", $languages) : "None") . "\n";

$primary = $result->getLanguage();
echo "Primary language: {$primary}\n";

$confidence = $result->getLanguageConfidence();
echo "Confidence: {$confidence}\n";
|
||||
```
|
||||
281
docs/snippets/php/advanced/performance_tuning.php
Normal file
281
docs/snippets/php/advanced/performance_tuning.php
Normal file
@@ -0,0 +1,281 @@
|
||||
```php title="performance_tuning.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Performance Tuning and Optimization
|
||||
*
|
||||
* Optimize document extraction for speed and resource usage.
|
||||
* Tips and techniques for processing large volumes of documents.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use function Kreuzberg\extract_file;
|
||||
use function Kreuzberg\batch_extract_files;
|
||||
|
||||
/**
 * Run a callable once and print its wall-clock time and memory usage.
 *
 * @param callable $fn    Code to measure; invoked with no arguments.
 * @param string   $label Heading printed above the measurements.
 */
function benchmark(callable $fn, string $label): void
{
    // Created before the baseline readings so it does not skew the memory delta.
    $asMegabytes = static fn (int $bytes): string => number_format($bytes / 1024 / 1024, 2);

    $clockBefore = microtime(true);
    $memoryBefore = memory_get_usage();

    // Keep the return value alive so memory it retains is part of the delta.
    $result = $fn();

    $seconds = microtime(true) - $clockBefore;
    $memoryDelta = memory_get_usage() - $memoryBefore;

    echo "$label:\n";
    echo " Time: " . number_format($seconds, 4) . "s\n";
    echo " Memory: " . $asMegabytes($memoryDelta) . " MB\n";
    // Peak usage is reported for the whole process, not just this call.
    echo " Peak memory: " . $asMegabytes(memory_get_peak_usage()) . " MB\n\n";
}
|
||||
|
||||
$files = array_filter(
|
||||
['doc1.pdf', 'doc2.pdf', 'doc3.pdf', 'doc4.pdf', 'doc5.pdf'],
|
||||
'file_exists'
|
||||
);
|
||||
|
||||
if (!empty($files)) {
|
||||
echo "Performance Comparison:\n";
|
||||
echo str_repeat('=', 60) . "\n\n";
|
||||
|
||||
benchmark(function () use ($files) {
|
||||
$results = [];
|
||||
foreach ($files as $file) {
|
||||
$results[] = extract_file($file);
|
||||
}
|
||||
return $results;
|
||||
}, "Single file processing");
|
||||
|
||||
benchmark(function () use ($files) {
|
||||
return batch_extract_files($files);
|
||||
}, "Batch processing (parallel)");
|
||||
}
|
||||
|
||||
$fastConfig = new ExtractionConfig(
|
||||
extractImages: false,
|
||||
extractTables: false,
|
||||
preserveFormatting: false
|
||||
);
|
||||
|
||||
$standardConfig = new ExtractionConfig(
|
||||
extractImages: true,
|
||||
extractTables: true,
|
||||
preserveFormatting: true
|
||||
);
|
||||
|
||||
$testFile = 'large_document.pdf';
|
||||
if (file_exists($testFile)) {
|
||||
echo "Configuration Impact:\n";
|
||||
echo str_repeat('=', 60) . "\n\n";
|
||||
|
||||
benchmark(function () use ($testFile, $fastConfig) {
|
||||
$kreuzberg = new Kreuzberg($fastConfig);
|
||||
return $kreuzberg->extractFile($testFile);
|
||||
}, "Fast config (minimal features)");
|
||||
|
||||
benchmark(function () use ($testFile, $standardConfig) {
|
||||
$kreuzberg = new Kreuzberg($standardConfig);
|
||||
return $kreuzberg->extractFile($testFile);
|
||||
}, "Standard config (all features)");
|
||||
}
|
||||
|
||||
/**
 * Extract a document with per-page output and walk its pages one at a time,
 * releasing references as it goes.
 *
 * @param string $filePath Path of the document to process.
 */
function processLargeDocumentEfficiently(string $filePath): void
{
    // Per-page extraction on, image extraction off.
    $pageSettings = new \Kreuzberg\Config\PageConfig(
        extractPages: true
    );
    $config = new ExtractionConfig(
        page: $pageSettings,
        extractImages: false
    );

    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile($filePath);

    echo "Processing large document page by page:\n";

    foreach ($result->pages ?? [] as $page) {
        // Placeholder for real per-page work: read the content, then drop
        // the local reference straight away.
        $pageContent = $page->content;
        unset($pageContent);

        echo " Processed page {$page->pageNumber}\n";
    }

    // Release the result and run the cycle collector before returning.
    unset($result);
    gc_collect_cycles();
}
|
||||
|
||||
if (file_exists('huge_document.pdf')) {
|
||||
processLargeDocumentEfficiently('huge_document.pdf');
|
||||
}
|
||||
|
||||
/**
 * Time batch extraction at several batch sizes and return the fastest one.
 *
 * For each candidate size the file list is split with array_chunk(), every
 * chunk is passed to batch_extract_files(), and the measured throughput
 * (files per second) is printed.
 *
 * @param array $files File paths handed to batch_extract_files().
 *
 * @return int The candidate batch size with the highest measured throughput,
 *             or the smallest candidate when $files is empty.
 */
function findOptimalBatchSize(array $files): int
{
    $batchSizes = [1, 5, 10, 20, 50];

    // Nothing to measure: avoid a meaningless 0 / ~0 throughput calculation.
    if ($files === []) {
        return $batchSizes[0];
    }

    $results = [];

    foreach ($batchSizes as $size) {
        // Chunking happens before the timer starts so only extraction is measured.
        $batches = array_chunk($files, $size);
        $startTime = microtime(true);

        foreach ($batches as $batch) {
            batch_extract_files($batch);
        }

        // The timer can report 0.0 for very fast runs; clamp the value so the
        // division below cannot raise DivisionByZeroError.
        $elapsed = max(microtime(true) - $startTime, PHP_FLOAT_EPSILON);
        $throughput = count($files) / $elapsed;

        $results[$size] = $throughput;

        echo "Batch size $size: " . number_format($throughput, 2) . " files/sec\n";
    }

    // Highest throughput first; keys (batch sizes) are preserved by arsort().
    arsort($results);

    return array_key_first($results);
}
|
||||
|
||||
if (!empty($files) && count($files) >= 5) {
|
||||
echo "\nFinding optimal batch size:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
$optimalSize = findOptimalBatchSize($files);
|
||||
echo "\nOptimal batch size: $optimalSize\n\n";
|
||||
}
|
||||
|
||||
/**
 * Records elapsed time and memory deltas at labelled checkpoints and prints
 * them as a table.
 *
 * Time and memory are measured relative to the moment the monitor was
 * constructed.
 */
class ResourceMonitor
{
    private float $startTime;
    private int $startMemory;
    private array $checkpoints = [];

    public function __construct()
    {
        $this->startTime = microtime(true);
        $this->startMemory = memory_get_usage();
    }

    /**
     * Store a snapshot of elapsed time, memory delta and peak memory.
     *
     * @param string $label Name shown for this checkpoint in the report.
     */
    public function checkpoint(string $label): void
    {
        $secondsSinceStart = microtime(true) - $this->startTime;
        $bytesSinceStart = memory_get_usage() - $this->startMemory;

        $this->checkpoints[] = [
            'label' => $label,
            'time' => $secondsSinceStart,
            'memory' => $bytesSinceStart,
            'peak' => memory_get_peak_usage(),
        ];
    }

    /**
     * Print one row per checkpoint followed by the current peak memory usage.
     */
    public function report(): void
    {
        echo "Resource Monitor Report:\n" . str_repeat('=', 60) . "\n";

        foreach ($this->checkpoints as ['label' => $label, 'time' => $seconds, 'memory' => $bytes]) {
            echo sprintf(
                "%-30s | Time: %6.3fs | Mem: %6.2f MB\n",
                $label,
                $seconds,
                $bytes / 1024 / 1024
            );
        }

        $peakMegabytes = number_format(memory_get_peak_usage() / 1024 / 1024, 2);

        echo "\nPeak memory: " . $peakMegabytes . " MB\n";
    }
}
|
||||
|
||||
$monitor = new ResourceMonitor();
|
||||
|
||||
$kreuzberg = new Kreuzberg();
|
||||
$monitor->checkpoint("Kreuzberg initialized");
|
||||
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
$monitor->checkpoint("Document extracted");
|
||||
|
||||
$words = str_word_count($result->content);
|
||||
$monitor->checkpoint("Word count completed");
|
||||
|
||||
unset($result);
|
||||
gc_collect_cycles();
|
||||
$monitor->checkpoint("Memory freed");
|
||||
|
||||
$monitor->report();
|
||||
|
||||
/**
 * Split a file list into roughly equal chunks and extract each chunk as a batch.
 *
 * Chunks are handed to batch_extract_files() one after another; this function
 * itself does not spawn workers, so any parallelism comes from
 * batch_extract_files() (presumably inside the native extension — confirm).
 *
 * @param array $files   File paths to extract.
 * @param int   $workers Number of chunks to aim for; values below 1 are treated as 1.
 *
 * @return array All batch results concatenated in chunk order; [] when $files is empty.
 */
function processConcurrently(array $files, int $workers = 4): array
{
    // array_chunk() rejects a length below 1, which an empty list would produce.
    if ($files === []) {
        return [];
    }

    // Guard against division by zero and negative chunk sizes.
    $workers = max(1, $workers);

    // ceil() returns a float; array_chunk() requires an int under strict_types.
    $chunkSize = (int) ceil(count($files) / $workers);

    $chunkResults = [];

    foreach (array_chunk($files, $chunkSize) as $chunk) {
        $chunkResults[] = batch_extract_files($chunk);
    }

    // Merge once instead of re-copying the accumulated array on every iteration.
    return array_merge(...$chunkResults);
}
|
||||
|
||||
/**
 * In-memory caching decorator around a Kreuzberg instance.
 *
 * Results are keyed by file path plus modification time, so a changed file is
 * extracted again. When the cache is full the oldest inserted entry is evicted
 * (first in, first out).
 */
class CachedKreuzberg
{
    private array $cache = [];
    private int $maxCacheSize;

    /**
     * @param Kreuzberg $kreuzberg    Extractor that performs the actual work.
     * @param int       $maxCacheSize Maximum number of cached results; 0 or less disables caching.
     */
    public function __construct(
        private Kreuzberg $kreuzberg,
        int $maxCacheSize = 100
    ) {
        $this->maxCacheSize = $maxCacheSize;
    }

    /**
     * Extract a file, reusing a cached result when path and mtime are unchanged.
     *
     * @param string $filePath Path of the document to extract.
     */
    public function extractFile(string $filePath): \Kreuzberg\Types\ExtractionResult
    {
        // PHP caches stat() results; clear them so a file modified since the
        // last call is detected instead of serving a stale cache entry.
        clearstatcache(true, $filePath);

        $modifiedAt = is_file($filePath) ? filemtime($filePath) : false;

        // Without a usable mtime there is no reliable cache key: delegate
        // directly so the extractor reports the problem itself.
        if ($modifiedAt === false) {
            return $this->kreuzberg->extractFile($filePath);
        }

        $cacheKey = md5($filePath . $modifiedAt);

        if (isset($this->cache[$cacheKey])) {
            return $this->cache[$cacheKey];
        }

        $result = $this->kreuzberg->extractFile($filePath);

        // A non-positive limit means "do not cache" rather than "keep one entry".
        if ($this->maxCacheSize <= 0) {
            return $result;
        }

        // Evict the oldest entries until there is room for the new one.
        while (count($this->cache) >= $this->maxCacheSize) {
            array_shift($this->cache);
        }

        $this->cache[$cacheKey] = $result;

        return $result;
    }

    /**
     * Drop every cached result.
     */
    public function clearCache(): void
    {
        $this->cache = [];
    }
}
|
||||
|
||||
$cachedKreuzberg = new CachedKreuzberg(new Kreuzberg(), maxCacheSize: 50);
|
||||
|
||||
echo "\nCached extraction performance:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$file = 'document.pdf';
|
||||
if (file_exists($file)) {
|
||||
benchmark(function () use ($cachedKreuzberg, $file) {
|
||||
return $cachedKreuzberg->extractFile($file);
|
||||
}, "First extraction (uncached)");
|
||||
|
||||
benchmark(function () use ($cachedKreuzberg, $file) {
|
||||
return $cachedKreuzberg->extractFile($file);
|
||||
}, "Second extraction (cached)");
|
||||
}
|
||||
|
||||
echo "\nPerformance Tips:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "1. Use batch processing for multiple files\n";
|
||||
echo "2. Disable features you don't need (images, tables, OCR)\n";
|
||||
echo "3. Process pages individually for very large documents\n";
|
||||
echo "4. Use appropriate batch sizes (test to find optimal)\n";
|
||||
echo "5. Implement caching for frequently accessed documents\n";
|
||||
echo "6. Monitor memory usage and clear results when done\n";
|
||||
echo "7. Consider using worker processes for high throughput\n";
|
||||
echo "8. Increase PHP memory_limit for large documents\n";
|
||||
```
|
||||
23
docs/snippets/php/advanced/quality_processing_config.md
Normal file
23
docs/snippets/php/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
enableQualityProcessing: true,
|
||||
useCache: true
|
||||
);
|
||||
|
||||
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);
|
||||
|
||||
if ($result->getQualityScore() !== null) {
|
||||
echo "Quality score: " . $result->getQualityScore() . "\n";
|
||||
}
|
||||
|
||||
if ($result->getProcessingTime() !== null) {
|
||||
echo "Processing time: " . $result->getProcessingTime() . "ms\n";
|
||||
}
|
||||
?>
|
||||
```
|
||||
27
docs/snippets/php/advanced/quality_processing_example.md
Normal file
27
docs/snippets/php/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
enableQualityProcessing: true
|
||||
);
|
||||
|
||||
$result = Kreuzberg::extractFileSync('scanned_document.pdf', null, $config);
|
||||
|
||||
if ($result->getQualityScore() !== null) {
|
||||
$score = $result->getQualityScore();
|
||||
if ($score < 0.5) {
|
||||
echo "Warning: Low quality extraction (" . round($score, 2) . ")\n";
|
||||
} else {
|
||||
echo "Quality score: " . round($score, 2) . "\n";
|
||||
}
|
||||
} else {
|
||||
echo "Quality score not available.\n";
|
||||
}
|
||||
|
||||
echo "Extracted text length: " . strlen($result->getContent()) . " characters\n";
|
||||
?>
|
||||
```
|
||||
20
docs/snippets/php/advanced/token_reduction_config.md
Normal file
20
docs/snippets/php/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\TokenReductionOptions;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
tokenReduction: new TokenReductionOptions(
|
||||
mode: 'moderate',
|
||||
preserveImportantWords: true
|
||||
)
|
||||
);
|
||||
|
||||
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);
|
||||
|
||||
echo "Reduced content: " . substr($result->getContent(), 0, 100) . "...\n";
|
||||
?>
|
||||
```
|
||||
26
docs/snippets/php/advanced/token_reduction_example.md
Normal file
26
docs/snippets/php/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\TokenReductionOptions;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
tokenReduction: new TokenReductionOptions(
|
||||
mode: 'moderate',
|
||||
preserveImportantWords: true
|
||||
)
|
||||
);
|
||||
|
||||
$result = Kreuzberg::extractFileSync('verbose_document.pdf', null, $config);
|
||||
|
||||
if ($result->getTokenCount() !== null) {
|
||||
echo "Original token count: " . $result->getTokenCount() . "\n";
|
||||
}
|
||||
|
||||
// Access the reduced content
|
||||
echo "Reduced content length: " . strlen($result->getContent()) . " characters\n";
|
||||
echo "Content preview: " . substr($result->getContent(), 0, 100) . "...\n";
|
||||
?>
|
||||
```
|
||||
70
docs/snippets/php/advanced/vector_database_integration.md
Normal file
70
docs/snippets/php/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,70 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\EmbeddingConfig;
|
||||
|
||||
class VectorRecord {
|
||||
public function __construct(
|
||||
public string $id,
|
||||
public string $content,
|
||||
public array $embedding,
|
||||
public array $metadata
|
||||
) {}
|
||||
}
|
||||
|
||||
function extractAndVectorize(
|
||||
string $documentPath,
|
||||
string $documentId
|
||||
): array {
|
||||
$config = new ExtractionConfig(
|
||||
chunking: new ChunkingConfig(
|
||||
maxCharacters: 512,
|
||||
overlap: 50,
|
||||
embedding: new EmbeddingConfig(
|
||||
normalize: true,
|
||||
batchSize: 32
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
$result = Kreuzberg::extractFileSync($documentPath, null, $config);
|
||||
|
||||
$records = [];
|
||||
if ($result->getChunks()) {
|
||||
foreach ($result->getChunks() as $index => $chunk) {
|
||||
$embedding = $chunk->getEmbedding();
|
||||
if ($embedding) {
|
||||
$metadata = [
|
||||
'document_id' => $documentId,
|
||||
'chunk_index' => (string)$index,
|
||||
'content_length' => (string)strlen($chunk->getContent()),
|
||||
];
|
||||
|
||||
$records[] = new VectorRecord(
|
||||
id: "{$documentId}_chunk_{$index}",
|
||||
content: $chunk->getContent(),
|
||||
embedding: $embedding,
|
||||
metadata: $metadata
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $records;
|
||||
}
|
||||
|
||||
// Usage
|
||||
$records = extractAndVectorize('research_paper.pdf', 'doc_123');
|
||||
|
||||
foreach ($records as $record) {
|
||||
echo "Vector ID: " . $record->id . "\n";
|
||||
echo "Content length: " . strlen($record->content) . " characters\n";
|
||||
echo "Embedding dimension: " . count($record->embedding) . "\n";
|
||||
echo "---\n";
|
||||
}
|
||||
?>
|
||||
```
|
||||
19
docs/snippets/php/api/batch_extract_bytes_sync.md
Normal file
19
docs/snippets/php/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\BatchBytesItem;
|
||||
|
||||
$config = new ExtractionConfig();
|
||||
$items = [
|
||||
new BatchBytesItem('Hello, world!', 'text/plain'),
|
||||
new BatchBytesItem("# Heading\n\nParagraph text.", 'text/markdown'),
|
||||
];
|
||||
$results = Kreuzberg::batchExtractBytesSync($items, $config);
|
||||
|
||||
foreach ($results as $i => $result) {
|
||||
echo "Item $i: " . strlen($result->getContent()) . " chars\n";
|
||||
}
|
||||
```
|
||||
20
docs/snippets/php/api/batch_extract_files_sync.md
Normal file
20
docs/snippets/php/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\BatchFileItem;
|
||||
|
||||
$config = new ExtractionConfig();
|
||||
$items = [
|
||||
new BatchFileItem('doc1.pdf'),
|
||||
new BatchFileItem('doc2.docx'),
|
||||
new BatchFileItem('report.pdf'),
|
||||
];
|
||||
$results = Kreuzberg::batchExtractFilesSync($items, $config);
|
||||
|
||||
foreach ($results as $i => $result) {
|
||||
echo "File $i: " . strlen($result->getContent()) . " chars\n";
|
||||
}
|
||||
```
|
||||
37
docs/snippets/php/api/client_chunk_text.md
Normal file
37
docs/snippets/php/api/client_chunk_text.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
|
||||
$client = new Client();
|
||||
$filePath = 'document.pdf';
|
||||
$fileContent = file_get_contents($filePath);
|
||||
|
||||
try {
|
||||
$response = $client->post('http://localhost:8000/extract', [
|
||||
'multipart' => [
|
||||
[
|
||||
'name' => 'file',
|
||||
'contents' => $fileContent,
|
||||
'filename' => basename($filePath),
|
||||
'headers' => ['Content-Type' => 'application/pdf'],
|
||||
],
|
||||
[
|
||||
'name' => 'chunking',
|
||||
'contents' => json_encode(['max_characters' => 800, 'overlap' => 100]),
|
||||
],
|
||||
],
|
||||
]);
|
||||
|
||||
$result = json_decode((string)$response->getBody(), true);
|
||||
if (isset($result['chunks']) && is_array($result['chunks'])) {
|
||||
echo count($result['chunks']) . " chunks\n";
|
||||
foreach ($result['chunks'] as $chunk) {
|
||||
echo " " . strlen($chunk['content'] ?? '') . " chars\n";
|
||||
}
|
||||
}
|
||||
} catch (Exception $e) {
|
||||
echo "Request failed: " . $e->getMessage() . "\n";
|
||||
}
|
||||
```
|
||||
28
docs/snippets/php/api/client_extract_single_file.md
Normal file
28
docs/snippets/php/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
|
||||
$client = new Client();
|
||||
$filePath = 'document.pdf';
|
||||
$fileContent = file_get_contents($filePath);
|
||||
|
||||
try {
|
||||
$response = $client->post('http://localhost:8000/extract', [
|
||||
'multipart' => [
|
||||
[
|
||||
'name' => 'file',
|
||||
'contents' => $fileContent,
|
||||
'filename' => basename($filePath),
|
||||
'headers' => ['Content-Type' => 'application/pdf'],
|
||||
],
|
||||
],
|
||||
]);
|
||||
|
||||
$result = json_decode((string)$response->getBody(), true);
|
||||
echo $result['content'] ?? '';
|
||||
} catch (Exception $e) {
|
||||
echo "Request failed: " . $e->getMessage() . "\n";
|
||||
}
|
||||
```
|
||||
80
docs/snippets/php/api/combining_all_features.md
Normal file
80
docs/snippets/php/api/combining_all_features.md
Normal file
@@ -0,0 +1,80 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\OcrConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\ChunkSizing;
|
||||
use Kreuzberg\ImageExtractionConfig;
|
||||
use Kreuzberg\OutputFormat;
|
||||
|
||||
// Build config with OCR, chunking, and image extraction
|
||||
$config = new ExtractionConfig(
|
||||
null, // caching
|
||||
false, // force_ocr
|
||||
null, // max_concurrent_extractions
|
||||
null, // cache_dir
|
||||
OutputFormat::Markdown, // output_format
|
||||
true, // include_document_structure
|
||||
true, // enable_quality_processing
|
||||
true, // use_cache
|
||||
null, // use_diffs
|
||||
null, // keep_empty_chunks
|
||||
);
|
||||
|
||||
// Set OCR: Tesseract with English language
|
||||
$ocrConfig = new OcrConfig(
|
||||
'tesseract', // backend
|
||||
'eng', // language
|
||||
null, // page_count_hint
|
||||
null, // psm_mode
|
||||
null, // use_gpu
|
||||
null, // languages
|
||||
null, // fast_mode
|
||||
null, // fast_weight
|
||||
null, // min_confidence
|
||||
);
|
||||
$config->setOcr($ocrConfig);
|
||||
|
||||
// Set chunking: semantic markdown chunks ~800 chars, 100-char overlap
|
||||
$chunkingConfig = new ChunkingConfig(
|
||||
800, // max_characters
|
||||
100, // overlap
|
||||
true, // trim
|
||||
'Markdown', // chunker_type
|
||||
null, // preset
|
||||
true, // prepend_heading_context
|
||||
null, // topic_threshold
|
||||
);
|
||||
$config->setChunking($chunkingConfig);
|
||||
|
||||
// Set image extraction
|
||||
$imageConfig = new ImageExtractionConfig(
|
||||
true, // extract_images
|
||||
null, // image_min_width
|
||||
null, // image_min_height
|
||||
null, // image_output_format
|
||||
null, // image_compression_level
|
||||
);
|
||||
$config->setImages($imageConfig);
|
||||
|
||||
$result = Kreuzberg::extractFileSync('report.pdf', null, $config);
|
||||
|
||||
echo "Content (" . strlen($result->getContent()) . " chars):\n";
|
||||
echo substr($result->getContent(), 0, 200) . "\n\n";
|
||||
|
||||
if ($result->getChunks() !== null) {
|
||||
echo "Chunks: " . count($result->getChunks()) . "\n";
|
||||
}
|
||||
echo "Tables: " . count($result->getTables()) . "\n";
|
||||
|
||||
if ($result->getDetectedLanguages() !== null) {
|
||||
echo "Languages: " . implode(', ', $result->getDetectedLanguages()) . "\n";
|
||||
}
|
||||
|
||||
if ($result->getExtractionMethod() !== null) {
|
||||
echo "Extraction method: " . $result->getExtractionMethod() . "\n";
|
||||
}
|
||||
```
|
||||
19
docs/snippets/php/api/error_handling.md
Normal file
19
docs/snippets/php/api/error_handling.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\KreuzbergException;
|
||||
|
||||
$config = new ExtractionConfig();
|
||||
try {
|
||||
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);
|
||||
echo $result->getContent();
|
||||
} catch (KreuzbergException $e) {
|
||||
// The extension throws KreuzbergException with the error message
|
||||
// Error context is available in the exception message
|
||||
echo "Extraction failed: " . $e->getMessage() . "\n";
|
||||
echo "Error code: " . $e->getCode() . "\n";
|
||||
}
|
||||
```
|
||||
31
docs/snippets/php/api/error_handling_extract.md
Normal file
31
docs/snippets/php/api/error_handling_extract.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\KreuzbergException;
|
||||
|
||||
function extract_text(string $bytes, string $mime_type): string {
|
||||
$config = new ExtractionConfig();
|
||||
$result = Kreuzberg::extractBytesSync($bytes, $mime_type, $config);
|
||||
return $result->getContent();
|
||||
}
|
||||
|
||||
// Read the document explicitly. The previous `?: ''` fallback turned a read
// failure (and any file whose content is the string "0") into an empty payload.
$bytes = file_get_contents('document.pdf');
if ($bytes === false) {
    echo "Error: unable to read document.pdf\n";
    exit(1);
}

try {
    $text = extract_text($bytes, 'application/pdf');
    echo "Extracted " . strlen($text) . " chars\n";
} catch (KreuzbergException $e) {
    // All Kreuzberg errors are KreuzbergException
    // Check the message for error type details
    $message = $e->getMessage();

    if (str_contains($message, 'not supported')) {
        echo "Format not supported\n";
    } elseif (str_contains($message, 'OCR')) {
        echo "OCR failed: " . $message . "\n";
    } else {
        echo "Error: " . $message . "\n";
    }
}
|
||||
```
|
||||
21
docs/snippets/php/api/extract_bytes_async.md
Normal file
21
docs/snippets/php/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,21 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// PHP does not have native async/await. The ext-php-rs binding blocks internally
|
||||
// using tokio::task::block_on. For concurrent operations, use batchExtractBytesSync
|
||||
// or batchExtractBytesAsync with multiple items instead.
|
||||
|
||||
// file_get_contents() returns false on failure; stop here instead of passing
// false on as the byte payload.
$content = file_get_contents('document.pdf');
if ($content === false) {
    throw new \RuntimeException('Unable to read document.pdf');
}

$config = new ExtractionConfig();
// Note: This is labeled "async" in the API but blocks in PHP like the sync version
$result = Kreuzberg::extractBytesAsync($content, 'application/pdf', $config);

echo $result->getContent();
echo 'Tables: ' . count($result->getTables()) . "\n";
|
||||
```
|
||||
14
docs/snippets/php/api/extract_bytes_sync.md
Normal file
14
docs/snippets/php/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// file_get_contents() returns false on failure; stop here instead of passing
// false on as the byte payload.
$content = file_get_contents('document.pdf');
if ($content === false) {
    throw new \RuntimeException('Unable to read document.pdf');
}

$config = new ExtractionConfig();
$result = Kreuzberg::extractBytesSync($content, 'application/pdf', $config);

echo $result->getContent();
echo 'Tables: ' . count($result->getTables()) . "\n";
|
||||
```
|
||||
20
docs/snippets/php/api/extract_file_async.md
Normal file
20
docs/snippets/php/api/extract_file_async.md
Normal file
@@ -0,0 +1,20 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// PHP does not have native async/await. The ext-php-rs binding blocks internally
|
||||
// using tokio::task::block_on. This behaves like the sync version in PHP.
|
||||
|
||||
$config = new ExtractionConfig();
|
||||
// Note: This is labeled "async" in the API but blocks in PHP like the sync version
|
||||
$result = Kreuzberg::extractFileAsync('document.pdf', null, $config);
|
||||
|
||||
echo $result->getContent();
|
||||
echo 'MIME type: ' . $result->getMimeType() . "\n";
|
||||
echo 'Tables: ' . count($result->getTables()) . "\n";
|
||||
```
|
||||
14
docs/snippets/php/api/extract_file_sync.md
Normal file
14
docs/snippets/php/api/extract_file_sync.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
$config = new ExtractionConfig();
|
||||
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);
|
||||
|
||||
echo $result->getContent();
|
||||
echo 'MIME type: ' . $result->getMimeType() . "\n";
|
||||
echo 'Tables: ' . count($result->getTables()) . "\n";
|
||||
```
|
||||
27
docs/snippets/php/async/async_amp_bridge.md
Normal file
27
docs/snippets/php/async/async_amp_bridge.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```php title="PHP (Amp v3+)"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
// Requires: composer require amphp/amp ^3.0
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Async\AmpBridge;
|
||||
|
||||
$kreuzberg = new Kreuzberg();
|
||||
|
||||
// Single file extraction with Amp Future
|
||||
$deferred = $kreuzberg->extractFileAsync('document.pdf');
|
||||
$future = AmpBridge::toFuture($deferred);
|
||||
$result = $future->await();
|
||||
echo $result->content;
|
||||
|
||||
// Batch extraction with Amp Future
|
||||
$files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
||||
$batchDeferred = $kreuzberg->batchExtractFilesAsync($files);
|
||||
$batchFuture = AmpBridge::toBatchFuture($batchDeferred);
|
||||
$results = $batchFuture->await();
|
||||
|
||||
foreach ($results as $i => $result) {
|
||||
echo "{$files[$i]}: {$result->content}\n";
|
||||
}
|
||||
```
|
||||
40
docs/snippets/php/async/async_batch.md
Normal file
40
docs/snippets/php/async/async_batch.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use function Kreuzberg\batch_extract_files_async;
|
||||
|
||||
$kreuzberg = new Kreuzberg();
|
||||
|
||||
// Async batch file extraction
|
||||
$files = ['doc1.pdf', 'doc2.docx', 'doc3.xlsx'];
|
||||
$deferred = $kreuzberg->batchExtractFilesAsync($files);
|
||||
|
||||
// Do other work while extraction runs...
|
||||
processOtherTasks();
|
||||
|
||||
// Block until all results are ready
|
||||
$results = $deferred->getResults();
|
||||
|
||||
foreach ($results as $i => $result) {
|
||||
echo "{$files[$i]}: " . strlen($result->content) . " chars\n";
|
||||
}
|
||||
|
||||
// With timeout
|
||||
$deferred = $kreuzberg->batchExtractFilesAsync($files);
|
||||
$results = $deferred->waitBatch(10000); // 10 second timeout
|
||||
|
||||
if ($results !== null) {
|
||||
foreach ($results as $result) {
|
||||
echo $result->content . "\n";
|
||||
}
|
||||
} else {
|
||||
echo "Batch extraction timed out\n";
|
||||
}
|
||||
|
||||
// Procedural API
|
||||
$deferred = batch_extract_files_async($files);
|
||||
$results = $deferred->getResults();
|
||||
```
|
||||
47
docs/snippets/php/async/async_extract_file.md
Normal file
47
docs/snippets/php/async/async_extract_file.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Exceptions\KreuzbergException;
|
||||
use function Kreuzberg\extract_file_async;
|
||||
|
||||
// OOP API: async file extraction
|
||||
$kreuzberg = new Kreuzberg();
|
||||
$deferred = $kreuzberg->extractFileAsync('document.pdf');
|
||||
|
||||
// Non-blocking: check if ready
|
||||
if ($deferred->isReady()) {
|
||||
$result = $deferred->getResult();
|
||||
echo $result->content;
|
||||
}
|
||||
|
||||
// Non-blocking: try to get result (returns null if pending)
|
||||
$result = $deferred->tryGetResult();
|
||||
if ($result !== null) {
|
||||
echo $result->content;
|
||||
}
|
||||
|
||||
// Blocking: wait until ready
|
||||
$result = $deferred->getResult();
|
||||
echo $result->content;
|
||||
|
||||
// Blocking with timeout (5 seconds)
|
||||
$result = $deferred->wait(5000);
|
||||
if ($result !== null) {
|
||||
echo $result->content;
|
||||
} else {
|
||||
echo "Extraction timed out\n";
|
||||
}
|
||||
|
||||
// Procedural API
|
||||
$deferred = extract_file_async('document.pdf');
|
||||
$result = $deferred->getResult();
|
||||
echo $result->content;
|
||||
|
||||
// Static API
|
||||
$deferred = Kreuzberg::extractFileAsyncStatic('document.pdf');
|
||||
$result = $deferred->getResult();
|
||||
echo $result->content;
|
||||
```
|
||||
25
docs/snippets/php/async/async_react_bridge.md
Normal file
25
docs/snippets/php/async/async_react_bridge.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```php title="PHP (ReactPHP)"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
// Requires: composer require react/promise ^3.0 react/event-loop ^1.0
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Async\ReactBridge;
|
||||
|
||||
$kreuzberg = new Kreuzberg();
|
||||
|
||||
// Single file extraction with ReactPHP Promise
|
||||
$deferred = $kreuzberg->extractFileAsync('document.pdf');
|
||||
$promise = ReactBridge::toPromise($deferred);
|
||||
|
||||
$promise->then(
|
||||
function ($result) {
|
||||
echo "Content: {$result->content}\n";
|
||||
echo "MIME type: {$result->mimeType}\n";
|
||||
},
|
||||
function (\Throwable $error) {
|
||||
echo "Extraction failed: {$error->getMessage()}\n";
|
||||
}
|
||||
);
|
||||
```
|
||||
224
docs/snippets/php/benchmarking/simple_benchmark.php
Normal file
224
docs/snippets/php/benchmarking/simple_benchmark.php
Normal file
@@ -0,0 +1,224 @@
|
||||
```php title="simple_benchmark.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Simple Benchmarking
|
||||
*
|
||||
* Benchmark document extraction performance across different
|
||||
* file types, sizes, and configurations.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use function Kreuzberg\extract_file;
|
||||
use function Kreuzberg\batch_extract_files;
|
||||
|
||||
/**
 * Minimal benchmarking harness: runs callables repeatedly, records wall-clock
 * time and memory deltas, and prints per-benchmark and comparative reports.
 */
class Benchmark
{
    /** Collected statistics, keyed by benchmark name in insertion order. */
    private array $results = [];

    /**
     * Execute $fn $iterations times and store timing/memory statistics.
     *
     * @param string   $name       Label under which the statistics are stored.
     * @param callable $fn         Code under test; called without arguments.
     * @param int      $iterations Number of repetitions; must be at least 1.
     *
     * @throws \InvalidArgumentException When $iterations is less than 1.
     */
    public function run(string $name, callable $fn, int $iterations = 1): void
    {
        // With no samples the averages below would divide by zero.
        if ($iterations < 1) {
            throw new \InvalidArgumentException('Benchmark iterations must be at least 1.');
        }

        $times = [];
        $memories = [];

        for ($i = 0; $i < $iterations; $i++) {
            // Collect garbage first so leftovers from the previous iteration
            // do not distort this iteration's memory delta.
            gc_collect_cycles();
            $startMemory = memory_get_usage();
            $startTime = microtime(true);

            $fn();

            $elapsed = microtime(true) - $startTime;
            $memoryUsed = memory_get_usage() - $startMemory;

            $times[] = $elapsed;
            $memories[] = $memoryUsed;
        }

        $this->results[$name] = [
            'iterations' => $iterations,
            'avg_time' => array_sum($times) / count($times),
            'min_time' => min($times),
            'max_time' => max($times),
            'avg_memory' => array_sum($memories) / count($memories),
            // Peak of the whole process so far, not of this benchmark alone.
            'peak_memory' => memory_get_peak_usage(),
        ];
    }

    /**
     * Print the recorded statistics for every benchmark.
     */
    public function report(): void
    {
        echo "Benchmark Results:\n";
        echo str_repeat('=', 80) . "\n\n";

        foreach ($this->results as $name => $stats) {
            echo "$name:\n";
            echo " Iterations: {$stats['iterations']}\n";
            echo " Average time: " . number_format($stats['avg_time'], 4) . "s\n";
            echo " Min time: " . number_format($stats['min_time'], 4) . "s\n";
            echo " Max time: " . number_format($stats['max_time'], 4) . "s\n";
            echo " Average memory: " . number_format($stats['avg_memory'] / 1024 / 1024, 2) . " MB\n";
            echo " Peak memory: " . number_format($stats['peak_memory'] / 1024 / 1024, 2) . " MB\n";
            echo "\n";
        }
    }

    /**
     * Compare every benchmark against the first one that was recorded.
     *
     * Prints nothing when fewer than two benchmarks exist. A ratio whose
     * divisor is not positive is reported as "n/a" instead of being computed.
     */
    public function compare(): void
    {
        if (count($this->results) < 2) {
            return;
        }

        echo "Performance Comparison:\n";
        echo str_repeat('=', 80) . "\n\n";

        $baselineName = array_key_first($this->results);
        $baseline = $this->results[$baselineName];

        foreach ($this->results as $name => $stats) {
            if ($name === $baselineName) {
                continue;
            }

            echo "$name vs $baselineName:\n";

            // A very fast callable can measure as 0.0 seconds.
            if ($stats['avg_time'] > 0) {
                $speedup = $baseline['avg_time'] / $stats['avg_time'];
                echo " Speed: " . number_format($speedup, 2) . "x ";
                echo ($speedup > 1 ? "faster" : "slower") . "\n";
            } else {
                echo " Speed: n/a (no measurable elapsed time)\n";
            }

            // Memory deltas are frequently 0 (or negative) once the callable
            // has released everything it allocated.
            if ($baseline['avg_memory'] > 0) {
                $memoryRatio = $stats['avg_memory'] / $baseline['avg_memory'];
                echo " Memory: " . number_format($memoryRatio, 2) . "x ";
                echo ($memoryRatio < 1 ? "less" : "more") . "\n\n";
            } else {
                echo " Memory: n/a (baseline has no measurable memory growth)\n\n";
            }
        }
    }
}
|
||||
|
||||
$benchmark = new Benchmark();
|
||||
|
||||
$testFile = 'test_document.pdf';
|
||||
if (file_exists($testFile)) {
|
||||
$benchmark->run('Simple PDF extraction', function () use ($testFile) {
|
||||
extract_file($testFile);
|
||||
}, 5);
|
||||
}
|
||||
|
||||
if (file_exists($testFile)) {
|
||||
$benchmark->run('PDF with table extraction', function () use ($testFile) {
|
||||
$config = new ExtractionConfig(extractTables: true);
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$kreuzberg->extractFile($testFile);
|
||||
}, 5);
|
||||
}
|
||||
|
||||
if (file_exists($testFile)) {
|
||||
$benchmark->run('PDF with OCR', function () use ($testFile) {
|
||||
$config = new ExtractionConfig(
|
||||
ocr: new OcrConfig(backend: 'tesseract', language: 'eng')
|
||||
);
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$kreuzberg->extractFile($testFile);
|
||||
}, 3);
|
||||
}
|
||||
|
||||
$files = array_filter(['doc1.pdf', 'doc2.pdf', 'doc3.pdf'], 'file_exists');
|
||||
if (count($files) >= 3) {
|
||||
$benchmark->run('Batch processing (3 files)', function () use ($files) {
|
||||
batch_extract_files(array_slice($files, 0, 3));
|
||||
}, 3);
|
||||
|
||||
$benchmark->run('Sequential processing (3 files)', function () use ($files) {
|
||||
foreach (array_slice($files, 0, 3) as $file) {
|
||||
extract_file($file);
|
||||
}
|
||||
}, 3);
|
||||
}
|
||||
|
||||
$fileTypes = [
|
||||
'PDF' => 'sample.pdf',
|
||||
'DOCX' => 'sample.docx',
|
||||
'XLSX' => 'sample.xlsx',
|
||||
'TXT' => 'sample.txt',
|
||||
];
|
||||
|
||||
foreach ($fileTypes as $type => $file) {
|
||||
if (file_exists($file)) {
|
||||
$benchmark->run("$type extraction", function () use ($file) {
|
||||
extract_file($file);
|
||||
}, 5);
|
||||
}
|
||||
}
|
||||
|
||||
$configs = [
|
||||
'Minimal' => new ExtractionConfig(
|
||||
extractTables: false,
|
||||
extractImages: false
|
||||
),
|
||||
'Standard' => new ExtractionConfig(
|
||||
extractTables: true,
|
||||
extractImages: false
|
||||
),
|
||||
'Full' => new ExtractionConfig(
|
||||
extractTables: true,
|
||||
extractImages: true,
|
||||
preserveFormatting: true
|
||||
),
|
||||
];
|
||||
|
||||
foreach ($configs as $name => $config) {
|
||||
if (file_exists($testFile)) {
|
||||
$benchmark->run("$name config", function () use ($testFile, $config) {
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$kreuzberg->extractFile($testFile);
|
||||
}, 5);
|
||||
}
|
||||
}
|
||||
|
||||
$benchmark->report();
|
||||
$benchmark->compare();
|
||||
|
||||
echo "\nThroughput Test:\n";
|
||||
echo str_repeat('=', 80) . "\n";
|
||||
|
||||
if (!empty($files)) {
|
||||
$start = microtime(true);
|
||||
$count = 0;
|
||||
|
||||
foreach ($files as $file) {
|
||||
extract_file($file);
|
||||
$count++;
|
||||
}
|
||||
|
||||
$elapsed = microtime(true) - $start;
|
||||
$throughput = $count / $elapsed;
|
||||
|
||||
echo "Processed $count files in " . number_format($elapsed, 2) . " seconds\n";
|
||||
echo "Throughput: " . number_format($throughput, 2) . " files/second\n";
|
||||
}
|
||||
|
||||
echo "\nMemory Stress Test:\n";
|
||||
echo str_repeat('=', 80) . "\n";
|
||||
|
||||
$initialMemory = memory_get_usage();
$results = [];

// Results are kept alive on purpose so the growth of retained memory is visible.
if (file_exists($testFile)) {
    for ($i = 0; $i < 10; $i++) {
        $results[] = extract_file($testFile);
    }
}

$finalMemory = memory_get_usage();
$memoryGrowth = $finalMemory - $initialMemory;

// Report what was actually processed; the test file may be missing.
$processed = count($results);

echo "Processed $processed documents\n";
echo "Memory growth: " . number_format($memoryGrowth / 1024 / 1024, 2) . " MB\n";

if ($processed > 0) {
    echo "Average per document: " . number_format($memoryGrowth / $processed / 1024 / 1024, 2) . " MB\n";
}
|
||||
|
||||
unset($results);
|
||||
gc_collect_cycles();
|
||||
|
||||
$afterCleanup = memory_get_usage();
|
||||
echo "After cleanup: " . number_format($afterCleanup / 1024 / 1024, 2) . " MB\n";
|
||||
```
|
||||
229
docs/snippets/php/cache/disk_cache.php
vendored
Normal file
229
docs/snippets/php/cache/disk_cache.php
vendored
Normal file
@@ -0,0 +1,229 @@
|
||||
```php title="disk_cache.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Disk Cache for Document Extraction
|
||||
*
|
||||
* Implement file-based caching to avoid re-processing the same documents.
|
||||
* Significantly improves performance for repeated extractions.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Types\ExtractionResult;
|
||||
|
||||
/**
 * File-based cache for extraction results.
 *
 * Every entry is a serialized ExtractionResult stored as "<key>.cache" in the
 * cache directory. The key combines the file path, an MD5 of the file's
 * contents and an MD5 of the configuration, so changing either the document
 * or the config produces a different entry. Entries older than the TTL
 * (judged by file mtime) are removed on lookup.
 */
class DiskCache
{
    private string $cacheDir;
    private int $ttl;

    /**
     * @param string|null $cacheDir Directory for cache files; defaults to "<system temp dir>/kreuzberg_cache".
     * @param int         $ttl      Entry lifetime in seconds (default: 7 days).
     *
     * @throws \RuntimeException If the cache directory does not exist and cannot be created.
     */
    public function __construct(?string $cacheDir = null, int $ttl = 7 * 86400)
    {
        $this->cacheDir = $cacheDir ?? sys_get_temp_dir() . '/kreuzberg_cache';
        $this->ttl = $ttl;

        // The second is_dir() covers the case where another process created
        // the directory between our check and our mkdir().
        if (
            !is_dir($this->cacheDir)
            && !mkdir($this->cacheDir, 0755, true)
            && !is_dir($this->cacheDir)
        ) {
            throw new \RuntimeException("Unable to create cache directory: {$this->cacheDir}");
        }
    }

    /**
     * Build the cache key for a file/config pair.
     *
     * @throws \JsonException If the configuration array cannot be JSON-encoded.
     */
    private function getCacheKey(string $filePath, ExtractionConfig $config): string
    {
        // md5_file() yields false for an unreadable file; the cast keeps the
        // concatenation below identical to what it has always produced.
        $fileHash = (string) md5_file($filePath);
        $configHash = md5(json_encode($config->toArray(), JSON_THROW_ON_ERROR));

        return md5($filePath . $fileHash . $configHash);
    }

    /**
     * Absolute path of the cache file that stores the given key.
     */
    private function getCachePath(string $key): string
    {
        return $this->cacheDir . '/' . $key . '.cache';
    }

    /**
     * Look up a cached result.
     *
     * @return ExtractionResult|null The cached result, or null when there is no
     *                               entry, the entry has expired, or it cannot be read/decoded.
     */
    public function get(string $filePath, ExtractionConfig $config): ?ExtractionResult
    {
        $cachePath = $this->getCachePath($this->getCacheKey($filePath, $config));

        if (!file_exists($cachePath)) {
            return null;
        }

        // An unreadable mtime is treated like an expired entry.
        $modifiedAt = filemtime($cachePath);
        if ($modifiedAt === false || time() - $modifiedAt > $this->ttl) {
            unlink($cachePath);
            return null;
        }

        $data = file_get_contents($cachePath);
        if ($data === false) {
            return null;
        }

        // NOTE(review): unserialize() instantiates whatever classes the payload
        // names. The default cache directory lives in the shared system temp
        // dir, so on multi-user hosts point $cacheDir at a private location
        // (or restrict 'allowed_classes' once the full set of result classes is known).
        $cached = unserialize($data);

        return $cached instanceof ExtractionResult ? $cached : null;
    }

    /**
     * Store a result for the given file/config pair.
     *
     * Writing is best-effort: a failed write leaves the cache without the entry.
     */
    public function set(string $filePath, ExtractionConfig $config, ExtractionResult $result): void
    {
        $cachePath = $this->getCachePath($this->getCacheKey($filePath, $config));

        // Write to a temporary file and rename it into place so a concurrent
        // get() never sees a half-written entry. The ".tmp" suffix keeps the
        // temporary file out of the "*.cache" globs used by clear()/getStats().
        $tempPath = $cachePath . '.' . uniqid('', true) . '.tmp';

        if (file_put_contents($tempPath, serialize($result)) === false) {
            return;
        }

        if (!rename($tempPath, $cachePath)) {
            unlink($tempPath);
        }
    }

    /**
     * Delete every "*.cache" file in the cache directory.
     */
    public function clear(): void
    {
        // glob() returns false on error; treat that as "no files".
        foreach (glob($this->cacheDir . '/*.cache') ?: [] as $file) {
            unlink($file);
        }
    }

    /**
     * Summarise the cache contents.
     *
     * @return array{total_entries: int, cache_size_bytes: int, cache_dir: string}
     */
    public function getStats(): array
    {
        $files = glob($this->cacheDir . '/*.cache') ?: [];
        $totalSize = 0;

        foreach ($files as $file) {
            // filesize() yields false if the file vanished meanwhile; count it as 0 bytes.
            $totalSize += (int) filesize($file);
        }

        return [
            'total_entries' => count($files),
            'cache_size_bytes' => $totalSize,
            'cache_dir' => $this->cacheDir,
        ];
    }
}
|
||||
|
||||
$cache = new DiskCache();
$kreuzberg = new Kreuzberg();
$config = new ExtractionConfig();

$file = 'document.pdf';

// Run the same cache-then-extract lookup twice: the first pass is expected to
// populate the cache, the second to be served from it.
$passes = [
    "First extraction (will be cached)...\n",
    "Second extraction (from cache)...\n",
];

foreach ($passes as $heading) {
    echo $heading;
    $start = microtime(true);

    $result = $cache->get($file, $config);

    if ($result !== null) {
        echo " Status: Retrieved from cache\n";
    } else {
        $result = $kreuzberg->extractFile($file, config: $config);
        $cache->set($file, $config, $result);
        echo " Status: Extracted and cached\n";
    }

    $elapsed = microtime(true) - $start;
    echo " Time: " . number_format($elapsed, 4) . "s\n";
    echo " Content length: " . strlen($result->content) . " chars\n\n";
}

// Summarise what is now stored on disk.
$stats = $cache->getStats();
echo "Cache Statistics:\n";
echo str_repeat('=', 60) . "\n";
echo "Total entries: " . $stats['total_entries'] . "\n";
echo "Cache size: " . number_format($stats['cache_size_bytes'] / 1024 / 1024, 2) . " MB\n";
echo "Cache directory: " . $stats['cache_dir'] . "\n\n";
|
||||
|
||||
/**
 * Wraps a Kreuzberg instance so that extractFile() consults a DiskCache first
 * and stores every fresh extraction in it.
 */
class CachedKreuzberg
{
    public function __construct(
        private Kreuzberg $kreuzberg,
        private DiskCache $cache
    ) {}

    /**
     * Extract a file, reusing a cached result when one exists.
     *
     * The cache lookup uses only the file path and the config; $mimeType is
     * forwarded to the extractor but is not part of the cache lookup.
     *
     * @param string                $filePath Path of the document to extract.
     * @param string|null           $mimeType Passed through to Kreuzberg::extractFile().
     * @param ExtractionConfig|null $config   Defaults to a fresh ExtractionConfig.
     */
    public function extractFile(
        string $filePath,
        ?string $mimeType = null,
        ?ExtractionConfig $config = null
    ): ExtractionResult {
        $config ??= new ExtractionConfig();

        $cached = $this->cache->get($filePath, $config);
        if ($cached !== null) {
            return $cached;
        }

        $fresh = $this->kreuzberg->extractFile($filePath, $mimeType, $config);
        $this->cache->set($filePath, $config, $fresh);

        return $fresh;
    }

    /**
     * Remove every entry from the underlying cache.
     */
    public function clearCache(): void
    {
        $this->cache->clear();
    }

    /**
     * Statistics of the underlying cache, exactly as DiskCache::getStats() reports them.
     */
    public function getCacheStats(): array
    {
        return $this->cache->getStats();
    }
}
|
||||
|
||||
$cachedKreuzberg = new CachedKreuzberg(new Kreuzberg(), new DiskCache());

echo "Using CachedKreuzberg wrapper:\n";
echo str_repeat('=', 60) . "\n";

// Time each extraction; files that do not exist are skipped without output.
$files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
foreach ($files as $file) {
    if (file_exists($file)) {
        $start = microtime(true);
        $result = $cachedKreuzberg->extractFile($file);
        $elapsed = microtime(true) - $start;

        printf("%s: %ss\n", $file, number_format($elapsed, 4));
    }
}

echo "\nCache stats:\n";
$stats = $cachedKreuzberg->getCacheStats();
print_r($stats);
|
||||
|
||||
/**
 * Delete cache files whose modification time is older than $maxAge.
 *
 * The cache directory is taken from the 'cache_dir' entry of
 * DiskCache::getStats(); only "*.cache" files in it are considered.
 *
 * @param DiskCache $cache  Cache whose directory should be cleaned.
 * @param int       $maxAge Maximum allowed age in seconds (default: 7 days).
 *
 * @return int Number of files actually deleted.
 */
function cleanupCache(DiskCache $cache, int $maxAge = 7 * 86400): int
{
    $cacheDir = $cache->getStats()['cache_dir'];

    // glob() returns false on error; treat that as "nothing to clean".
    $files = glob($cacheDir . '/*.cache') ?: [];
    $deleted = 0;

    foreach ($files as $file) {
        $modifiedAt = filemtime($file);

        // The file may have been removed by someone else since glob() ran.
        if ($modifiedAt === false) {
            continue;
        }

        // Count an entry only when it was really removed.
        if (time() - $modifiedAt > $maxAge && unlink($file)) {
            $deleted++;
        }
    }

    return $deleted;
}

$deleted = cleanupCache($cache, 7 * 86400);
echo "\nCleaned up $deleted old cache entries\n";
|
||||
```
|
||||
151
docs/snippets/php/chunking/basic_chunking.php
Normal file
151
docs/snippets/php/chunking/basic_chunking.php
Normal file
@@ -0,0 +1,151 @@
|
||||
```php title="basic_chunking.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Basic Text Chunking
|
||||
*
|
||||
* Split documents into smaller chunks for RAG (Retrieval Augmented Generation),
|
||||
* vector databases, and context-aware processing.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
|
||||
// --- Basic chunking: fixed chunk size with a small overlap ---
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50
    )
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('long_document.pdf');

echo "Document Chunking Results:\n";
echo str_repeat('=', 60) . "\n";
echo "Total chunks: " . count($result->chunks ?? []) . "\n";
echo "Total content length: " . strlen($result->content) . "\n\n";

foreach ($result->chunks ?? [] as $chunk) {
    echo "Chunk {$chunk->metadata->chunkIndex}:\n";
    echo str_repeat('-', 60) . "\n";
    echo "Length: " . strlen($chunk->content) . " chars\n";
    echo "Content: " . substr($chunk->content, 0, 100) . "...\n\n";
}

// --- Compare chunk counts and average sizes across several chunk sizes ---
$sizes = [
    'Small (256)' => 256,
    'Medium (512)' => 512,
    'Large (1024)' => 1024,
    'XLarge (2048)' => 2048,
];

foreach ($sizes as $name => $size) {
    $config = new ExtractionConfig(
        chunking: new ChunkingConfig(
            maxChunkSize: $size,
            chunkOverlap: (int)($size * 0.1) // overlap is 10% of the chunk size
        )
    );

    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile('document.pdf');

    $chunks = $result->chunks ?? [];
    $chunkCount = count($chunks);

    // Guard the average: an empty chunk list would otherwise divide by zero.
    $averageSize = $chunkCount > 0
        ? array_sum(array_map(fn($c) => strlen($c->content), $chunks)) / $chunkCount
        : 0;

    echo "$name chunks:\n";
    echo " Total: " . $chunkCount . "\n";
    echo " Avg size: " . number_format($averageSize) . " chars\n\n";
}

// --- Sentence-respecting chunks ---
$sentenceConfig = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true,
        respectParagraphs: false
    )
);

$kreuzberg = new Kreuzberg($sentenceConfig);
$result = $kreuzberg->extractFile('article.pdf');

echo "Sentence-respecting chunks:\n";
echo str_repeat('=', 60) . "\n";

foreach ($result->chunks ?? [] as $chunk) {
    // Rough estimate: counts runs of sentence-ending punctuation.
    $sentences = preg_match_all('/[.!?]+/', $chunk->content);
    echo "Chunk {$chunk->metadata->chunkIndex}: $sentences sentences\n";
    echo " Starts with: " . substr($chunk->content, 0, 50) . "...\n";
    echo " Ends with: ..." . substr($chunk->content, -50) . "\n\n";
}

// --- Paragraph-respecting chunks ---
$paragraphConfig = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 1000,
        chunkOverlap: 100,
        respectSentences: true,
        respectParagraphs: true
    )
);

$kreuzberg = new Kreuzberg($paragraphConfig);
$result = $kreuzberg->extractFile('essay.pdf');

echo "Paragraph-respecting chunks:\n";
echo str_repeat('=', 60) . "\n";

foreach ($result->chunks ?? [] as $chunk) {
    // Rough estimate: counts blank-line separators inside the chunk.
    $paragraphs = substr_count($chunk->content, "\n\n");
    echo "Chunk {$chunk->metadata->chunkIndex}: ~$paragraphs paragraphs\n";
    echo " " . strlen($chunk->content) . " characters\n\n";
}

// --- Prepare chunks as records for a database / vector store ---
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true
    )
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('knowledge_base.pdf');

// The document id depends only on the whole content, so hash it once
// instead of once per chunk.
$documentId = 'doc_' . md5($result->content);

$chunksForDb = [];
foreach ($result->chunks ?? [] as $chunk) {
    $chunksForDb[] = [
        'id' => uniqid('chunk_', true),
        'document_id' => $documentId,
        'chunk_index' => $chunk->metadata->chunkIndex,
        'content' => $chunk->content,
        'length' => strlen($chunk->content),
        'metadata' => [
            'source_file' => 'knowledge_base.pdf',
            'mime_type' => $result->mimeType,
            'created_at' => date('Y-m-d H:i:s'),
        ],
    ];
}

echo "Prepared " . count($chunksForDb) . " chunks for database:\n";
foreach (array_slice($chunksForDb, 0, 3) as $chunk) {
    echo " ID: {$chunk['id']}\n";
    echo " Index: {$chunk['chunk_index']}\n";
    echo " Length: {$chunk['length']} chars\n\n";
}

// Only announce success when encoding and writing both worked
// (json_encode() yields false e.g. on invalid UTF-8 in the content).
$json = json_encode($chunksForDb, JSON_PRETTY_PRINT);
if ($json === false || file_put_contents('chunks.json', $json) === false) {
    fwrite(STDERR, "Failed to save chunks to: chunks.json\n");
    exit(1);
}
echo "Saved chunks to: chunks.json\n";
|
||||
```
|
||||
66
docs/snippets/php/cli/basic_cli.php
Normal file
66
docs/snippets/php/cli/basic_cli.php
Normal file
@@ -0,0 +1,66 @@
|
||||
```php title="basic_cli.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Basic CLI Usage
|
||||
*
|
||||
* Simple command-line interface for document extraction.
|
||||
* Process documents from the terminal.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
$options = getopt('f:o:h', ['file:', 'output:', 'help']);

// Show usage when help is requested or the script was started without arguments.
if (isset($options['h']) || isset($options['help']) || empty($argv[1])) {
    echo "Kreuzberg Document Extraction CLI\n";
    echo str_repeat('=', 60) . "\n\n";
    echo "Usage: php basic_cli.php [options]\n\n";
    echo "Options:\n";
    echo " -f, --file <path> Input file to extract\n";
    echo " -o, --output <path> Output file (default: stdout)\n";
    echo " -h, --help Show this help message\n\n";
    echo "Examples:\n";
    echo " php basic_cli.php -f document.pdf\n";
    echo " php basic_cli.php -f report.docx -o output.txt\n";
    exit(0);
}

// The input comes from -f/--file or, failing that, the first raw argument.
$inputFile = $options['f'] ?? $options['file'] ?? $argv[1] ?? null;

if ($inputFile === null || !file_exists($inputFile)) {
    fwrite(STDERR, "Error: Input file not found: $inputFile\n");
    exit(1);
}

$outputFile = $options['o'] ?? $options['output'] ?? null;

try {
    // Progress and statistics go to STDERR so STDOUT carries only the content.
    fwrite(STDERR, "Extracting: $inputFile\n");
    $start = microtime(true);

    $result = extract_file($inputFile);

    $elapsed = microtime(true) - $start;
    fwrite(STDERR, "Extraction completed in " . number_format($elapsed, 3) . "s\n");
    fwrite(STDERR, "Content length: " . strlen($result->content) . " characters\n");
    fwrite(STDERR, "Tables found: " . count($result->tables) . "\n");

    if ($outputFile) {
        // Do not claim success when the write failed (bad path, permissions, ...).
        if (file_put_contents($outputFile, $result->content) === false) {
            fwrite(STDERR, "Error: Could not write output file: $outputFile\n");
            exit(1);
        }
        fwrite(STDERR, "Saved to: $outputFile\n");
    } else {
        echo $result->content;
    }

    exit(0);
} catch (\Kreuzberg\Exceptions\KreuzbergException $e) {
    fwrite(STDERR, "Error: " . $e->getMessage() . "\n");
    exit(1);
}
|
||||
```
|
||||
134
docs/snippets/php/cli/cli_with_config.php
Normal file
134
docs/snippets/php/cli/cli_with_config.php
Normal file
@@ -0,0 +1,134 @@
|
||||
```php title="cli_with_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Advanced CLI with Configuration
|
||||
*
|
||||
* Command-line tool with support for various extraction options.
|
||||
* Supports OCR, tables, images, and output formats.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
|
||||
$longOpts = [
    'file:',
    'output:',
    'format:',
    'ocr',
    'ocr-lang:',
    'tables',
    'images',
    'chunks:',
    'help',
];

$options = getopt('f:o:', $longOpts);

// Show usage when --help is given or no recognised option was parsed.
if (isset($options['help']) || empty($options)) {
    echo "Kreuzberg Advanced CLI\n";
    echo str_repeat('=', 60) . "\n\n";
    echo "Usage: php cli_with_config.php [options]\n\n";
    echo "Options:\n";
    echo " -f, --file <path> Input file to extract (required)\n";
    echo " -o, --output <path> Output file (default: stdout)\n";
    echo " --format <format> Output format: text, json, markdown\n";
    echo " --ocr Enable OCR for scanned documents\n";
    echo " --ocr-lang <lang> OCR language (default: eng)\n";
    echo " --tables Extract tables\n";
    echo " --images Extract images\n";
    echo " --chunks <size> Split into chunks of size\n";
    echo " --help Show this help message\n\n";
    echo "Examples:\n";
    echo " php cli_with_config.php --file scan.pdf --ocr\n";
    echo " php cli_with_config.php --file report.pdf --tables --format json\n";
    echo " php cli_with_config.php --file doc.pdf --chunks 512 --output chunks.json\n";
    exit(0);
}

$inputFile = $options['file'] ?? $options['f'] ?? null;

if ($inputFile === null || !file_exists($inputFile)) {
    fwrite(STDERR, "Error: Input file required and must exist\n");
    exit(1);
}

// Translate the command-line switches into extraction settings.
$enableOcr = isset($options['ocr']);
$ocrLang = $options['ocr-lang'] ?? 'eng';
$extractTables = isset($options['tables']);
$extractImages = isset($options['images']);
$chunkSize = isset($options['chunks']) ? (int)$options['chunks'] : null;
$format = $options['format'] ?? 'text';

$config = new ExtractionConfig(
    ocr: $enableOcr ? new OcrConfig(backend: 'tesseract', language: $ocrLang) : null,
    extractTables: $extractTables,
    extractImages: $extractImages,
    chunking: $chunkSize ? new ChunkingConfig(maxChunkSize: $chunkSize) : null,
    preserveFormatting: $format === 'markdown',
    outputFormat: $format === 'markdown' ? 'markdown' : null
);

try {
    // Progress and statistics go to STDERR so STDOUT carries only the result.
    fwrite(STDERR, "Processing: $inputFile\n");
    fwrite(STDERR, "Options:\n");
    fwrite(STDERR, " OCR: " . ($enableOcr ? "enabled ($ocrLang)" : "disabled") . "\n");
    fwrite(STDERR, " Tables: " . ($extractTables ? "enabled" : "disabled") . "\n");
    fwrite(STDERR, " Images: " . ($extractImages ? "enabled" : "disabled") . "\n");
    fwrite(STDERR, " Chunks: " . ($chunkSize ?? "disabled") . "\n");
    fwrite(STDERR, " Format: $format\n\n");

    $start = microtime(true);
    $kreuzberg = new Kreuzberg($config);
    $result = $kreuzberg->extractFile($inputFile);
    $elapsed = microtime(true) - $start;

    fwrite(STDERR, "Extraction completed in " . number_format($elapsed, 3) . "s\n");

    // Only 'json' needs a dedicated rendering; every other format (including
    // 'markdown', which is configured on the extractor above) emits the content as-is.
    $output = match ($format) {
        'json' => json_encode([
            'content' => $result->content,
            'metadata' => [
                'title' => $result->metadata->title,
                'author' => $result->metadata->author,
                'page_count' => $result->metadata->pageCount,
            ],
            'tables' => array_map(fn($t) => [
                'page' => $t->pageNumber,
                'markdown' => $t->markdown,
            ], $result->tables),
            'chunks' => $chunkSize ? array_map(fn($c) => [
                'index' => $c->metadata->chunkIndex,
                'content' => $c->content,
            ], $result->chunks ?? []) : null,
            // Throw instead of yielding false, which would silently produce empty output.
        ], JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR),
        default => $result->content,
    };

    $outputFile = $options['output'] ?? $options['o'] ?? null;
    if ($outputFile) {
        // Do not claim success when the write failed (bad path, permissions, ...).
        if (file_put_contents($outputFile, $output) === false) {
            fwrite(STDERR, "Error: Could not write output file: $outputFile\n");
            exit(1);
        }
        fwrite(STDERR, "Output written to: $outputFile\n");
    } else {
        echo $output;
    }

    fwrite(STDERR, "\nStatistics:\n");
    fwrite(STDERR, " Content: " . strlen($result->content) . " characters\n");
    fwrite(STDERR, " Tables: " . count($result->tables) . "\n");
    fwrite(STDERR, " Images: " . count($result->images ?? []) . "\n");
    fwrite(STDERR, " Chunks: " . count($result->chunks ?? []) . "\n");

    exit(0);
} catch (\JsonException $e) {
    fwrite(STDERR, "Error: Could not encode result as JSON: " . $e->getMessage() . "\n");
    exit(1);
} catch (\Kreuzberg\Exceptions\KreuzbergException $e) {
    fwrite(STDERR, "Error: " . $e->getMessage() . "\n");
    exit(1);
}
|
||||
```
|
||||
47
docs/snippets/php/config/advanced_config.md
Normal file
47
docs/snippets/php/config/advanced_config.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\OcrConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\LanguageDetectionConfig;
|
||||
use Kreuzberg\TokenReductionOptions;
|
||||
use Kreuzberg\PostProcessorConfig;
|
||||
use Kreuzberg\EmbeddingConfig;
|
||||
|
||||
// Advanced configuration combining multiple features
|
||||
// Build each feature's settings separately, then combine them.
$ocr = new OcrConfig(backend: 'tesseract', language: 'eng');
$chunking = new ChunkingConfig(maxCharacters: 1000, overlap: 200);
$languageDetection = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false
);
$tokenReduction = new TokenReductionOptions(
    mode: 'moderate',
    preserveImportantWords: true
);
$postprocessor = new PostProcessorConfig(enabled: true);

$config = new ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    ocr: $ocr,
    chunking: $chunking,
    languageDetection: $languageDetection,
    tokenReduction: $tokenReduction,
    postprocessor: $postprocessor
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Content length: %d characters\n", strlen($result->getContent()));

// Print the languages line only when at least one language was reported.
if ($result->getDetectedLanguages()) {
    echo "Languages: " . implode(', ', $result->getDetectedLanguages()) . "\n";
}
|
||||
?>
|
||||
```
|
||||
56
docs/snippets/php/config/chunking_config.md
Normal file
56
docs/snippets/php/config/chunking_config.md
Normal file
@@ -0,0 +1,56 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
|
||||
// Basic chunking
|
||||
// Chunks of up to 1000 characters with a 200-character overlap setting.
$chunking = new ChunkingConfig(maxCharacters: 1000, overlap: 200);
$config = new ExtractionConfig(chunking: $chunking);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Number of chunks: %d\n", count($result->getChunks()));

foreach ($result->getChunks() as $chunk) {
    printf("Chunk size: %d characters\n", strlen($chunk->getContent()));
}
|
||||
?>
|
||||
```
|
||||
|
||||
```php title="PHP - Markdown with Heading Context"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
|
||||
// Markdown chunker with heading context prepended to the chunks.
$chunking = new ChunkingConfig(
    maxCharacters: 500,
    overlap: 50,
    chunkerType: 'markdown',
    prependHeadingContext: true
);
$config = new ExtractionConfig(chunking: $chunking);

$result = Kreuzberg::extractFileSync('document.md', null, $config);

foreach ($result->getChunks() as $chunk) {
    $metadata = $chunk->getMetadata();

    // List the headings attached to this chunk, when it has any heading context.
    if ($metadata && $metadata->getHeadingContext()) {
        foreach ($metadata->getHeadingContext()->getHeadings() as $heading) {
            echo "Heading L" . $heading->getLevel() . ": " . $heading->getText() . "\n";
        }
    }

    echo "Content: " . substr($chunk->getContent(), 0, 100) . "...\n";
}
|
||||
?>
|
||||
```
|
||||
17
docs/snippets/php/config/config_basic.md
Normal file
17
docs/snippets/php/config/config_basic.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// Minimal configuration: caching and quality processing switched on.
$config = new ExtractionConfig(useCache: true, enableQualityProcessing: true);

// Extract the document and print its content.
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);
$content = $result->getContent();

echo $content;
|
||||
?>
|
||||
```
|
||||
13
docs/snippets/php/config/config_discover.md
Normal file
13
docs/snippets/php/config/config_discover.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```php title="PHP"
|
||||
<?php
declare(strict_types=1);

// NOTE(review): ExtractionConfig was referenced without an import. The namespace
// below follows the neighbouring config snippets — confirm it against the package.
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Discover configuration from file system; fall back to a default
// configuration when discover() yields null.
$config = ExtractionConfig::discover() ?? new ExtractionConfig();
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

echo $result->getContent();
|
||||
?>
|
||||
```
|
||||
21
docs/snippets/php/config/config_ocr.md
Normal file
21
docs/snippets/php/config/config_ocr.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\OcrConfig;
|
||||
|
||||
// OCR through the 'tesseract' backend with language 'eng'.
$ocr = new OcrConfig(backend: 'tesseract', language: 'eng');
$config = new ExtractionConfig(ocr: $ocr);

$result = Kreuzberg::extractFileSync('scanned.pdf', null, $config);

printf("Content length: %d characters\n", strlen($result->getContent()));
printf("Tables detected: %d\n", count($result->getTables()));
|
||||
?>
|
||||
```
|
||||
29
docs/snippets/php/config/config_programmatic.md
Normal file
29
docs/snippets/php/config/config_programmatic.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\OcrConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\TesseractConfig;
|
||||
|
||||
// OCR settings: 'tesseract' backend, 'eng+deu' language string, PSM 6.
$ocr = new OcrConfig(
    backend: 'tesseract',
    language: 'eng+deu',
    tesseractConfig: new TesseractConfig(psm: 6)
);

// Chunking settings.
$chunking = new ChunkingConfig(maxCharacters: 1000, overlap: 200);

$config = new ExtractionConfig(
    useCache: true,
    ocr: $ocr,
    chunking: $chunking,
    enableQualityProcessing: true
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Content length: %d characters\n", strlen($result->getContent()));
|
||||
?>
|
||||
```
|
||||
16
docs/snippets/php/config/document_structure_config.md
Normal file
16
docs/snippets/php/config/document_structure_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```php title="Document Structure Config (PHP)"
|
||||
<?php
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
// Ask the extractor to include the document structure in the result.
$config = new ExtractionConfig(includeDocumentStructure: true);

// NOTE(review): the config is passed as the third argument with a null second
// argument, matching the other extractFileSync() calls in these snippets;
// it was previously passed in the second position.
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Print the node type of every node when a document structure is present.
if ($result->document !== null) {
    foreach ($result->document->nodes as $node) {
        echo "[{$node->content->nodeType}]\n";
    }
}
|
||||
?>
|
||||
```
|
||||
42
docs/snippets/php/config/element_based_output.md
Normal file
42
docs/snippets/php/config/element_based_output.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```php title="Element-Based Output (PHP)"
|
||||
<?php
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
// Configure element-based output
|
||||
// Configure element-based output
$config = new ExtractionConfig();
$config->setOutputFormat('element_based');

// Extract document.
// NOTE(review): the config is passed as the third argument with a null second
// argument, matching the other extractFileSync() calls in these snippets;
// it was previously passed in the second position.
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Access elements
foreach ($result->getElements() as $element) {
    echo "Type: " . $element->getElementType() . "\n";
    echo "Text: " . substr($element->getText(), 0, 100) . "\n";

    // Page number and coordinates are printed only when present.
    if ($element->getMetadata()->getPageNumber()) {
        echo "Page: " . $element->getMetadata()->getPageNumber() . "\n";
    }

    if ($element->getMetadata()->getCoordinates()) {
        $coords = $element->getMetadata()->getCoordinates();
        echo sprintf("Coords: (%s, %s) - (%s, %s)\n",
            $coords->getLeft(), $coords->getTop(),
            $coords->getRight(), $coords->getBottom());
    }

    echo "---\n";
}

// Filter by element type
$titles = array_filter($result->getElements(), function($e) {
    return $e->getElementType() === 'title';
});

foreach ($titles as $title) {
    // 'level' is read from the element's additional metadata when available.
    $level = $title->getMetadata()->getAdditional()['level'] ?? 'unknown';
    echo "[{$level}] {$title->getText()}\n";
}
|
||||
?>
|
||||
```
|
||||
27
docs/snippets/php/config/embedding_config.md
Normal file
27
docs/snippets/php/config/embedding_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\EmbeddingConfig;
|
||||
|
||||
// Embedding settings are nested inside the chunking configuration.
$embedding = new EmbeddingConfig(
    model: 'balanced',
    batchSize: 16,
    normalize: true,
    showDownloadProgress: true
);

$chunking = new ChunkingConfig(
    maxCharacters: 1000,
    overlap: 200,
    embedding: $embedding
);

$config = new ExtractionConfig(chunking: $chunking);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Chunks with embeddings: %d\n", count($result->getChunks()));
|
||||
?>
|
||||
```
|
||||
21
docs/snippets/php/config/html_output.md
Normal file
21
docs/snippets/php/config/html_output.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\HtmlOutputConfig;
|
||||
|
||||
// Request HTML output with the 'github' theme.
$htmlOutput = new HtmlOutputConfig(theme: 'github');
$config = new ExtractionConfig(resultFormat: 'html', htmlOutput: $htmlOutput);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Output HTML with kb-* CSS classes
$html = $result->getContent();
echo $html;
|
||||
?>
|
||||
```
|
||||
26
docs/snippets/php/config/keyword_extraction_config.md
Normal file
26
docs/snippets/php/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\KeywordConfig;
|
||||
|
||||
// Keyword extraction settings.
$keywords = new KeywordConfig(
    algorithm: 'yake',
    maxKeywords: 10,
    minScore: 0.1,
    language: 'en'
);
$config = new ExtractionConfig(keywords: $keywords);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Print one keyword per line when any were returned.
if ($result->getKeywords()) {
    foreach ($result->getKeywords() as $keyword) {
        echo $keyword . "\n";
    }
}
|
||||
?>
|
||||
```
|
||||
22
docs/snippets/php/config/language_detection_config.md
Normal file
22
docs/snippets/php/config/language_detection_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\LanguageDetectionConfig;
|
||||
|
||||
// Language detection settings.
$languageDetection = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: true
);
$config = new ExtractionConfig(languageDetection: $languageDetection);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Report the detected language and its confidence.
echo "Detected language: " . $result->getLanguage() . "\n";
echo "Confidence: " . $result->getLanguageConfidence() . "\n";
|
||||
?>
|
||||
```
|
||||
24
docs/snippets/php/config/ocr_dpi_config.md
Normal file
24
docs/snippets/php/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ImageExtractionConfig;
|
||||
|
||||
// Image extraction with explicit DPI settings.
$images = new ImageExtractionConfig(
    extractImages: true,
    targetDpi: 300,
    maxImageDimension: 4096,
    autoAdjustDpi: true,
    minDpi: 150,
    maxDpi: 600
);
$config = new ExtractionConfig(images: $images);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Extracted images: %d\n", count($result->getImages()));
|
||||
?>
|
||||
```
|
||||
33
docs/snippets/php/config/pdf_config.md
Normal file
33
docs/snippets/php/config/pdf_config.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use function Kreuzberg\extract_file;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\PdfConfig;
|
||||
|
||||
/**
 * PDF configuration with hierarchy detection.
 *
 * Builds a PdfConfig (image and metadata extraction, a list of candidate
 * passwords, and hierarchy options passed as an associative array), runs one
 * extraction, and prints the content length and the metadata keys.
 */
$config = new ExtractionConfig(
    pdf: new PdfConfig(
        extractImages: true,
        extractMetadata: true,
        passwords: ['password1', 'password2'],
        hierarchy: [
            'enabled' => true,
            'k_clusters' => 6,
            'include_bbox' => true,
            'ocr_coverage_threshold' => 0.5,
        ],
    ),
);

$result = extract_file('document.pdf', config: $config);

// mb_strlen counts characters; strlen counts bytes and would over-report
// multi-byte text. Assumes the extracted content is UTF-8.
echo "Content length: " . mb_strlen($result->content, 'UTF-8') . " characters\n";
echo "Metadata: " . implode(', ', array_keys((array) $result->metadata)) . "\n";
|
||||
```
|
||||
26
docs/snippets/php/config/pdf_hierarchy_config.md
Normal file
26
docs/snippets/php/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\PdfConfig;
|
||||
use Kreuzberg\HierarchyConfig;
|
||||
|
||||
// Hierarchy settings for PDFs: thresholds plus minLevel 1 and maxLevel 5.
$hierarchy = new HierarchyConfig(
    enabled: true,
    detectionThreshold: 0.75,
    ocrCoverageThreshold: 0.8,
    minLevel: 1,
    maxLevel: 5,
);
$config = new ExtractionConfig(
    pdfOptions: new PdfConfig(hierarchy: $hierarchy),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Report the number of entries in the returned hierarchy.
$levelCount = count($result->getHierarchy());
echo "Hierarchy levels: {$levelCount}\n";
|
||||
?>
|
||||
```
|
||||
23
docs/snippets/php/config/postprocessor_config.md
Normal file
23
docs/snippets/php/config/postprocessor_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\PostProcessorConfig;
|
||||
|
||||
// Enable post-processing and name the two processors that should run.
$config = new ExtractionConfig(
    postprocessor: new PostProcessorConfig(
        enabled: true,
        enabledProcessors: [
            'whitespace_normalizer',
            'unicode_normalizer',
        ],
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Preview the first 100 characters. mb_substr cuts on character boundaries,
// whereas substr cuts on bytes and can split a multi-byte character in half.
// Assumes the extracted content is UTF-8.
echo "Processed content: " . mb_substr($result->getContent(), 0, 100, 'UTF-8') . "...\n";
|
||||
?>
|
||||
```
|
||||
20
docs/snippets/php/config/quality_processing_config.md
Normal file
20
docs/snippets/php/config/quality_processing_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// Quality processing and caching are both switched on at the top level.
$config = new ExtractionConfig(enableQualityProcessing: true, useCache: true);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

$qualityScore = $result->getQualityScore();
echo "Quality score: {$qualityScore}\n";

// The processing time is printed only when the result reports a truthy value.
if ($result->getProcessingTime()) {
    echo "Processing time: {$result->getProcessingTime()}ms\n";
}
|
||||
?>
|
||||
```
|
||||
25
docs/snippets/php/config/tesseract_config.md
Normal file
25
docs/snippets/php/config/tesseract_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\OcrConfig;
|
||||
use Kreuzberg\TesseractConfig;
|
||||
|
||||
// Tesseract backend with the 'eng+deu' language string; psm and oem are
// handed to TesseractConfig unchanged.
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+deu',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            oem: 3,
        ),
    ),
);

$result = Kreuzberg::extractFileSync('scanned.pdf', null, $config);

// Preview the first 100 characters. mb_substr is used because the configured
// languages include German, whose umlauts are multi-byte in UTF-8; substr
// could cut one in half. Assumes the extracted content is UTF-8.
echo "OCR text: " . mb_substr($result->getContent(), 0, 100, 'UTF-8') . "...\n";
|
||||
?>
|
||||
```
|
||||
21
docs/snippets/php/config/token_reduction_config.md
Normal file
21
docs/snippets/php/config/token_reduction_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\TokenReductionOptions;
|
||||
|
||||
// Token reduction in 'moderate' mode with preserveImportantWords enabled.
$config = new ExtractionConfig(
    tokenReduction: new TokenReductionOptions(
        mode: 'moderate',
        preserveImportantWords: true,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

echo "Original token count: " . $result->getTokenCount() . "\n";
// Preview the first 100 characters; mb_substr avoids cutting a multi-byte
// character in half the way byte-wise substr can. Assumes UTF-8 content.
echo "Reduced content: " . mb_substr($result->getContent(), 0, 100, 'UTF-8') . "...\n";
|
||||
?>
|
||||
```
|
||||
207
docs/snippets/php/configuration/chunking_config.php
Normal file
207
docs/snippets/php/configuration/chunking_config.php
Normal file
@@ -0,0 +1,207 @@
|
||||
```php title="chunking_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Text Chunking Configuration
|
||||
*
|
||||
* This example demonstrates how to configure text chunking for RAG (Retrieval-Augmented Generation)
|
||||
* applications. Chunking splits long documents into smaller, semantically meaningful segments.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
|
||||
echo "Example 1: Basic Chunking\n",
    "=========================\n";

// ChunkingConfig built with no arguments, so the class's own defaults apply.
$defaultChunking = new ChunkingConfig();
$config1 = new ExtractionConfig(chunking: $defaultChunking);

$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('long_document.pdf');

if ($result->chunks !== null) {
    $total = count($result->chunks);
    echo "Total chunks: {$total}\n";

    foreach ($result->chunks as $i => $chunk) {
        $meta = $chunk->metadata;

        echo "\nChunk {$i}:\n",
            "- Text length: {$meta->charCount} characters\n",
            "- Byte range: {$meta->byteStart}-{$meta->byteEnd}\n";

        // The page range is printed only when the chunk reports a first page.
        if ($meta->firstPage !== null) {
            echo "- Pages: {$meta->firstPage}-{$meta->lastPage}\n";
        }
    }
}

echo "\n\n";
|
||||
|
||||
echo "Example 2: Custom Chunk Size (Small chunks for fine-grained retrieval)\n",
    "======================================================================\n";

// Small chunks: 256 max size, 25 overlap, sentence-aware but not paragraph-aware.
$smallChunks = new ChunkingConfig(
    maxChunkSize: 256,
    chunkOverlap: 25,
    respectSentences: true,
    respectParagraphs: false,
);
$config2 = new ExtractionConfig(chunking: $smallChunks);

$extractor2 = new Kreuzberg($config2);
$result2 = $extractor2->extractFile('document.pdf');

// A missing or null chunk list counts as zero chunks.
$created2 = count($result2->chunks ?? []);
echo "Chunks created: {$created2}\n\n";
|
||||
|
||||
echo "Example 3: Large Chunks (More context per chunk)\n",
    "================================================\n";

// Large chunks: 2000 max size, 200 overlap, respecting sentences and paragraphs.
$largeChunks = new ChunkingConfig(
    maxChunkSize: 2000,
    chunkOverlap: 200,
    respectSentences: true,
    respectParagraphs: true,
);
$config3 = new ExtractionConfig(chunking: $largeChunks);

$extractor3 = new Kreuzberg($config3);
$result3 = $extractor3->extractFile('document.pdf');

// A missing or null chunk list counts as zero chunks.
$created3 = count($result3->chunks ?? []);
echo "Chunks created: {$created3}\n\n";
|
||||
|
||||
echo "Example 4: RAG-Optimized Configuration\n";
echo "=====================================\n";

// 512-size chunks with 50 overlap, split on sentence boundaries only.
$config4 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true,
        respectParagraphs: false,
    ),
);

$result4 = (new Kreuzberg($config4))->extractFile('document.pdf');

if ($result4->chunks !== null) {
    echo "Total chunks: " . count($result4->chunks) . "\n";

    $chunkSizes = array_map(
        static fn ($chunk) => $chunk->metadata->charCount,
        $result4->chunks,
    );

    // Statistics need at least one chunk: with an empty list the average
    // divides by zero and min()/max() throw a ValueError.
    if ($chunkSizes !== []) {
        echo "Average chunk size: " . round(array_sum($chunkSizes) / count($chunkSizes)) . " characters\n";
        echo "Min chunk size: " . min($chunkSizes) . " characters\n";
        echo "Max chunk size: " . max($chunkSizes) . " characters\n";
    }
}

echo "\n\n";
|
||||
|
||||
echo "Example 5: Processing Chunks for Vector Database\n",
    "================================================\n";

$chunking5 = new ChunkingConfig(
    maxChunkSize: 512,
    chunkOverlap: 50,
    respectSentences: true,
);
$config5 = new ExtractionConfig(chunking: $chunking5);

$extractor5 = new Kreuzberg($config5);
$result5 = $extractor5->extractFile('document.pdf');

if ($result5->chunks !== null) {
    // The document id is the same for every chunk, so it is set once up front.
    $documentId = "doc_123";

    foreach ($result5->chunks as $i => $chunk) {
        $meta = $chunk->metadata;

        // Page range as "first-last", or null when the chunk has no first page.
        if ($meta->firstPage !== null) {
            $pageRange = "{$meta->firstPage}-{$meta->lastPage}";
        } else {
            $pageRange = null;
        }

        // NOTE(review): $chunkData is assembled but not used further in this
        // snippet; presumably the database insert call belongs here.
        $chunkData = [
            'document_id' => $documentId,
            'chunk_index' => $i,
            'text' => $chunk->content,
            'char_count' => $meta->charCount,
            'byte_start' => $meta->byteStart,
            'byte_end' => $meta->byteEnd,
            'page_range' => $pageRange,
        ];

        echo "Prepared chunk {$i} for database insertion\n";
    }
}

echo "\n\n";
|
||||
|
||||
echo "Example 6: Markdown Chunker with Token-Based Sizing and Heading Context\n";
echo "========================================================================\n";

// Markdown chunker whose sizing is delegated to a tokenizer model.
$config6 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        chunkerType: 'markdown',
        sizing: [
            'type' => 'tokenizer',
            'model' => 'Xenova/gpt-4o',
        ],
    ),
);

$result6 = (new Kreuzberg($config6))->extractFile('document.md');

if ($result6->chunks !== null) {
    echo "Total chunks: " . count($result6->chunks) . "\n";

    foreach ($result6->chunks as $i => $chunk) {
        echo "\nChunk {$i}:\n";
        // mb_substr cuts on character boundaries; byte-wise substr could split
        // a multi-byte character. Assumes chunk content is UTF-8.
        echo "- Text preview: " . mb_substr($chunk->content, 0, 60, 'UTF-8') . "...\n";

        // Heading context is optional, so probe the whole property chain first.
        if (isset($chunk->metadata->headingContext->headings)) {
            $headings = $chunk->metadata->headingContext->headings;
            echo "- Headings in context:\n";
            foreach ($headings as $heading) {
                echo " - Level {$heading->level}: {$heading->text}\n";
            }
        }
    }
}
|
||||
|
||||
// Reference list of the ChunkingConfig parameters used in the examples above.
echo
    "\n\nChunking Configuration Parameters:\n",
    "==================================\n",
    "- maxChunkSize: Maximum number of characters per chunk\n",
    "- chunkOverlap: Number of overlapping characters between chunks\n",
    "- respectSentences: Split at sentence boundaries when possible\n",
    "- respectParagraphs: Split at paragraph boundaries when possible\n",
    "- chunkerType: Type of chunker ('simple' or 'markdown')\n",
    "- sizing: Sizing strategy configuration\n",
    " - type: 'character' or 'tokenizer'\n",
    " - model: Tokenizer model (e.g., 'Xenova/gpt-4o')\n",
    "\n\n";
|
||||
|
||||
echo "Example 7: Prepend Heading Context\n";
echo "====================================\n";

// Markdown chunker with prependHeadingContext switched on.
$config7 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        chunkerType: 'markdown',
        prependHeadingContext: true,
    ),
);

$result7 = (new Kreuzberg($config7))->extractFile('document.md');

if ($result7->chunks !== null) {
    echo "Total chunks: " . count($result7->chunks) . "\n";

    foreach ($result7->chunks as $i => $chunk) {
        // Each chunk's content is prefixed with its heading breadcrumb,
        // e.g. "# Section > ## Subsection\n\nActual content..."
        // mb_substr keeps the 80-character preview on character boundaries;
        // byte-wise substr could split a multi-byte character. Assumes UTF-8.
        echo "\nChunk {$i} preview: " . mb_substr($chunk->content, 0, 80, 'UTF-8') . "...\n";
    }
}
|
||||
|
||||
// Closing summary of chunking recommendations.
echo
    "\nBest Practices:\n",
    "- Use 256-512 chars for fine-grained retrieval\n",
    "- Use 1000-2000 chars for more context\n",
    "- Set overlap to ~10% of chunk size\n",
    "- Enable respectSentences for better coherence\n",
    "- Use markdown chunker for structured documents with headings\n",
    "- Use token-based sizing for LLM token budgets\n",
    "- Enable prependHeadingContext to embed heading breadcrumbs in chunk content\n";
|
||||
```
|
||||
200
docs/snippets/php/configuration/embedding_config.php
Normal file
200
docs/snippets/php/configuration/embedding_config.php
Normal file
@@ -0,0 +1,200 @@
|
||||
```php title="embedding_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Embedding Generation Configuration
|
||||
*
|
||||
* This example demonstrates how to configure embedding generation for semantic search
|
||||
* and vector database applications.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
|
||||
echo "Example 1: Basic Embedding Generation\n";
echo "=====================================\n";

// Default EmbeddingConfig on top of 512-size chunks with 50 overlap.
$config1 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
    ),
    embedding: new EmbeddingConfig(),
);

$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('document.pdf');

if ($result->chunks !== null) {
    foreach ($result->chunks as $i => $chunk) {
        echo "\nChunk {$i}:\n";
        // NOTE(review): the chunk text is read from `content`, the property the
        // chunking snippets use; confirm against the Chunk type.
        // mb_substr keeps the preview on character boundaries (assumes UTF-8).
        echo "- Text: " . mb_substr($chunk->content, 0, 50, 'UTF-8') . "...\n";
        // The embedding is optional; show its size and first values when present.
        if ($chunk->embedding !== null) {
            echo "- Embedding dimension: " . count($chunk->embedding) . "\n";
            echo "- First 5 values: [" . implode(', ', array_slice($chunk->embedding, 0, 5)) . "...]\n";
        }
    }
}

echo "\n\n";
|
||||
|
||||
echo "Example 2: Different Embedding Models\n",
    "====================================\n";

// Variant A: all-MiniLM-L6-v2, normalized, batches of 32.
$chunking2a = new ChunkingConfig(maxChunkSize: 512);
$embedding2a = new EmbeddingConfig(
    model: 'all-MiniLM-L6-v2',
    normalize: true,
    batchSize: 32,
);
$config2a = new ExtractionConfig(chunking: $chunking2a, embedding: $embedding2a);

echo "Model: all-MiniLM-L6-v2\n",
    "- Dimensions: 384\n",
    "- Speed: Very Fast\n",
    "- Use case: General purpose, quick retrieval\n\n";

// Variant B: all-mpnet-base-v2, normalized, batches of 16.
$chunking2b = new ChunkingConfig(maxChunkSize: 512);
$embedding2b = new EmbeddingConfig(
    model: 'all-mpnet-base-v2',
    normalize: true,
    batchSize: 16,
);
$config2b = new ExtractionConfig(chunking: $chunking2b, embedding: $embedding2b);

echo "Model: all-mpnet-base-v2\n",
    "- Dimensions: 768\n",
    "- Speed: Medium\n",
    "- Use case: Higher quality semantic search\n\n";
|
||||
|
||||
echo "Example 3: Normalized vs Non-Normalized Embeddings\n",
    "==================================================\n";

// Same model twice; only the `normalize` flag differs between 3a and 3b.
$chunking3a = new ChunkingConfig(maxChunkSize: 512);
$embedding3a = new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: true);
$config3a = new ExtractionConfig(chunking: $chunking3a, embedding: $embedding3a);

echo "Normalized embeddings:\n",
    "- Better for cosine similarity\n",
    "- Values in range [-1, 1]\n",
    "- Faster similarity computation\n\n";

$chunking3b = new ChunkingConfig(maxChunkSize: 512);
$embedding3b = new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: false);
$config3b = new ExtractionConfig(chunking: $chunking3b, embedding: $embedding3b);

echo "Non-normalized embeddings:\n",
    "- Raw model output\n",
    "- Useful for specific distance metrics\n\n";
|
||||
|
||||
echo "Example 4: Batch Size Configuration\n",
    "===================================\n";

// Same model and normalization; 4a uses batches of 8, 4b batches of 64.
$chunking4a = new ChunkingConfig(maxChunkSize: 512);
$embedding4a = new EmbeddingConfig(
    model: 'all-MiniLM-L6-v2',
    normalize: true,
    batchSize: 8,
);
$config4a = new ExtractionConfig(chunking: $chunking4a, embedding: $embedding4a);

echo "Batch size: 8\n",
    "- Lower memory usage\n",
    "- Slower processing\n",
    "- Good for limited resources\n\n";

$chunking4b = new ChunkingConfig(maxChunkSize: 512);
$embedding4b = new EmbeddingConfig(
    model: 'all-MiniLM-L6-v2',
    normalize: true,
    batchSize: 64,
);
$config4b = new ExtractionConfig(chunking: $chunking4b, embedding: $embedding4b);

echo "Batch size: 64\n",
    "- Higher memory usage\n",
    "- Faster processing\n",
    "- Good for high-performance systems\n\n";
|
||||
|
||||
echo "Example 5: Complete RAG Pipeline\n";
echo "================================\n";

// Sentence-aware 512/50 chunking combined with normalized MiniLM embeddings.
$config5 = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxChunkSize: 512,
        chunkOverlap: 50,
        respectSentences: true,
    ),
    embedding: new EmbeddingConfig(
        model: 'all-MiniLM-L6-v2',
        normalize: true,
        batchSize: 32,
    ),
);

$result5 = (new Kreuzberg($config5))->extractFile('document.pdf');

if ($result5->chunks !== null) {
    echo "Processing " . count($result5->chunks) . " chunks with embeddings...\n\n";

    $vectorDbData = [];
    foreach ($result5->chunks as $i => $chunk) {
        // Chunks without an embedding cannot go into a vector store; skip them.
        if ($chunk->embedding === null) {
            continue;
        }

        $vectorDbData[] = [
            'id' => "chunk_{$i}",
            // NOTE(review): text is read from `content`, the property the
            // chunking snippets use; confirm against the Chunk type.
            'text' => $chunk->content,
            'embedding' => $chunk->embedding,
            'metadata' => [
                'char_count' => $chunk->metadata->charCount,
                'page_range' => $chunk->metadata->firstPage !== null
                    ? "{$chunk->metadata->firstPage}-{$chunk->metadata->lastPage}"
                    : null,
            ],
        ];
    }

    echo "Prepared " . count($vectorDbData) . " vectors for database\n";
    // Only inspect the first vector when at least one was collected; otherwise
    // index 0 does not exist and count() would be handed null.
    if ($vectorDbData !== []) {
        echo "Each vector has " . count($vectorDbData[0]['embedding']) . " dimensions\n";
    }
}
|
||||
|
||||
// Reference list of the EmbeddingConfig parameters shown above.
echo
    "\n\nEmbedding Configuration Parameters:\n",
    "===================================\n",
    "- model: Embedding model name\n",
    " * 'all-MiniLM-L6-v2': 384 dims, fast, general purpose\n",
    " * 'all-mpnet-base-v2': 768 dims, higher quality\n",
    "- normalize: L2 normalize embeddings (recommended: true)\n",
    "- batchSize: Number of chunks to process at once\n";

// Usage recommendations.
echo
    "\nBest Practices:\n",
    "- Use normalized embeddings for cosine similarity\n",
    "- Choose batch size based on available memory\n",
    "- Use all-MiniLM-L6-v2 for speed, all-mpnet-base-v2 for quality\n",
    "- Combine with chunking for optimal RAG performance\n";

// Comparison table of commonly used models.
echo
    "\n\nCommon Embedding Models:\n",
    "========================\n",
    "Model | Dimensions | Speed | Use Case\n",
    "--------------------------|------------|----------|---------------------------\n",
    "all-MiniLM-L6-v2 | 384 | Fast | General purpose, QA\n",
    "all-mpnet-base-v2 | 768 | Medium | Better semantic search\n",
    "paraphrase-MiniLM-L6-v2 | 384 | Fast | Paraphrase detection\n",
    "paraphrase-mpnet-base-v2 | 768 | Medium | High-quality paraphrase\n";
|
||||
```
|
||||
65
docs/snippets/php/configuration/extraction_config.php
Normal file
65
docs/snippets/php/configuration/extraction_config.php
Normal file
@@ -0,0 +1,65 @@
|
||||
```php title="extraction_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* ExtractionConfig - Main Configuration
|
||||
*
|
||||
* The ExtractionConfig class is the primary configuration object that controls
|
||||
* all aspects of document extraction. It can be passed to the Kreuzberg constructor
|
||||
* or to individual extraction methods.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\PdfConfig;
|
||||
|
||||
// Basic configuration: images and tables on, formatting preservation off.
$config = new ExtractionConfig(
    extractImages: true,
    extractTables: true,
    preserveFormatting: false,
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('document.pdf');

// A null image list is counted as zero images.
$imageTotal = count($result->images ?? []);
echo "Extracted with images: {$imageTotal}\n";

$tableTotal = count($result->tables);
echo "Extracted with tables: {$tableTotal}\n\n";
|
||||
|
||||
// Advanced configuration: nested OCR and PDF settings plus top-level switches.
$ocrSettings = new OcrConfig(backend: 'tesseract', language: 'eng');
$pdfSettings = new PdfConfig(extractImages: true, imageQuality: 95);

$advancedConfig = new ExtractionConfig(
    ocr: $ocrSettings,
    pdf: $pdfSettings,
    extractImages: true,
    extractTables: true,
    preserveFormatting: true,
    outputFormat: 'markdown',
);

$kreuzberg = new Kreuzberg($advancedConfig);
$result = $kreuzberg->extractFile('complex_document.pdf');

echo "Advanced extraction complete\n";

// Fall back to 'plain' when no output format is set on the config.
$formatLabel = $advancedConfig->outputFormat ?? 'plain';
echo "Content format: {$formatLabel}\n";

if ($advancedConfig->preserveFormatting) {
    $preservedLabel = 'Yes';
} else {
    $preservedLabel = 'No';
}
echo "Formatting preserved: {$preservedLabel}\n";
|
||||
|
||||
// The instance is built with table extraction off ...
$defaultConfig = new ExtractionConfig(extractTables: false);
$kreuzberg = new Kreuzberg($defaultConfig);

$result1 = $kreuzberg->extractFile('doc1.pdf');

// ... and a second config with tables on is passed for one call only.
$overrideConfig = new ExtractionConfig(extractTables: true);
$result2 = $kreuzberg->extractFile('doc2.pdf', config: $overrideConfig);

$doc1Tables = count($result1->tables);
echo "\nDoc1 tables: {$doc1Tables}\n";

$doc2Tables = count($result2->tables);
echo "Doc2 tables: {$doc2Tables}\n";
|
||||
```
|
||||
277
docs/snippets/php/configuration/image_extraction_config.php
Normal file
277
docs/snippets/php/configuration/image_extraction_config.php
Normal file
@@ -0,0 +1,277 @@
|
||||
```php title="image_extraction_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Image Extraction Configuration
|
||||
*
|
||||
* This example demonstrates how to configure image extraction from documents,
|
||||
* including size filtering and OCR on extracted images.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ImageExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
echo "Example 1: Basic Image Extraction\n",
    "=================================\n";

// Image extraction switched on with no further options.
$basicImages = new ImageExtractionConfig(extractImages: true);
$config1 = new ExtractionConfig(imageExtraction: $basicImages);

$kreuzberg = new Kreuzberg($config1);
$result = $kreuzberg->extractFile('presentation.pptx');

if ($result->images !== null) {
    $imageTotal = count($result->images);
    echo "Total images extracted: {$imageTotal}\n";

    foreach ($result->images as $i => $image) {
        // strlen gives the byte length of the image payload.
        $byteSize = strlen($image->data);

        echo "\nImage {$i}:\n",
            "- Format: {$image->format}\n",
            "- Size: {$image->width}x{$image->height} pixels\n",
            "- Page: {$image->pageNumber}\n",
            "- Data size: {$byteSize} bytes\n";
    }
}

echo "\n\n";
|
||||
|
||||
echo "Example 2: Image Extraction with Size Filter\n",
    "============================================\n";

// Extraction with minWidth and minHeight both set to 200.
$filteredImages = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 200,
    minHeight: 200,
);
$config2 = new ExtractionConfig(imageExtraction: $filteredImages);

$extractor2 = new Kreuzberg($config2);
$result2 = $extractor2->extractFile('document.pdf');

echo "Filtering images smaller than 200x200 pixels\n";
if ($result2->images !== null) {
    $kept = count($result2->images);
    echo "Filtered images: {$kept}\n";
}

echo "\n\n";
|
||||
|
||||
echo "Example 3: Extract Only Large Images\n",
    "====================================\n";

// Configuration only (no extraction is run): minimum size 800x600.
$largeOnly = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 800,
    minHeight: 600,
);
$config3 = new ExtractionConfig(imageExtraction: $largeOnly);

echo "Configured to extract images >= 800x600 pixels\n",
    "Good for: Photos, large diagrams, full-page scans\n\n";
|
||||
|
||||
echo "Example 4: Extract All Images (Including Thumbnails)\n",
    "===================================================\n";

// Configuration only (no extraction is run): minimum size 50x50.
$includeSmall = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 50,
    minHeight: 50,
);
$config4 = new ExtractionConfig(imageExtraction: $includeSmall);

echo "Configured to extract images >= 50x50 pixels\n",
    "Good for: Extracting all images including icons and thumbnails\n\n";
|
||||
|
||||
echo "Example 5: Image Extraction with OCR\n";
echo "====================================\n";

// Image extraction with performOcr on, paired with a Tesseract/English OCR config.
$config5 = new ExtractionConfig(
    imageExtraction: new ImageExtractionConfig(
        extractImages: true,
        performOcr: true,
        minWidth: 100,
        minHeight: 100,
    ),
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
);

$result5 = (new Kreuzberg($config5))->extractFile('document_with_images.pdf');

if ($result5->images !== null) {
    echo "Extracted " . count($result5->images) . " images with OCR:\n\n";

    foreach ($result5->images as $i => $image) {
        echo "Image {$i} (Page {$image->pageNumber}):\n";
        echo "- Size: {$image->width}x{$image->height}\n";

        // Not every image carries an OCR result.
        if ($image->ocrResult !== null) {
            $ocrText = $image->ocrResult->content;
            // mb_* functions work in characters; substr/strlen work in bytes,
            // which can split a multi-byte character and mis-report the
            // "characters" figure. Assumes the OCR text is UTF-8.
            echo "- OCR Text: " . mb_substr($ocrText, 0, 100, 'UTF-8') . "...\n";
            echo "- OCR Text Length: " . mb_strlen($ocrText, 'UTF-8') . " characters\n";
        }
        echo "\n";
    }
}

echo "\n\n";
|
||||
|
||||
echo "Example 6: Extract and Save Images to Disk\n";
echo "=========================================\n";

$config6 = new ExtractionConfig(
    imageExtraction: new ImageExtractionConfig(
        extractImages: true,
        minWidth: 200,
        minHeight: 200,
    ),
);

$result6 = (new Kreuzberg($config6))->extractFile('presentation.pptx');

if ($result6->images !== null) {
    $outputDir = 'extracted_images';
    // is_dir() is checked again after a failed mkdir(): another process may
    // have created the directory in between. Without a usable directory every
    // write below would fail, so stop here.
    if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
        throw new \RuntimeException("Unable to create output directory: {$outputDir}");
    }

    foreach ($result6->images as $i => $image) {
        $filename = "{$outputDir}/image_{$i}_page_{$image->pageNumber}.{$image->format}";

        // Strict mode returns false on invalid base64 instead of silently
        // decoding garbage into the output file.
        $imageData = base64_decode($image->data, true);
        if ($imageData === false) {
            echo "Skipped: {$filename} (image data is not valid base64)\n";
            continue;
        }

        // Only report success when the write actually succeeded.
        if (file_put_contents($filename, $imageData) === false) {
            echo "Failed to write: {$filename}\n";
            continue;
        }

        echo "Saved: {$filename} ({$image->width}x{$image->height})\n";
    }
}

echo "\n\n";
|
||||
|
||||
echo "Example 7: File Type-Specific Image Extraction\n",
    "==============================================\n";

// Three configurations, one per input type; none of them is run here.

// PDFs: 300x300 minimum, OCR off.
$pdfImages = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 300,
    minHeight: 300,
    performOcr: false,
);
$pdfConfig = new ExtractionConfig(imageExtraction: $pdfImages);

// Presentations: 100x100 minimum, OCR off.
$pptxImages = new ImageExtractionConfig(
    extractImages: true,
    minWidth: 100,
    minHeight: 100,
    performOcr: false,
);
$pptxConfig = new ExtractionConfig(imageExtraction: $pptxImages);

// Image files: 50x50 minimum, OCR on via Tesseract/English.
$standaloneImages = new ImageExtractionConfig(
    extractImages: true,
    performOcr: true,
    minWidth: 50,
    minHeight: 50,
);
$standaloneOcr = new OcrConfig(backend: 'tesseract', language: 'eng');
$imageConfig = new ExtractionConfig(
    imageExtraction: $standaloneImages,
    ocr: $standaloneOcr,
);

echo "PDF Configuration:\n",
    "- Min size: 300x300 (larger images only)\n",
    "- OCR: Disabled (PDFs have embedded text)\n\n";

echo "PowerPoint Configuration:\n",
    "- Min size: 100x100 (include icons/logos)\n",
    "- OCR: Disabled\n\n";

echo "Image File Configuration:\n",
    "- Min size: 50x50 (all images)\n",
    "- OCR: Enabled\n\n";
|
||||
|
||||
echo "Example 8: Complete Image Processing Pipeline\n";
echo "=============================================\n";

// Extraction of images >= 200x200 with OCR run on each one.
$config8 = new ExtractionConfig(
    imageExtraction: new ImageExtractionConfig(
        extractImages: true,
        performOcr: true,
        minWidth: 200,
        minHeight: 200,
    ),
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
);

$result8 = (new Kreuzberg($config8))->extractFile('mixed_content.pdf');

if ($result8->images !== null) {
    echo "Extracted images: " . count($result8->images) . "\n\n";

    foreach ($result8->images as $i => $image) {
        echo "Processing Image {$i}:\n";

        $isValid = $image->width >= 200 && $image->height >= 200;
        echo "- Valid size: " . ($isValid ? 'Yes' : 'No') . "\n";

        // Step 1: write the image itself. Strict base64 decoding returns false
        // on invalid input, and "Saved" is reported only after a successful write.
        $filename = "image_{$i}.{$image->format}";
        $imageData = base64_decode($image->data, true);
        if ($imageData !== false && file_put_contents($filename, $imageData) !== false) {
            echo "- Saved: {$filename}\n";
        } else {
            echo "- Could not save: {$filename}\n";
        }

        // Step 2: write the OCR text, if there is any.
        if ($image->ocrResult !== null) {
            $ocrText = trim($image->ocrResult->content);
            // Compare with '' rather than using empty(): empty('0') is true,
            // so OCR text consisting of "0" would otherwise be dropped.
            if ($ocrText !== '') {
                // mb_strlen counts characters (assumes UTF-8); strlen counts bytes.
                echo "- OCR text available: " . mb_strlen($ocrText, 'UTF-8') . " characters\n";
                file_put_contents("image_{$i}_ocr.txt", $ocrText);
            }
        }

        // Step 3: write a small JSON sidecar describing the image.
        $metadata = [
            'format' => $image->format,
            'width' => $image->width,
            'height' => $image->height,
            'page' => $image->pageNumber,
            'aspect_ratio' => round($image->width / $image->height, 2),
        ];
        // JSON_THROW_ON_ERROR surfaces encoding failures instead of letting
        // json_encode() return false and an empty file be written.
        file_put_contents(
            "image_{$i}_metadata.json",
            json_encode($metadata, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR),
        );

        echo "- Metadata saved\n\n";
    }
}
|
||||
|
||||
// Reference list of the ImageExtractionConfig parameters used above.
echo
    "\nImage Extraction Configuration Parameters:\n",
    "==========================================\n",
    "- extractImages: Enable image extraction (default: false)\n",
    "- performOcr: Run OCR on extracted images (default: false)\n",
    "- minWidth: Minimum image width in pixels (default: 100)\n",
    "- minHeight: Minimum image height in pixels (default: 100)\n";

// Usage recommendations.
echo
    "\n\nBest Practices:\n",
    "===============\n",
    "- Set minWidth/minHeight to filter out unwanted small images\n",
    "- Use 200x200 as a good default for meaningful images\n",
    "- Use 800x600+ for large photos and diagrams only\n",
    "- Use 50x50 to include all images including icons\n",
    "- Enable performOcr only when images contain text\n",
    "- Combine with OCR config for multilingual text in images\n",
    "- Save images to disk for further processing\n";

// Typical parameter combinations.
echo
    "\n\nCommon Use Cases:\n",
    "=================\n",
    "1. Extract photos from reports: minWidth=800, minHeight=600\n",
    "2. Extract all graphics: minWidth=100, minHeight=100\n",
    "3. OCR on images: performOcr=true + OcrConfig\n",
    "4. Extract logos/icons: minWidth=50, minHeight=50\n";
|
||||
```
|
||||
302
docs/snippets/php/configuration/image_preprocessing_config.php
Normal file
302
docs/snippets/php/configuration/image_preprocessing_config.php
Normal file
@@ -0,0 +1,302 @@
|
||||
```php title="image_preprocessing_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Image Preprocessing Configuration
|
||||
*
|
||||
* This example demonstrates image preprocessing options to improve OCR accuracy.
|
||||
* Preprocessing can significantly enhance text recognition quality for poor-quality scans.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\ImagePreprocessingConfig;
|
||||
|
||||
echo "Example 1: Default Image Preprocessing\n";
|
||||
echo "======================================\n";
|
||||
|
||||
$config1 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig()
|
||||
)
|
||||
);
|
||||
|
||||
echo "Default preprocessing settings:\n";
|
||||
echo "- Target DPI: 300 (standard for OCR)\n";
|
||||
echo "- Auto-rotate: Enabled\n";
|
||||
echo "- Denoise: Disabled\n\n";
|
||||
|
||||
echo "Example 2: High DPI Configuration\n";
|
||||
echo "=================================\n";
|
||||
|
||||
$config2 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 600
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Target DPI: 600\n";
|
||||
echo "Best for:\n";
|
||||
echo "- Very small text\n";
|
||||
echo "- High-quality scans\n";
|
||||
echo "- Documents with fine details\n";
|
||||
echo "Note: Higher DPI = slower processing, more memory\n\n";
|
||||
|
||||
echo "Example 3: Lower DPI for Speed\n";
|
||||
echo "==============================\n";
|
||||
|
||||
$config3 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 150
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Target DPI: 150\n";
|
||||
echo "Best for:\n";
|
||||
echo "- Large text\n";
|
||||
echo "- Low-resolution images\n";
|
||||
echo "- Fast processing needed\n";
|
||||
echo "Note: May reduce accuracy for small text\n\n";
|
||||
|
||||
echo "Example 4: Manual Rotation Control\n";
|
||||
echo "==================================\n";
|
||||
|
||||
$config4 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
autoRotate: false
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Auto-rotate: Disabled\n";
|
||||
echo "Use when:\n";
|
||||
echo "- Images are already correctly oriented\n";
|
||||
echo "- Auto-rotation causes issues\n";
|
||||
echo "- Processing time is critical\n\n";
|
||||
|
||||
echo "Example 5: Denoising for Poor Quality Scans\n";
|
||||
echo "===========================================\n";
|
||||
|
||||
$config5 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 300,
|
||||
autoRotate: true,
|
||||
denoise: true
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config5);
|
||||
$result = $kreuzberg->extractFile('noisy_scan.pdf');
|
||||
|
||||
echo "Denoising: Enabled\n";
|
||||
echo "Best for:\n";
|
||||
echo "- Poor quality scans\n";
|
||||
echo "- Fax documents\n";
|
||||
echo "- Images with background noise\n";
|
||||
echo "- Old or damaged documents\n";
|
||||
echo "\nExtracted text length: " . strlen($result->content) . " characters\n\n";
|
||||
|
||||
echo "Example 6: Maximum Quality Configuration\n";
|
||||
echo "========================================\n";
|
||||
|
||||
$config6 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 600,
|
||||
autoRotate: true,
|
||||
denoise: true
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Maximum quality preprocessing:\n";
|
||||
echo "- Target DPI: 600 (high quality)\n";
|
||||
echo "- Auto-rotate: Enabled\n";
|
||||
echo "- Denoise: Enabled\n";
|
||||
echo "\nBest for:\n";
|
||||
echo "- Very poor quality scans\n";
|
||||
echo "- Historical documents\n";
|
||||
echo "- Faded or damaged text\n";
|
||||
echo "- Critical accuracy requirements\n\n";
|
||||
|
||||
echo "Example 7: Fast Processing Configuration\n";
|
||||
echo "========================================\n";
|
||||
|
||||
$config7 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 200,
|
||||
autoRotate: false,
|
||||
denoise: false
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Fast processing configuration:\n";
|
||||
echo "- Target DPI: 200 (faster)\n";
|
||||
echo "- Auto-rotate: Disabled\n";
|
||||
echo "- Denoise: Disabled\n";
|
||||
echo "\nBest for:\n";
|
||||
echo "- High-volume processing\n";
|
||||
echo "- Good quality source images\n";
|
||||
echo "- Performance-critical applications\n\n";
|
||||
|
||||
echo "Example 8: DPI Recommendations by Document Type\n";
|
||||
echo "===============================================\n";
|
||||
|
||||
$standardConfig = new ImagePreprocessingConfig(targetDpi: 300);
|
||||
echo "Standard documents (letters, reports): 300 DPI\n";
|
||||
|
||||
$newspaperConfig = new ImagePreprocessingConfig(targetDpi: 400);
|
||||
echo "Newspapers and magazines: 400 DPI\n";
|
||||
|
||||
$bookConfig = new ImagePreprocessingConfig(targetDpi: 600);
|
||||
echo "Books with small text: 600 DPI\n";
|
||||
|
||||
$receiptConfig = new ImagePreprocessingConfig(targetDpi: 300);
|
||||
echo "Receipts and forms: 300 DPI\n";
|
||||
|
||||
$businessCardConfig = new ImagePreprocessingConfig(targetDpi: 400);
|
||||
echo "Business cards: 400 DPI\n";
|
||||
|
||||
$faxConfig = new ImagePreprocessingConfig(
|
||||
targetDpi: 300,
|
||||
denoise: true
|
||||
);
|
||||
echo "Faxes: 300 DPI + denoising\n\n";
|
||||
|
||||
echo "Example 9: Adaptive Configuration by Image Quality\n";
|
||||
echo "==================================================\n";
|
||||
|
||||
/**
 * Select image preprocessing settings for a named scan-quality level.
 *
 * Recognised levels are 'excellent', 'good', 'fair' and 'poor'. Any other
 * value falls through to the default arm and yields an
 * ImagePreprocessingConfig constructed with no arguments.
 *
 * @param string $quality Quality label to look up (compared strictly by `match`).
 * @return ImagePreprocessingConfig Settings for the given quality level.
 */
function getPreprocessingConfig(string $quality): ImagePreprocessingConfig
|
||||
{
|
||||
return match ($quality) {
|
||||
'excellent' => new ImagePreprocessingConfig(
|
||||
targetDpi: 300,
|
||||
autoRotate: false,
|
||||
denoise: false
|
||||
),
|
||||
'good' => new ImagePreprocessingConfig(
|
||||
targetDpi: 300,
|
||||
autoRotate: true,
|
||||
denoise: false
|
||||
),
|
||||
'fair' => new ImagePreprocessingConfig(
|
||||
targetDpi: 400,
|
||||
autoRotate: true,
|
||||
denoise: true
|
||||
),
|
||||
'poor' => new ImagePreprocessingConfig(
|
||||
targetDpi: 600,
|
||||
autoRotate: true,
|
||||
denoise: true
|
||||
),
|
||||
default => new ImagePreprocessingConfig(),
|
||||
};
|
||||
}
|
||||
|
||||
echo "Quality-based configurations:\n\n";
|
||||
|
||||
echo "Excellent Quality:\n";
|
||||
echo "- DPI: 300, Auto-rotate: No, Denoise: No\n";
|
||||
echo "- Clean scans, properly oriented\n\n";
|
||||
|
||||
echo "Good Quality:\n";
|
||||
echo "- DPI: 300, Auto-rotate: Yes, Denoise: No\n";
|
||||
echo "- May need rotation correction\n\n";
|
||||
|
||||
echo "Fair Quality:\n";
|
||||
echo "- DPI: 400, Auto-rotate: Yes, Denoise: Yes\n";
|
||||
echo "- Some noise or quality issues\n\n";
|
||||
|
||||
echo "Poor Quality:\n";
|
||||
echo "- DPI: 600, Auto-rotate: Yes, Denoise: Yes\n";
|
||||
echo "- Significant quality problems\n\n";
|
||||
|
||||
echo "Example 10: Complete OCR Pipeline with Preprocessing\n";
|
||||
echo "===================================================\n";
|
||||
|
||||
$config10 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
imagePreprocessing: new ImagePreprocessingConfig(
|
||||
targetDpi: 300,
|
||||
autoRotate: true,
|
||||
denoise: true
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
$result10 = (new Kreuzberg($config10))->extractFile('poor_quality_scan.pdf');
|
||||
|
||||
echo "Processing pipeline:\n";
|
||||
echo "1. Load image\n";
|
||||
echo "2. Auto-detect orientation and rotate if needed\n";
|
||||
echo "3. Upscale/downscale to 300 DPI\n";
|
||||
echo "4. Apply denoising filter\n";
|
||||
echo "5. Perform OCR\n";
|
||||
echo "\nResults:\n";
|
||||
echo "- Extracted text: " . strlen($result10->content) . " characters\n";
|
||||
echo "- Pages: " . ($result10->metadata->pageCount ?? 'N/A') . "\n";
|
||||
|
||||
echo "\n\nImage Preprocessing Parameters:\n";
|
||||
echo "================================\n";
|
||||
echo "- targetDpi: Target resolution in dots per inch\n";
|
||||
echo " * 150 DPI: Fast, lower quality\n";
|
||||
echo " * 300 DPI: Standard, good balance (RECOMMENDED)\n";
|
||||
echo " * 400 DPI: Better for small text\n";
|
||||
echo " * 600 DPI: Best quality, slower\n";
|
||||
echo "\n";
|
||||
echo "- autoRotate: Automatically detect and correct orientation\n";
|
||||
echo " * true: Recommended for most cases\n";
|
||||
echo " * false: Skip if images are already oriented\n";
|
||||
echo "\n";
|
||||
echo "- denoise: Apply noise reduction filter\n";
|
||||
echo " * true: Recommended for poor quality scans\n";
|
||||
echo " * false: Skip for clean images (faster)\n";
|
||||
|
||||
echo "\n\nBest Practices:\n";
|
||||
echo "===============\n";
|
||||
echo "1. Start with 300 DPI as a baseline\n";
|
||||
echo "2. Enable auto-rotate unless you know images are correct\n";
|
||||
echo "3. Enable denoising for poor quality documents\n";
|
||||
echo "4. Use higher DPI (400-600) for small text\n";
|
||||
echo "5. Use lower DPI (150-200) when speed is critical\n";
|
||||
echo "6. Test different settings to find optimal balance\n";
|
||||
echo "7. Consider source quality when choosing settings\n";
|
||||
echo "8. Remember: Higher quality = slower processing + more memory\n";
|
||||
|
||||
echo "\n\nPerformance vs Quality Trade-offs:\n";
|
||||
echo "==================================\n";
|
||||
echo "Fastest: DPI=150, AutoRotate=No, Denoise=No\n";
|
||||
echo "Balanced: DPI=300, AutoRotate=Yes, Denoise=No (RECOMMENDED)\n";
|
||||
echo "Quality: DPI=400, AutoRotate=Yes, Denoise=Yes\n";
|
||||
echo "Maximum: DPI=600, AutoRotate=Yes, Denoise=Yes\n";
|
||||
```
|
||||
115
docs/snippets/php/configuration/keyword_config.php
Normal file
115
docs/snippets/php/configuration/keyword_config.php
Normal file
@@ -0,0 +1,115 @@
|
||||
```php title="keyword_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* KeywordConfig - Keyword Extraction
|
||||
*
|
||||
* Automatically extract keywords and key phrases from documents.
|
||||
* Useful for document categorization, search indexing, and summarization.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\KeywordConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
keyword: new KeywordConfig(
|
||||
maxKeywords: 10,
|
||||
minScore: 0.0,
|
||||
language: 'en'
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('article.pdf');
|
||||
|
||||
echo "Top Keywords:\n";
|
||||
echo str_repeat('=', 40) . "\n";
|
||||
foreach ($result->metadata->keywords ?? [] as $keyword) {
|
||||
echo " • $keyword\n";
|
||||
}
|
||||
echo "\n";
|
||||
|
||||
$detailedConfig = new ExtractionConfig(
|
||||
keyword: new KeywordConfig(
|
||||
maxKeywords: 25,
|
||||
minScore: 0.0,
|
||||
language: 'en'
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($detailedConfig);
|
||||
$result = $kreuzberg->extractFile('research_paper.pdf');
|
||||
|
||||
echo "Detailed keyword analysis:\n";
|
||||
echo "Total keywords: " . count($result->metadata->keywords ?? []) . "\n";
|
||||
|
||||
if (!empty($result->metadata->keywords)) {
|
||||
$grouped = [];
|
||||
foreach ($result->metadata->keywords as $keyword) {
|
||||
// Group by the first *character*, not the first byte: byte indexing would split
// multibyte UTF-8 keywords into an invalid group key.
$first = mb_strtoupper(mb_substr($keyword, 0, 1));
|
||||
if (!isset($grouped[$first])) {
|
||||
$grouped[$first] = [];
|
||||
}
|
||||
$grouped[$first][] = $keyword;
|
||||
}
|
||||
|
||||
foreach ($grouped as $letter => $keywords) {
|
||||
echo "\n$letter:\n";
|
||||
foreach ($keywords as $keyword) {
|
||||
echo " - $keyword\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
|
||||
$allKeywords = [];
|
||||
|
||||
foreach ($files as $file) {
|
||||
if (!file_exists($file)) continue;
|
||||
|
||||
$result = $kreuzberg->extractFile($file);
|
||||
foreach ($result->metadata->keywords ?? [] as $keyword) {
|
||||
if (!isset($allKeywords[$keyword])) {
|
||||
$allKeywords[$keyword] = 0;
|
||||
}
|
||||
$allKeywords[$keyword]++;
|
||||
}
|
||||
}
|
||||
|
||||
arsort($allKeywords);
|
||||
echo "\n\nMost common keywords across documents:\n";
|
||||
$count = 0;
|
||||
foreach ($allKeywords as $keyword => $frequency) {
|
||||
if ($count++ >= 10) break;
|
||||
echo sprintf(" %2d. %-30s (appears in %d documents)\n",
|
||||
$count, $keyword, $frequency);
|
||||
}
|
||||
|
||||
$categoryKeywords = [
|
||||
'technology' => ['software', 'computer', 'algorithm', 'data', 'system'],
|
||||
'business' => ['market', 'revenue', 'sales', 'customer', 'profit'],
|
||||
'science' => ['research', 'experiment', 'hypothesis', 'analysis', 'study'],
|
||||
];
|
||||
|
||||
$docKeywords = $result->metadata->keywords ?? [];
|
||||
$scores = [];
|
||||
|
||||
foreach ($categoryKeywords as $category => $terms) {
|
||||
$score = 0;
|
||||
foreach ($terms as $term) {
|
||||
if (in_array($term, $docKeywords, true)) {
|
||||
$score++;
|
||||
}
|
||||
}
|
||||
$scores[$category] = $score;
|
||||
}
|
||||
|
||||
arsort($scores);
|
||||
$topCategory = array_key_first($scores);
|
||||
echo "\nDocument category: $topCategory (score: {$scores[$topCategory]})\n";
|
||||
```
|
||||
@@ -0,0 +1,97 @@
|
||||
```php title="language_detection_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* LanguageDetectionConfig - Language Detection
|
||||
*
|
||||
* Automatically detect the languages present in a document.
|
||||
* Useful for multilingual documents and routing to appropriate OCR languages.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\LanguageDetectionConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
languageDetection: new LanguageDetectionConfig(
|
||||
enabled: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('multilingual.pdf');
|
||||
|
||||
echo "Detected languages:\n";
|
||||
foreach ($result->detectedLanguages ?? [] as $lang) {
|
||||
echo " - $lang\n";
|
||||
}
|
||||
echo "\n";
|
||||
|
||||
$advancedConfig = new ExtractionConfig(
|
||||
languageDetection: new LanguageDetectionConfig(
|
||||
enabled: true,
|
||||
maxLanguages: 3,
|
||||
confidenceThreshold: 0.8
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($advancedConfig);
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
if (!empty($result->detectedLanguages)) {
|
||||
echo "High-confidence languages detected:\n";
|
||||
echo implode(', ', $result->detectedLanguages) . "\n\n";
|
||||
} else {
|
||||
echo "No languages detected with sufficient confidence\n\n";
|
||||
}
|
||||
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
$detectConfig = new ExtractionConfig(
|
||||
languageDetection: new LanguageDetectionConfig(enabled: true)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($detectConfig);
|
||||
$result = $kreuzberg->extractFile('scanned.pdf');
|
||||
|
||||
if (!empty($result->detectedLanguages)) {
|
||||
$primaryLanguage = $result->detectedLanguages[0];
|
||||
echo "Primary language detected: $primaryLanguage\n";
|
||||
echo "Re-processing with OCR optimized for $primaryLanguage...\n";
|
||||
|
||||
$ocrConfig = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: $primaryLanguage
|
||||
)
|
||||
);
|
||||
|
||||
// Use a dedicated instance for the OCR pass: $kreuzberg (language detection
// enabled) is reused by the batch loop further down and must not be replaced
// by an OCR-only configuration.
$ocrKreuzberg = new Kreuzberg($ocrConfig);
|
||||
$result = $ocrKreuzberg->extractFile('scanned.pdf');
|
||||
echo "OCR extraction complete\n";
|
||||
}
|
||||
|
||||
$files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
|
||||
$languageMap = [];
|
||||
|
||||
foreach ($files as $file) {
|
||||
if (!file_exists($file)) continue;
|
||||
|
||||
$result = $kreuzberg->extractFile($file);
|
||||
$lang = $result->detectedLanguages[0] ?? 'unknown';
|
||||
|
||||
if (!isset($languageMap[$lang])) {
|
||||
$languageMap[$lang] = [];
|
||||
}
|
||||
$languageMap[$lang][] = $file;
|
||||
}
|
||||
|
||||
echo "\nDocuments grouped by language:\n";
|
||||
foreach ($languageMap as $lang => $docs) {
|
||||
echo "$lang: " . implode(', ', $docs) . "\n";
|
||||
}
|
||||
```
|
||||
205
docs/snippets/php/configuration/ocr_config.php
Normal file
205
docs/snippets/php/configuration/ocr_config.php
Normal file
@@ -0,0 +1,205 @@
|
||||
```php title="ocr_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* OCR Configuration
|
||||
*
|
||||
* This example demonstrates how to configure OCR (Optical Character Recognition)
|
||||
* for extracting text from scanned documents and images.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\TesseractConfig;
|
||||
|
||||
echo "Example 1: Basic OCR Configuration\n";
|
||||
echo "==================================\n";
|
||||
|
||||
$config1 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng'
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config1);
|
||||
$result = $kreuzberg->extractFile('scanned_document.pdf');
|
||||
echo "Extracted text length: " . strlen($result->content) . " characters\n\n";
|
||||
|
||||
echo "Example 2: Multi-Language OCR\n";
|
||||
echo "=============================\n";
|
||||
|
||||
$config2 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng+fra+deu'
|
||||
)
|
||||
);
|
||||
|
||||
echo "Configured for languages: English, French, German\n";
|
||||
echo "Use this for multilingual documents\n\n";
|
||||
|
||||
echo "Example 3: Language-Specific OCR\n";
|
||||
echo "================================\n";
|
||||
|
||||
$config3a = new ExtractionConfig(
|
||||
ocr: new OcrConfig(backend: 'tesseract', language: 'spa')
|
||||
);
|
||||
|
||||
$config3b = new ExtractionConfig(
|
||||
ocr: new OcrConfig(backend: 'tesseract', language: 'fra')
|
||||
);
|
||||
|
||||
$config3c = new ExtractionConfig(
|
||||
ocr: new OcrConfig(backend: 'tesseract', language: 'deu')
|
||||
);
|
||||
|
||||
$config3d = new ExtractionConfig(
|
||||
ocr: new OcrConfig(backend: 'tesseract', language: 'chi_sim')
|
||||
);
|
||||
|
||||
$config3e = new ExtractionConfig(
|
||||
ocr: new OcrConfig(backend: 'tesseract', language: 'chi_tra')
|
||||
);
|
||||
|
||||
$config3f = new ExtractionConfig(
|
||||
ocr: new OcrConfig(backend: 'tesseract', language: 'jpn')
|
||||
);
|
||||
|
||||
$config3g = new ExtractionConfig(
|
||||
ocr: new OcrConfig(backend: 'tesseract', language: 'kor')
|
||||
);
|
||||
|
||||
$config3h = new ExtractionConfig(
|
||||
ocr: new OcrConfig(backend: 'tesseract', language: 'ara')
|
||||
);
|
||||
|
||||
echo "Common Tesseract Language Codes:\n";
|
||||
echo "- eng: English\n";
|
||||
echo "- fra: French\n";
|
||||
echo "- deu: German\n";
|
||||
echo "- spa: Spanish\n";
|
||||
echo "- ita: Italian\n";
|
||||
echo "- por: Portuguese\n";
|
||||
echo "- rus: Russian\n";
|
||||
echo "- chi_sim: Chinese (Simplified)\n";
|
||||
echo "- chi_tra: Chinese (Traditional)\n";
|
||||
echo "- jpn: Japanese\n";
|
||||
echo "- kor: Korean\n";
|
||||
echo "- ara: Arabic\n\n";
|
||||
|
||||
echo "Example 4: Advanced Tesseract Configuration\n";
|
||||
echo "==========================================\n";
|
||||
|
||||
$config4 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
oem: 3,
|
||||
enableTableDetection: true
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Tesseract Configuration:\n";
|
||||
echo "- PSM (Page Segmentation Mode): 6 (uniform text block)\n";
|
||||
echo "- OEM (OCR Engine Mode): 3 (default, based on what is available)\n";
|
||||
echo "- Table Detection: Enabled\n\n";
|
||||
|
||||
echo "Example 5: OCR for Forms and Invoices\n";
|
||||
echo "=====================================\n";
|
||||
|
||||
$config5 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
oem: 3,
|
||||
enableTableDetection: true,
|
||||
tesseditCharWhitelist: '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz$.,- '
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Optimized for forms and invoices:\n";
|
||||
echo "- Table detection enabled\n";
|
||||
echo "- Character whitelist for common form characters\n\n";
|
||||
|
||||
echo "Example 6: OCR for Numeric Documents\n";
|
||||
echo "====================================\n";
|
||||
|
||||
$config6 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
oem: 3,
|
||||
tesseditCharWhitelist: '0123456789$.,- '
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Character whitelist: '0123456789$.,- '\n";
|
||||
echo "Best for: Invoices, receipts, financial documents\n\n";
|
||||
|
||||
echo "Example 7: OCR with Character Blacklist\n";
|
||||
echo "=======================================\n";
|
||||
|
||||
$config7 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
oem: 3,
|
||||
tesseditCharBlacklist: '|!@#%^&*()'
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Character blacklist: '|!@#%^&*()'\n";
|
||||
echo "Use to exclude problematic characters\n\n";
|
||||
|
||||
echo "\nPage Segmentation Modes (PSM):\n";
|
||||
echo "==============================\n";
|
||||
echo "0 = Orientation and script detection (OSD) only\n";
|
||||
echo "1 = Automatic page segmentation with OSD\n";
|
||||
echo "2 = Automatic page segmentation (no OSD or OCR)\n";
|
||||
echo "3 = Fully automatic page segmentation (default)\n";
|
||||
echo "4 = Assume a single column of text of variable sizes\n";
|
||||
echo "5 = Assume a single uniform block of vertically aligned text\n";
|
||||
echo "6 = Assume a single uniform block of text (recommended for most)\n";
|
||||
echo "7 = Treat the image as a single text line\n";
|
||||
echo "8 = Treat the image as a single word\n";
|
||||
echo "9 = Treat the image as a single word in a circle\n";
|
||||
echo "10 = Treat the image as a single character\n";
|
||||
echo "11 = Sparse text. Find as much text as possible\n";
|
||||
echo "12 = Sparse text with OSD\n";
|
||||
echo "13 = Raw line. Treat the image as a single text line\n";
|
||||
|
||||
echo "\n\nOCR Engine Modes (OEM):\n";
|
||||
echo "======================\n";
|
||||
echo "0 = Legacy engine only\n";
|
||||
echo "1 = Neural nets LSTM engine only\n";
|
||||
echo "2 = Legacy + LSTM engines\n";
|
||||
echo "3 = Default, based on what is available (recommended)\n";
|
||||
|
||||
echo "\n\nBest Practices:\n";
|
||||
echo "===============\n";
|
||||
echo "- Use PSM 6 for general documents\n";
|
||||
echo "- Use PSM 11 for sparse text (screenshots, signs)\n";
|
||||
echo "- Use OEM 3 (default) for best results\n";
|
||||
echo "- Enable table detection for structured documents\n";
|
||||
echo "- Use character whitelists for forms/invoices\n";
|
||||
echo "- Combine multiple languages with '+' separator\n";
|
||||
echo "- Preprocess images for better accuracy (see image_preprocessing.php)\n";
|
||||
```
|
||||
82
docs/snippets/php/configuration/page_config.php
Normal file
82
docs/snippets/php/configuration/page_config.php
Normal file
@@ -0,0 +1,82 @@
|
||||
```php title="page_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* PageConfig - Page-Level Extraction
|
||||
*
|
||||
* Configure per-page content extraction and page markers for maintaining
|
||||
* document structure in the extracted text.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\PageConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
page: new PageConfig(
|
||||
extractPages: false,
|
||||
insertPageMarkers: true,
|
||||
markerFormat: '--- Page {page_number} ---'
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('report.pdf');
|
||||
|
||||
echo "Content with page markers:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo $result->content . "\n\n";
|
||||
|
||||
$pageConfig = new ExtractionConfig(
|
||||
page: new PageConfig(
|
||||
extractPages: true,
|
||||
insertPageMarkers: false
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($pageConfig);
|
||||
$result = $kreuzberg->extractFile('multi_page.pdf');
|
||||
|
||||
foreach ($result->pages ?? [] as $page) {
|
||||
echo "Page {$page->pageNumber}:\n";
|
||||
echo str_repeat('-', 60) . "\n";
|
||||
echo substr($page->content, 0, 200) . "...\n";
|
||||
echo "Tables on page: " . count($page->tables) . "\n";
|
||||
echo "Images on page: " . count($page->images) . "\n\n";
|
||||
}
|
||||
|
||||
$customConfig = new ExtractionConfig(
|
||||
page: new PageConfig(
|
||||
extractPages: false,
|
||||
insertPageMarkers: true,
|
||||
markerFormat: "\n\n========== PAGE {page_number} ==========\n\n"
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($customConfig);
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
$pages = preg_split('/={10} PAGE \d+ ={10}/', $result->content);
|
||||
echo "Split into " . count($pages) . " sections\n";
|
||||
|
||||
$allPagesConfig = new ExtractionConfig(
|
||||
page: new PageConfig(extractPages: true)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($allPagesConfig);
|
||||
$result = $kreuzberg->extractFile('large_doc.pdf');
|
||||
|
||||
$selectedPages = array_filter(
|
||||
$result->pages ?? [],
|
||||
fn($page) => $page->pageNumber >= 10 && $page->pageNumber <= 20
|
||||
);
|
||||
|
||||
echo "\nSelected pages 10-20:\n";
|
||||
foreach ($selectedPages as $page) {
|
||||
echo "Page {$page->pageNumber}: " . strlen($page->content) . " chars\n";
|
||||
}
|
||||
```
|
||||
70
docs/snippets/php/configuration/pdf_config.php
Normal file
70
docs/snippets/php/configuration/pdf_config.php
Normal file
@@ -0,0 +1,70 @@
|
||||
```php title="pdf_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* PdfConfig - PDF-Specific Configuration
|
||||
*
|
||||
* Configure PDF extraction behavior including image quality, text extraction
|
||||
* methods, and performance optimization.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\PdfConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
pdf: new PdfConfig(
|
||||
extractImages: true,
|
||||
imageQuality: 85,
|
||||
preserveImageFormat: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
echo "PDF extraction complete\n";
|
||||
echo "Images extracted: " . count($result->images ?? []) . "\n\n";
|
||||
|
||||
$highQualityConfig = new ExtractionConfig(
|
||||
pdf: new PdfConfig(
|
||||
extractImages: true,
|
||||
imageQuality: 100,
|
||||
preserveImageFormat: true
|
||||
),
|
||||
extractImages: true
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($highQualityConfig);
|
||||
$result = $kreuzberg->extractFile('presentation.pdf');
|
||||
|
||||
foreach ($result->images ?? [] as $image) {
|
||||
$filename = sprintf('image_%d_page_%d.%s',
|
||||
$image->imageIndex,
|
||||
$image->pageNumber,
|
||||
$image->format
|
||||
);
|
||||
file_put_contents($filename, $image->data);
|
||||
echo "Saved high-quality image: $filename ({$image->width}x{$image->height})\n";
|
||||
}
|
||||
|
||||
$fastConfig = new ExtractionConfig(
|
||||
pdf: new PdfConfig(
|
||||
extractImages: false,
|
||||
imageQuality: 50
|
||||
),
|
||||
extractTables: false
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($fastConfig);
|
||||
$start = microtime(true);
|
||||
$result = $kreuzberg->extractFile('large_document.pdf');
|
||||
$elapsed = microtime(true) - $start;
|
||||
|
||||
echo "\nFast extraction completed in " . number_format($elapsed, 3) . " seconds\n";
|
||||
echo "Content length: " . strlen($result->content) . " characters\n";
|
||||
```
|
||||
71
docs/snippets/php/configuration/pdf_hierarchy_config.php
Normal file
71
docs/snippets/php/configuration/pdf_hierarchy_config.php
Normal file
@@ -0,0 +1,71 @@
|
||||
```php title="pdf_hierarchy_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* PdfHierarchyConfig - Hierarchy Detection Configuration
|
||||
*
|
||||
* Configure PDF document structure analysis and hierarchy detection
|
||||
* using k-clustering for document organization recognition.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\PdfConfig;
|
||||
|
||||
// Hierarchy detection in PDF options array
|
||||
$config = new ExtractionConfig(
|
||||
pdf: new PdfConfig(
|
||||
extractImages: true,
|
||||
hierarchy: [
|
||||
'enabled' => true,
|
||||
'k_clusters' => 6,
|
||||
'include_bbox' => true,
|
||||
'ocr_coverage_threshold' => 0.8
|
||||
]
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
echo "Hierarchy detection enabled\n";
|
||||
echo "Content length: " . strlen($result->content) . " characters\n";
|
||||
|
||||
// Alternative: Custom hierarchy parameters for complex documents
|
||||
$advancedConfig = new ExtractionConfig(
|
||||
pdf: new PdfConfig(
|
||||
extractImages: true,
|
||||
hierarchy: [
|
||||
'enabled' => true,
|
||||
'k_clusters' => 12, // More clusters for detailed hierarchy
|
||||
'include_bbox' => true, // Include bounding box coordinates
|
||||
'ocr_coverage_threshold' => 0.7 // Lower OCR coverage threshold than the 0.8 used in the first example
|
||||
]
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($advancedConfig);
|
||||
$result = $kreuzberg->extractFile('complex_document.pdf');
|
||||
|
||||
echo "Advanced hierarchy detection completed\n";
|
||||
echo "Detected structure preserved in output\n";
|
||||
|
||||
// Disabling hierarchy detection for speed
|
||||
$fastConfig = new ExtractionConfig(
|
||||
pdf: new PdfConfig(
|
||||
extractImages: false,
|
||||
hierarchy: [
|
||||
'enabled' => false
|
||||
]
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($fastConfig);
|
||||
$result = $kreuzberg->extractFile('simple_document.pdf');
|
||||
|
||||
echo "Fast extraction without hierarchy detection\n";
|
||||
```
|
||||
313
docs/snippets/php/configuration/tesseract_config.php
Normal file
313
docs/snippets/php/configuration/tesseract_config.php
Normal file
@@ -0,0 +1,313 @@
|
||||
```php title="tesseract_config.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Tesseract OCR Configuration
|
||||
*
|
||||
* This example demonstrates advanced Tesseract OCR configuration options
|
||||
* for fine-tuning OCR performance and accuracy.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\TesseractConfig;
|
||||
|
||||
echo "Example 1: Default Tesseract Configuration\n";
|
||||
echo "==========================================\n";
|
||||
|
||||
$config1 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig()
|
||||
)
|
||||
);
|
||||
|
||||
echo "Default settings:\n";
|
||||
echo "- PSM: 3 (Fully automatic page segmentation)\n";
|
||||
echo "- OEM: 3 (Default, based on what's available)\n";
|
||||
echo "- Table Detection: Disabled\n\n";
|
||||
|
||||
echo "Example 2: Different Page Segmentation Modes\n";
|
||||
echo "============================================\n";
|
||||
|
||||
$config2a = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(psm: 6)
|
||||
)
|
||||
);
|
||||
|
||||
echo "PSM 6 - Uniform block of text:\n";
|
||||
echo "- Best for: Most documents, clean text blocks\n";
|
||||
echo "- Use when: Document has clear text structure\n\n";
|
||||
|
||||
$config2b = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(psm: 11)
|
||||
)
|
||||
);
|
||||
|
||||
echo "PSM 11 - Sparse text:\n";
|
||||
echo "- Best for: Screenshots, signs, sparse documents\n";
|
||||
echo "- Use when: Text is scattered across the image\n\n";
|
||||
|
||||
$config2c = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(psm: 7)
|
||||
)
|
||||
);
|
||||
|
||||
echo "PSM 7 - Single text line:\n";
|
||||
echo "- Best for: Single line of text, headers, captions\n";
|
||||
echo "- Use when: Processing individual text lines\n\n";
|
||||
|
||||
$config2d = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(psm: 8)
|
||||
)
|
||||
);
|
||||
|
||||
echo "PSM 8 - Single word:\n";
|
||||
echo "- Best for: Individual words, labels\n";
|
||||
echo "- Use when: Processing single words\n\n";
|
||||
|
||||
echo "Example 3: Table Detection\n";
|
||||
echo "=========================\n";
|
||||
|
||||
$config3 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
enableTableDetection: true
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config3);
|
||||
$result = $kreuzberg->extractFile('scanned_invoice.pdf');
|
||||
|
||||
echo "Table detection enabled\n";
|
||||
echo "Best for: Forms, invoices, spreadsheets, reports\n";
|
||||
|
||||
if (count($result->tables) > 0) {
|
||||
echo "\nExtracted tables: " . count($result->tables) . "\n";
|
||||
foreach ($result->tables as $i => $table) {
|
||||
echo "\nTable " . ($i + 1) . ":\n";
|
||||
echo $table->markdown . "\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\n\n";
|
||||
|
||||
echo "Example 4: Character Whitelisting\n";
|
||||
echo "=================================\n";
|
||||
|
||||
$config4a = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
tesseditCharWhitelist: '0123456789'
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Whitelist: '0123456789' (digits only)\n";
|
||||
echo "Best for: Serial numbers, IDs, numeric codes\n\n";
|
||||
|
||||
$config4b = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
tesseditCharWhitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Whitelist: Letters and numbers only\n";
|
||||
echo "Best for: Product codes, alphanumeric IDs\n\n";
|
||||
|
||||
$config4c = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
tesseditCharWhitelist: '0123456789$€£¥.,- '
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
// The dollar sign must be escaped here: in a double-quoted string PHP treats "$€£¥" as a
// variable reference (bytes 0x80-0xFF are valid identifier characters), which would raise
// an "Undefined variable" warning and drop the currency symbols from the printed text.
echo "Whitelist: '0123456789\$€£¥.,- ' (financial data)\n";
|
||||
echo "Best for: Invoices, receipts, price lists\n\n";
|
||||
|
||||
echo "Example 5: Character Blacklisting\n";
|
||||
echo "=================================\n";
|
||||
|
||||
$config5 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
tesseditCharBlacklist: '|!@#%^&*()'
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Blacklist: '|!@#%^&*()'\n";
|
||||
echo "Use to: Exclude problematic characters that cause OCR errors\n\n";
|
||||
|
||||
echo "Example 6: OCR Engine Modes\n";
|
||||
echo "===========================\n";
|
||||
|
||||
$config6a = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(oem: 0)
|
||||
)
|
||||
);
|
||||
|
||||
echo "OEM 0 - Legacy engine:\n";
|
||||
echo "- Older, simpler algorithm\n";
|
||||
echo "- Sometimes better for very low-quality scans\n\n";
|
||||
|
||||
$config6b = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(oem: 1)
|
||||
)
|
||||
);
|
||||
|
||||
echo "OEM 1 - LSTM neural network:\n";
|
||||
echo "- Modern deep learning approach\n";
|
||||
echo "- Better accuracy for most documents\n\n";
|
||||
|
||||
$config6c = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(oem: 3)
|
||||
)
|
||||
);
|
||||
|
||||
echo "OEM 3 - Default (recommended):\n";
|
||||
echo "- Chooses best available engine\n";
|
||||
echo "- Use this unless you have specific needs\n\n";
|
||||
|
||||
echo "Example 7: Complete Invoice Processing Configuration\n";
|
||||
echo "====================================================\n";
|
||||
|
||||
$config7 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
oem: 3,
|
||||
enableTableDetection: true,
|
||||
tesseditCharWhitelist: '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz$€£.,- :#/'
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Invoice-optimized configuration:\n";
|
||||
echo "- PSM 6: Structured text\n";
|
||||
echo "- Table detection: Enabled\n";
|
||||
echo "- Character whitelist: Alphanumeric + currency + common symbols\n";
|
||||
echo "- Best for: Invoices, receipts, financial documents\n\n";
|
||||
|
||||
echo "Example 8: Complete Form Processing Configuration\n";
|
||||
echo "=================================================\n";
|
||||
|
||||
$config8 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 6,
|
||||
oem: 3,
|
||||
enableTableDetection: true,
|
||||
tesseditCharWhitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,- @'
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Form-optimized configuration:\n";
|
||||
echo "- PSM 6: Structured text\n";
|
||||
echo "- Table detection: Enabled\n";
|
||||
echo "- Character whitelist: Alphanumeric + common form characters\n";
|
||||
echo "- Best for: Forms, applications, surveys\n\n";
|
||||
|
||||
echo "Example 9: Sparse Text Configuration\n";
|
||||
echo "====================================\n";
|
||||
|
||||
$config9 = new ExtractionConfig(
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng',
|
||||
tesseractConfig: new TesseractConfig(
|
||||
psm: 11,
|
||||
oem: 3
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
echo "Sparse text configuration:\n";
|
||||
echo "- PSM 11: Find scattered text\n";
|
||||
echo "- Best for: Screenshots, signs, posters, sparse documents\n\n";
|
||||
|
||||
echo "\nAll Page Segmentation Modes:\n";
|
||||
echo "============================\n";
|
||||
echo "0 = OSD only (orientation and script detection)\n";
|
||||
echo "1 = Automatic page segmentation with OSD\n";
|
||||
echo "2 = Automatic page segmentation (no OSD or OCR)\n";
|
||||
echo "3 = Fully automatic page segmentation (default)\n";
|
||||
echo "4 = Single column of variable-sized text\n";
|
||||
echo "5 = Single uniform block of vertically aligned text\n";
|
||||
echo "6 = Single uniform block of text (RECOMMENDED)\n";
|
||||
echo "7 = Single text line\n";
|
||||
echo "8 = Single word\n";
|
||||
echo "9 = Single word in a circle\n";
|
||||
echo "10 = Single character\n";
|
||||
echo "11 = Sparse text (RECOMMENDED for screenshots)\n";
|
||||
echo "12 = Sparse text with OSD\n";
|
||||
echo "13 = Raw line\n";
|
||||
|
||||
echo "\n\nOCR Engine Modes:\n";
|
||||
echo "=================\n";
|
||||
echo "0 = Legacy engine only\n";
|
||||
echo "1 = LSTM neural network only\n";
|
||||
echo "2 = Legacy + LSTM\n";
|
||||
echo "3 = Default (RECOMMENDED)\n";
|
||||
|
||||
echo "\n\nBest Practices:\n";
|
||||
echo "===============\n";
|
||||
// PSM 6 is this script's recommended starting point, not the default: the "Default settings"
// section and the PSM table printed earlier both state that the default PSM is 3.
echo "1. Start with PSM 6 and OEM 3 (recommended)\n";
|
||||
echo "2. Use PSM 11 for sparse/scattered text\n";
|
||||
echo "3. Enable table detection for structured documents\n";
|
||||
echo "4. Use character whitelists for constrained input\n";
|
||||
echo "5. Use blacklists to exclude problem characters\n";
|
||||
echo "6. Test different PSM values if accuracy is poor\n";
|
||||
echo "7. Combine with image preprocessing for better results\n";
|
||||
```
|
||||
139
docs/snippets/php/docker/usage.php
Normal file
139
docs/snippets/php/docker/usage.php
Normal file
@@ -0,0 +1,139 @@
|
||||
```php title="usage.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Docker Kreuzberg Client
|
||||
*
|
||||
* This example demonstrates how to interact with Kreuzberg running in a Docker container.
|
||||
* It shows how to start a container, extract content from files via the API, and cleanup.
|
||||
*/
|
||||
|
||||
/**
 * Minimal client for a Kreuzberg API server running in a Docker container.
 *
 * Controls the container lifecycle through the `docker` command-line tool and
 * uploads files to the extraction endpoint over HTTP using cURL.
 */
class DockerKreuzbergClient
{
    private string $containerName;
    private string $containerImage;
    private int $apiPort;
    private string $apiUrl;

    /**
     * @param string $containerName  Name assigned to the container (`docker run --name`)
     * @param string $containerImage Image to start
     * @param int    $apiPort        Host port published to the container's port 8000
     */
    public function __construct(
        string $containerName = 'kreuzberg-api',
        string $containerImage = 'kreuzberg:latest',
        int $apiPort = 8000
    ) {
        $this->containerName = $containerName;
        $this->containerImage = $containerImage;
        $this->apiPort = $apiPort;
        $this->apiUrl = "http://localhost:{$apiPort}/api/extract";
    }

    /**
     * Start the Kreuzberg Docker container
     *
     * @throws RuntimeException if container fails to start
     */
    public function startContainer(): void
    {
        echo "Starting Kreuzberg Docker container...\n";

        // exec() only captures stdout; redirect stderr so that docker's error text
        // actually ends up in the exception message below.
        $cmd = sprintf(
            'docker run -d --name %s -p %d:8000 %s 2>&1',
            escapeshellarg($this->containerName),
            $this->apiPort,
            escapeshellarg($this->containerImage)
        );

        $output = [];
        exec($cmd, $output, $returnCode);

        if ($returnCode !== 0) {
            throw new RuntimeException("Failed to start container: " . implode("\n", $output));
        }

        echo "Container started on http://localhost:{$this->apiPort}\n";
    }

    /**
     * Extract content from a file using the Docker API
     *
     * The file is sent as the multipart form field "file". cURL streams it from
     * disk and builds the multipart body (boundary, headers, filename quoting),
     * so the file is never loaded into PHP memory as a whole.
     *
     * @param string $filePath Path to the file to extract
     * @return string Value of the "content" key of the JSON response, or '' when the key is absent
     * @throws RuntimeException if the file is missing/unreadable, the request fails,
     *                          the server answers with a non-200 status, or the response is not valid JSON
     */
    public function extractFile(string $filePath): string
    {
        if (!is_file($filePath)) {
            throw new RuntimeException("File not found: {$filePath}");
        }

        if (!is_readable($filePath)) {
            throw new RuntimeException("File is not readable: {$filePath}");
        }

        $ch = curl_init($this->apiUrl);
        if ($ch === false) {
            throw new RuntimeException("cURL error: unable to initialise a cURL handle");
        }

        try {
            curl_setopt_array($ch, [
                CURLOPT_POST => true,
                // An array payload makes cURL send multipart/form-data with a generated boundary.
                CURLOPT_POSTFIELDS => [
                    'file' => new CURLFile($filePath, 'application/octet-stream', basename($filePath)),
                ],
                CURLOPT_RETURNTRANSFER => true,
            ]);

            $response = curl_exec($ch);

            if ($response === false) {
                throw new RuntimeException("cURL error: " . curl_error($ch));
            }

            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        } finally {
            // Release the handle on every path, including the error paths above.
            curl_close($ch);
        }

        if ($httpCode !== 200) {
            throw new RuntimeException("HTTP error {$httpCode}: {$response}");
        }

        $result = json_decode($response, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new RuntimeException("JSON decode error: " . json_last_error_msg());
        }

        return $result['content'] ?? '';
    }

    /**
     * Stop and remove the Docker container
     *
     * Never throws, so it is safe to call from a `finally` block; failures of the
     * individual docker commands are printed as warnings instead.
     */
    public function stopContainer(): void
    {
        echo "Stopping Kreuzberg Docker container...\n";

        $failed = false;
        foreach (['stop', 'rm'] as $action) {
            // Fresh buffer per command: exec() appends to an existing array.
            $output = [];
            exec(
                sprintf('docker %s %s 2>&1', $action, escapeshellarg($this->containerName)),
                $output,
                $returnCode
            );

            if ($returnCode !== 0) {
                $failed = true;
                echo "Warning: docker {$action} failed: " . implode("\n", $output) . "\n";
            }
        }

        if (!$failed) {
            echo "Container stopped and removed\n";
        }
    }
}
|
||||
|
||||
$dockerClient = new DockerKreuzbergClient();
|
||||
|
||||
try {
|
||||
$dockerClient->startContainer();
|
||||
|
||||
sleep(2);
|
||||
|
||||
$content = $dockerClient->extractFile('document.pdf');
|
||||
echo "Extracted content:\n{$content}\n";
|
||||
} finally {
|
||||
$dockerClient->stopContainer();
|
||||
}
|
||||
```
|
||||
215
docs/snippets/php/embeddings/basic_embeddings.php
Normal file
215
docs/snippets/php/embeddings/basic_embeddings.php
Normal file
@@ -0,0 +1,215 @@
|
||||
```php title="basic_embeddings.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Basic Embedding Generation
|
||||
*
|
||||
* Generate vector embeddings for semantic search and similarity matching.
|
||||
* Requires ONNX Runtime to be installed.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
chunking: new ChunkingConfig(
|
||||
maxChunkSize: 512,
|
||||
chunkOverlap: 50
|
||||
),
|
||||
embedding: new EmbeddingConfig(
|
||||
model: 'all-MiniLM-L6-v2',
|
||||
normalize: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
echo "Embedding Generation Results:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Chunks with embeddings: " . count($result->chunks ?? []) . "\n\n";
|
||||
|
||||
foreach ($result->chunks ?? [] as $chunk) {
|
||||
echo "Chunk {$chunk->metadata->chunkIndex}:\n";
|
||||
echo " Content length: " . strlen($chunk->content) . " chars\n";
|
||||
|
||||
if ($chunk->embedding !== null) {
|
||||
echo " Embedding dimension: " . count($chunk->embedding) . "\n";
|
||||
echo " First 5 values: [" . implode(', ', array_map(
|
||||
fn($v) => number_format($v, 4),
|
||||
array_slice($chunk->embedding, 0, 5)
|
||||
)) . "...]\n";
|
||||
}
|
||||
echo "\n";
|
||||
}
|
||||
|
||||
$models = [
|
||||
'all-MiniLM-L6-v2',
|
||||
'all-mpnet-base-v2',
|
||||
'paraphrase-multilingual-MiniLM-L12-v2',
|
||||
];
|
||||
|
||||
foreach ($models as $model) {
|
||||
echo "Testing model: $model\n";
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
chunking: new ChunkingConfig(maxChunkSize: 256),
|
||||
embedding: new EmbeddingConfig(
|
||||
model: $model,
|
||||
normalize: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$start = microtime(true);
|
||||
$result = $kreuzberg->extractFile('test_doc.pdf');
|
||||
$elapsed = microtime(true) - $start;
|
||||
|
||||
$chunk = ($result->chunks ?? [])[0] ?? null;
|
||||
if ($chunk && $chunk->embedding) {
|
||||
echo " Dimension: " . count($chunk->embedding) . "\n";
|
||||
echo " Time: " . number_format($elapsed, 3) . "s\n";
|
||||
echo " Chunks: " . count($result->chunks ?? []) . "\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Compute the cosine similarity of two numeric vectors.
 *
 * Both arrays are read at the integer positions 0 .. count($a) - 1, so they are
 * expected to be lists of the same length.
 *
 * @param array $a First vector
 * @param array $b Second vector
 * @return float Dot product divided by the product of both magnitudes;
 *               0.0 when either vector is empty or has zero magnitude
 */
function cosineSimilarity(array $a, array $b): float
{
    $dotProduct = 0.0;
    $magnitudeA = 0.0;
    $magnitudeB = 0.0;

    $length = count($a);
    for ($i = 0; $i < $length; $i++) {
        $dotProduct += $a[$i] * $b[$i];
        $magnitudeA += $a[$i] * $a[$i];
        $magnitudeB += $b[$i] * $b[$i];
    }

    $denominator = sqrt($magnitudeA) * sqrt($magnitudeB);

    // PHP 8 throws DivisionByZeroError for a zero divisor; a zero vector has no
    // direction, so report "no similarity" instead of crashing.
    if ($denominator == 0.0) {
        return 0.0;
    }

    return $dotProduct / $denominator;
}
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
chunking: new ChunkingConfig(maxChunkSize: 512),
|
||||
embedding: new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: true)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
echo "Chunk Similarity Analysis:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$chunks = $result->chunks ?? [];
|
||||
if (count($chunks) >= 2) {
|
||||
$referenceChunk = $chunks[0];
|
||||
|
||||
foreach (array_slice($chunks, 1, 5) as $chunk) {
|
||||
if ($referenceChunk->embedding && $chunk->embedding) {
|
||||
$similarity = cosineSimilarity(
|
||||
$referenceChunk->embedding,
|
||||
$chunk->embedding
|
||||
);
|
||||
|
||||
echo "Chunk 0 vs Chunk {$chunk->metadata->chunkIndex}: ";
|
||||
echo number_format($similarity, 4) . "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
echo "\n";
|
||||
|
||||
/**
 * Tiny in-memory vector store with brute-force cosine-similarity search.
 *
 * Intended for demonstration: every search compares the query against all
 * stored vectors.
 */
class SimpleVectorDB
{
    /** @var array<int|string, array{embedding: array, content: string}> Stored entries keyed by id */
    private array $vectors = [];

    /**
     * Store (or overwrite) an entry.
     *
     * @param string $id        Identifier of the entry; an existing entry with the same id is replaced
     * @param array  $embedding Embedding vector
     * @param string $content   Text the embedding was computed from
     */
    public function add(string $id, array $embedding, string $content): void
    {
        $this->vectors[$id] = [
            'embedding' => $embedding,
            'content' => $content,
        ];
    }

    /**
     * Return the stored entries most similar to the query vector.
     *
     * @param array $queryEmbedding Query vector
     * @param int   $k              Maximum number of entries to return
     * @return array List of ['id' => ..., 'similarity' => float, 'content' => string],
     *               ordered by descending similarity
     */
    public function search(array $queryEmbedding, int $k = 5): array
    {
        $results = [];

        foreach ($this->vectors as $id => $data) {
            $results[] = [
                'id' => $id,
                'similarity' => $this->cosineSimilarity($queryEmbedding, $data['embedding']),
                'content' => $data['content'],
            ];
        }

        usort($results, fn($a, $b) => $b['similarity'] <=> $a['similarity']);

        return array_slice($results, 0, $k);
    }

    /**
     * Cosine similarity of two vectors, read at positions 0 .. count($a) - 1.
     *
     * @return float 0.0 when either vector is empty or has zero magnitude
     */
    private function cosineSimilarity(array $a, array $b): float
    {
        $dotProduct = 0.0;
        $magA = 0.0;
        $magB = 0.0;

        $length = count($a);
        for ($i = 0; $i < $length; $i++) {
            $dotProduct += $a[$i] * $b[$i];
            $magA += $a[$i] * $a[$i];
            $magB += $b[$i] * $b[$i];
        }

        $denominator = sqrt($magA) * sqrt($magB);

        // Guard against DivisionByZeroError: one zero vector in the store must not
        // make every search fail.
        if ($denominator == 0.0) {
            return 0.0;
        }

        return $dotProduct / $denominator;
    }
}
|
||||
|
||||
$db = new SimpleVectorDB();
|
||||
|
||||
$files = ['doc1.pdf', 'doc2.pdf', 'doc3.pdf'];
|
||||
foreach ($files as $file) {
|
||||
if (!file_exists($file)) continue;
|
||||
|
||||
$result = $kreuzberg->extractFile($file);
|
||||
|
||||
foreach ($result->chunks ?? [] as $chunk) {
|
||||
if ($chunk->embedding) {
|
||||
$id = $file . '_chunk_' . $chunk->metadata->chunkIndex;
|
||||
$db->add($id, $chunk->embedding, $chunk->content);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
echo "Vector database built\n";
|
||||
echo "Ready for semantic search!\n";
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
chunking: new ChunkingConfig(maxChunkSize: 512),
|
||||
embedding: new EmbeddingConfig(model: 'all-MiniLM-L6-v2', normalize: true)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('export_doc.pdf');
|
||||
|
||||
$exportData = [];
|
||||
foreach ($result->chunks ?? [] as $chunk) {
|
||||
$exportData[] = [
|
||||
'id' => uniqid('vec_', true),
|
||||
'text' => $chunk->content,
|
||||
'embedding' => $chunk->embedding,
|
||||
'metadata' => [
|
||||
'chunk_index' => $chunk->metadata->chunkIndex,
|
||||
'source' => 'export_doc.pdf',
|
||||
'timestamp' => time(),
|
||||
],
|
||||
];
|
||||
}
|
||||
|
||||
file_put_contents('embeddings_export.json', json_encode($exportData));
|
||||
echo "\nExported " . count($exportData) . " embeddings to embeddings_export.json\n";
|
||||
```
|
||||
221
docs/snippets/php/embeddings/semantic_search.php
Normal file
221
docs/snippets/php/embeddings/semantic_search.php
Normal file
@@ -0,0 +1,221 @@
|
||||
```php title="semantic_search.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Semantic Search with Embeddings
|
||||
*
|
||||
* Build a semantic search system using document embeddings.
|
||||
* Find relevant content based on meaning, not just keywords.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ChunkingConfig;
|
||||
use Kreuzberg\Config\EmbeddingConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
chunking: new ChunkingConfig(
|
||||
maxChunkSize: 512,
|
||||
chunkOverlap: 50,
|
||||
respectSentences: true
|
||||
),
|
||||
embedding: new EmbeddingConfig(
|
||||
model: 'all-MiniLM-L6-v2',
|
||||
normalize: true
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
|
||||
// Build an in-memory index: one entry per chunk that carries an embedding.
echo "Building document index...\n";
$documentIndex = [];

// glob() returns false on error; fall back to an empty list so that the loop
// and the count() in the summary line below stay valid.
$files = glob('knowledge_base/*.pdf') ?: [];
foreach ($files as $file) {
    echo "Indexing: " . basename($file) . "\n";

    $result = $kreuzberg->extractFile($file);

    foreach ($result->chunks ?? [] as $chunk) {
        // Chunks without an embedding cannot be searched, so they are not indexed.
        if ($chunk->embedding) {
            $documentIndex[] = [
                'file' => basename($file),
                'chunk_index' => $chunk->metadata->chunkIndex,
                'content' => $chunk->content,
                'embedding' => $chunk->embedding,
                'metadata' => [
                    'title' => $result->metadata->title ?? basename($file),
                    'author' => $result->metadata->author ?? 'Unknown',
                ],
            ];
        }
    }
}

echo "Indexed " . count($documentIndex) . " chunks from " . count($files) . " documents\n\n";
|
||||
|
||||
/**
 * Rank index entries by cosine similarity to a query embedding.
 *
 * @param array $index          Index entries; each must provide an 'embedding' key
 * @param array $queryEmbedding Embedding of the query
 * @param int   $topK           Maximum number of entries to return
 * @return array The best-matching entries, each extended with a 'similarity' key,
 *               ordered by descending similarity
 */
function semanticSearch(array $index, array $queryEmbedding, int $topK = 5): array
{
    // Score every entry; the original fields are kept and 'similarity' is added.
    $scored = array_map(
        fn($entry) => array_merge(
            $entry,
            ['similarity' => cosineSimilarity($queryEmbedding, $entry['embedding'])]
        ),
        $index
    );

    // usort() re-indexes the list, highest similarity first.
    usort($scored, fn($left, $right) => $right['similarity'] <=> $left['similarity']);

    return array_slice($scored, 0, $topK);
}
|
||||
|
||||
/**
 * Compute the cosine similarity of two numeric vectors.
 *
 * Both arrays are read at the integer positions 0 .. count($a) - 1, so they are
 * expected to be lists of the same length.
 *
 * @param array $a First vector
 * @param array $b Second vector
 * @return float Dot product divided by the product of both magnitudes;
 *               0.0 when either vector is empty or has zero magnitude
 */
function cosineSimilarity(array $a, array $b): float
{
    $dotProduct = $magnitudeA = $magnitudeB = 0.0;

    $length = count($a);
    for ($i = 0; $i < $length; $i++) {
        $dotProduct += $a[$i] * $b[$i];
        $magnitudeA += $a[$i] * $a[$i];
        $magnitudeB += $b[$i] * $b[$i];
    }

    $denominator = sqrt($magnitudeA) * sqrt($magnitudeB);

    // PHP 8 throws DivisionByZeroError for a zero divisor; a zero vector has no
    // direction, so report "no similarity" instead of crashing the search.
    if ($denominator == 0.0) {
        return 0.0;
    }

    return $dotProduct / $denominator;
}
|
||||
|
||||
/**
 * Obtain an embedding for a free-text query.
 *
 * The query is written to a temporary file and run through the extractor; the
 * embedding of the first resulting chunk is used. The temporary file is removed
 * on every path.
 *
 * @param Kreuzberg $kreuzberg Extractor used to process the temporary file
 * @param string    $query     Query text
 * @return array|null Embedding of the first chunk, or null when no chunk/embedding was produced
 * @throws RuntimeException if the temporary file cannot be created or written
 */
function getQueryEmbedding(Kreuzberg $kreuzberg, string $query): ?array
{
    $tempFile = tempnam(sys_get_temp_dir(), 'query_');
    if ($tempFile === false) {
        throw new RuntimeException('Unable to create a temporary file for the query');
    }

    try {
        // Written inside the try block so that a failed write still cleans up the file.
        if (file_put_contents($tempFile, $query) === false) {
            throw new RuntimeException("Unable to write the query to {$tempFile}");
        }

        $result = $kreuzberg->extractFile($tempFile);
        $chunk = ($result->chunks ?? [])[0] ?? null;

        return $chunk?->embedding;
    } finally {
        unlink($tempFile);
    }
}
|
||||
|
||||
$queries = [
|
||||
"What are the key features of the product?",
|
||||
"How do I install and configure the system?",
|
||||
"What are the pricing options?",
|
||||
"How does authentication work?",
|
||||
"What are the performance benchmarks?",
|
||||
];
|
||||
|
||||
foreach ($queries as $query) {
|
||||
echo "Query: \"$query\"\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$queryEmbedding = getQueryEmbedding($kreuzberg, $query);
|
||||
|
||||
if ($queryEmbedding) {
|
||||
$results = semanticSearch($documentIndex, $queryEmbedding, 3);
|
||||
|
||||
foreach ($results as $index => $result) {
|
||||
echo "\nResult " . ($index + 1) . " (similarity: " .
|
||||
number_format($result['similarity'], 4) . "):\n";
|
||||
echo "File: {$result['file']}\n";
|
||||
echo "Title: {$result['metadata']['title']}\n";
|
||||
echo "Content: " . substr($result['content'], 0, 200) . "...\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\n" . str_repeat('-', 60) . "\n\n";
|
||||
}
|
||||
|
||||
/**
 * Assemble a text context for an LLM prompt from ranked search results.
 *
 * Results are appended in order until the next one would exceed the token
 * budget; a result's size is estimated as its byte length divided by four.
 *
 * @param array $searchResults Results providing 'file' and 'content' keys
 * @param int   $maxTokens     Estimated token budget
 * @return string Header line followed by one "From <file>:" section per included result
 */
function buildRAGContext(array $searchResults, int $maxTokens = 2000): string
{
    $sections = [];
    $usedTokens = 0;

    foreach ($searchResults as $hit) {
        // Rough estimate: four bytes per token.
        $projected = $usedTokens + strlen($hit['content']) / 4;

        // Stop at the first result that does not fit; later results are not considered.
        if ($projected > $maxTokens) {
            break;
        }

        $sections[] = 'From ' . $hit['file'] . ":\n" . $hit['content'] . "\n\n";
        $usedTokens = $projected;
    }

    return "Relevant context:\n\n" . implode('', $sections);
}
|
||||
|
||||
$userQuestion = "How do I optimize performance?";
|
||||
$queryEmbedding = getQueryEmbedding($kreuzberg, $userQuestion);
|
||||
|
||||
if ($queryEmbedding) {
|
||||
$results = semanticSearch($documentIndex, $queryEmbedding, 5);
|
||||
$context = buildRAGContext($results);
|
||||
|
||||
echo "RAG Context for: \"$userQuestion\"\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo $context;
|
||||
echo "\nContext ready for LLM prompt!\n";
|
||||
}
|
||||
|
||||
file_put_contents(
|
||||
'document_index.json',
|
||||
json_encode($documentIndex, JSON_PRETTY_PRINT)
|
||||
);
|
||||
echo "\nSaved document index to: document_index.json\n";
|
||||
|
||||
/**
 * Search with several related queries and rank chunks by their average similarity.
 *
 * Each query contributes its top 10 matches. Matches are grouped per chunk
 * (file name plus chunk index), and the similarities a chunk received are averaged.
 *
 * @param array     $index     Document index as consumed by semanticSearch()
 * @param array     $queries   Query strings
 * @param Kreuzberg $kreuzberg Extractor used to embed the queries
 * @return array Up to five entries, each extended with an 'avg_similarity' key,
 *               ordered by descending average similarity
 */
function multiQuerySearch(array $index, array $queries, Kreuzberg $kreuzberg): array
{
    $buckets = [];

    foreach ($queries as $text) {
        $vector = getQueryEmbedding($kreuzberg, $text);
        if (!$vector) {
            // No embedding for this query: it contributes nothing.
            continue;
        }

        foreach (semanticSearch($index, $vector, 10) as $hit) {
            $bucketKey = $hit['file'] . '_' . $hit['chunk_index'];

            // The first hit seen for a chunk becomes its representative entry.
            $buckets[$bucketKey] ??= ['result' => $hit, 'similarities' => []];
            $buckets[$bucketKey]['similarities'][] = $hit['similarity'];
        }
    }

    $ranked = [];
    foreach ($buckets as $bucket) {
        $scores = $bucket['similarities'];
        $ranked[] = array_merge(
            $bucket['result'],
            ['avg_similarity' => array_sum($scores) / count($scores)]
        );
    }

    usort($ranked, fn($left, $right) => $right['avg_similarity'] <=> $left['avg_similarity']);

    return array_slice($ranked, 0, 5);
}
|
||||
|
||||
$relatedQueries = [
|
||||
"system requirements",
|
||||
"installation steps",
|
||||
"getting started guide",
|
||||
];
|
||||
|
||||
echo "\nMulti-query search results:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$results = multiQuerySearch($documentIndex, $relatedQueries, $kreuzberg);
|
||||
|
||||
foreach ($results as $index => $result) {
|
||||
echo "\n" . ($index + 1) . ". {$result['file']}\n";
|
||||
echo " Average similarity: " . number_format($result['avg_similarity'], 4) . "\n";
|
||||
echo " " . substr($result['content'], 0, 150) . "...\n";
|
||||
}
|
||||
```
|
||||
154
docs/snippets/php/extraction/batch_processing.php
Normal file
154
docs/snippets/php/extraction/batch_processing.php
Normal file
@@ -0,0 +1,154 @@
|
||||
```php title="batch_processing.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Batch Document Processing
|
||||
*
|
||||
* Process multiple documents in parallel for maximum performance.
|
||||
* Kreuzberg's batch API uses multiple threads to extract documents concurrently.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use function Kreuzberg\batch_extract_files;
|
||||
use function Kreuzberg\batch_extract_bytes;
|
||||
|
||||
$files = [
    'document1.pdf',
    'document2.docx',
    'document3.xlsx',
    'presentation.pptx',
];

// array_filter() preserves the original keys, which leaves gaps when a file is
// missing. Re-index so that $files is a proper list and $files[$index] below lines
// up with the result positions (assumes results come back in input order — TODO confirm).
$files = array_values(array_filter($files, 'file_exists'));

if (!empty($files)) {
    echo "Processing " . count($files) . " files in batch...\n\n";

    $start = microtime(true);
    $results = batch_extract_files($files);
    $elapsed = microtime(true) - $start;

    echo "Batch extraction completed in " . number_format($elapsed, 3) . " seconds\n";
    echo "Average: " . number_format($elapsed / count($files), 3) . " seconds per file\n\n";

    foreach ($results as $index => $result) {
        $filename = basename($files[$index]);
        echo "$filename:\n";
        echo "  Content: " . strlen($result->content) . " chars\n";
        echo "  Tables: " . count($result->tables) . "\n";
        echo "  MIME: " . $result->mimeType . "\n\n";
    }
}
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
extractTables: true,
|
||||
extractImages: false
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
|
||||
$pdfFiles = glob('*.pdf');
|
||||
if (!empty($pdfFiles)) {
|
||||
echo "Processing " . count($pdfFiles) . " PDF files...\n";
|
||||
|
||||
$start = microtime(true);
|
||||
$results = $kreuzberg->batchExtractFiles($pdfFiles, $config);
|
||||
$elapsed = microtime(true) - $start;
|
||||
|
||||
echo "Completed in " . number_format($elapsed, 2) . " seconds\n";
|
||||
echo "Throughput: " . number_format(count($pdfFiles) / $elapsed, 2) . " files/second\n\n";
|
||||
|
||||
$totalChars = 0;
|
||||
$totalTables = 0;
|
||||
|
||||
foreach ($results as $result) {
|
||||
$totalChars += strlen($result->content);
|
||||
$totalTables += count($result->tables);
|
||||
}
|
||||
|
||||
echo "Total content: " . number_format($totalChars) . " characters\n";
|
||||
echo "Total tables: $totalTables\n";
|
||||
}
|
||||
|
||||
$uploadedFiles = [
|
||||
['data' => file_get_contents('file1.pdf'), 'mime' => 'application/pdf'],
|
||||
['data' => file_get_contents('file2.docx'), 'mime' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'],
|
||||
];
|
||||
|
||||
$dataList = array_column($uploadedFiles, 'data');
|
||||
$mimeTypes = array_column($uploadedFiles, 'mime');
|
||||
|
||||
$results = batch_extract_bytes($dataList, $mimeTypes);
|
||||
|
||||
echo "\nProcessed " . count($results) . " files from memory\n";
|
||||
|
||||
/**
 * Recursively collect supported documents below a directory and extract them in batches.
 *
 * Files are selected by lower-cased extension (pdf, docx, xlsx, pptx, txt) and handed
 * to batchExtractFiles() ten at a time; one progress line is printed per batch.
 *
 * @param string    $dir       Root directory to walk.
 * @param Kreuzberg $kreuzberg Extractor used for every batch.
 *
 * @return array Results of all batches merged together; empty when no file matched.
 */
function processDirectory(string $dir, Kreuzberg $kreuzberg): array
{
    $supported = ['pdf', 'docx', 'xlsx', 'pptx', 'txt'];
    $extracted = [];
    $walker = new RecursiveIteratorIterator(new RecursiveDirectoryIterator($dir));

    $paths = [];
    foreach ($walker as $entry) {
        // Directories (including the "." and ".." entries) are not files and are skipped.
        if (!$entry->isFile()) {
            continue;
        }
        if (!in_array(strtolower($entry->getExtension()), $supported, true)) {
            continue;
        }
        $paths[] = $entry->getPathname();
    }

    if ($paths === []) {
        return $extracted;
    }

    $chunks = array_chunk($paths, 10);
    $chunkTotal = count($chunks);

    foreach ($chunks as $position => $chunk) {
        echo "Processing batch " . ($position + 1) . "/" . $chunkTotal . "...\n";
        $extracted = array_merge($extracted, $kreuzberg->batchExtractFiles($chunk));
    }

    return $extracted;
}
|
||||
|
||||
$directory = './documents';
|
||||
if (is_dir($directory)) {
|
||||
echo "\nProcessing directory: $directory\n";
|
||||
$results = processDirectory($directory, $kreuzberg);
|
||||
echo "Processed " . count($results) . " files\n";
|
||||
}
|
||||
|
||||
$mixedFiles = ['valid.pdf', 'nonexistent.pdf', 'another.docx'];
|
||||
|
||||
try {
|
||||
$results = batch_extract_files($mixedFiles);
|
||||
} catch (\Kreuzberg\Exceptions\KreuzbergException $e) {
|
||||
echo "Batch processing error: " . $e->getMessage() . "\n";
|
||||
}
|
||||
|
||||
// Progress-reporting batch run over every supported document in documents/.
//
// Each extension is globbed separately because GLOB_BRACE is not defined on every
// platform, and "?: []" turns a failed glob() (which returns false) into "no matches"
// so array_chunk() always receives an array.
$allFiles = [];
foreach (['pdf', 'docx', 'xlsx'] as $extension) {
    $allFiles = array_merge($allFiles, glob("documents/*.{$extension}") ?: []);
}

$batchSize = 5;
$batches = array_chunk($allFiles, $batchSize);
$batchCount = count($batches);
$totalProcessed = 0;

echo "\nProcessing " . count($allFiles) . " files in " . $batchCount . " batches...\n";

foreach ($batches as $index => $batch) {
    // "\r" returns to the start of the line so the indicator is redrawn in place.
    $progress = (($index + 1) / $batchCount) * 100;
    printf("\rProgress: %.1f%% [%d/%d batches]", $progress, $index + 1, $batchCount);

    $results = $kreuzberg->batchExtractFiles($batch);
    $totalProcessed += count($results);
}

echo "\n\nCompleted! Processed $totalProcessed files.\n";
|
||||
```
|
||||
118
docs/snippets/php/extraction/docx_extraction.php
Normal file
118
docs/snippets/php/extraction/docx_extraction.php
Normal file
@@ -0,0 +1,118 @@
|
||||
```php title="docx_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* DOCX (Word) Document Extraction
|
||||
*
|
||||
* Extract text, tables, and metadata from Microsoft Word documents.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
$result = extract_file('document.docx');
|
||||
|
||||
echo "Word Document Extraction:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Content:\n";
|
||||
echo $result->content . "\n\n";
|
||||
|
||||
echo "Document Metadata:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Title: " . ($result->metadata->title ?? 'N/A') . "\n";
|
||||
echo "Authors: " . (isset($result->metadata->authors) ? implode(', ', $result->metadata->authors) : 'N/A') . "\n";
|
||||
echo "Created: " . ($result->metadata->createdAt ?? 'N/A') . "\n";
|
||||
echo "Modified: " . ($result->metadata->modifiedAt ?? 'N/A') . "\n";
|
||||
echo "Subject: " . ($result->metadata->subject ?? 'N/A') . "\n";
|
||||
echo "Keywords: " . implode(', ', $result->metadata->keywords ?? []) . "\n\n";
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
extractTables: true,
|
||||
preserveFormatting: true
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('report.docx');
|
||||
|
||||
foreach ($result->tables as $index => $table) {
|
||||
echo "Table " . ($index + 1) . ":\n";
|
||||
echo str_repeat('-', 60) . "\n";
|
||||
|
||||
foreach ($table->cells as $rowIndex => $row) {
|
||||
echo implode(' | ', $row) . "\n";
|
||||
if ($rowIndex === 0) {
|
||||
echo str_repeat('-', 60) . "\n";
|
||||
}
|
||||
}
|
||||
echo "\n";
|
||||
}
|
||||
|
||||
$conversions = [
|
||||
'plain' => null,
|
||||
'markdown' => 'markdown',
|
||||
];
|
||||
|
||||
foreach ($conversions as $name => $format) {
|
||||
$config = new ExtractionConfig(
|
||||
outputFormat: $format,
|
||||
preserveFormatting: $format !== null
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('document.docx');
|
||||
|
||||
$outputFile = "output_$name.txt";
|
||||
file_put_contents($outputFile, $result->content);
|
||||
echo "Saved $name format to: $outputFile\n";
|
||||
}
|
||||
|
||||
use function Kreuzberg\batch_extract_files;
|
||||
|
||||
$docxFiles = glob('*.docx');
|
||||
if (!empty($docxFiles)) {
|
||||
echo "\nBatch processing " . count($docxFiles) . " DOCX files...\n";
|
||||
|
||||
$results = batch_extract_files($docxFiles);
|
||||
|
||||
foreach ($results as $index => $result) {
|
||||
$filename = basename($docxFiles[$index]);
|
||||
echo "\n$filename:\n";
|
||||
echo " Characters: " . strlen($result->content) . "\n";
|
||||
echo " Tables: " . count($result->tables) . "\n";
|
||||
echo " Authors: " . (isset($result->metadata->authors) ? implode(', ', $result->metadata->authors) : 'Unknown') . "\n";
|
||||
}
|
||||
}
|
||||
|
||||
$result = extract_file('reviewed_document.docx');
|
||||
|
||||
if (!empty($result->metadata->createdBy)) {
|
||||
echo "\nDocument Information:\n";
|
||||
echo "Created by: " . $result->metadata->createdBy . "\n";
|
||||
}
|
||||
|
||||
if (!empty($result->metadata->producer)) {
|
||||
echo "Producer: " . $result->metadata->producer . "\n";
|
||||
}
|
||||
|
||||
$result = extract_file('document.docx');
|
||||
$content = $result->content;
|
||||
|
||||
// Basic text statistics for the extracted content.
// Leading/trailing whitespace is ignored when counting lines and paragraphs so that
// a trailing newline does not change the result.
$trimmedContent = trim($content);

$stats = [
    'characters' => mb_strlen($content),
    // str_word_count() works on bytes and breaks words at non-ASCII letters, so count
    // runs of Unicode letters instead (apostrophes and hyphens stay inside a word).
    'words' => (int) preg_match_all('/\p{L}[\p{L}\p{M}\'’-]*/u', $content),
    // N line breaks separate N + 1 lines; empty content has no lines.
    'lines' => $trimmedContent === '' ? 0 : (int) preg_match_all('/\R/', $trimmedContent) + 1,
    // Paragraphs are separated by one or more blank lines.
    'paragraphs' => $trimmedContent === '' ? 0 : (int) preg_match_all('/(?:\R){2,}/', $trimmedContent) + 1,
    // Each run of terminal punctuation counts as one sentence end.
    'sentences' => preg_match_all('/[.!?]+/', $content),
];
|
||||
|
||||
echo "\nDocument Statistics:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
foreach ($stats as $metric => $value) {
|
||||
echo ucfirst($metric) . ": " . number_format($value) . "\n";
|
||||
}
|
||||
```
|
||||
288
docs/snippets/php/extraction/excel_extraction.php
Normal file
288
docs/snippets/php/extraction/excel_extraction.php
Normal file
@@ -0,0 +1,288 @@
|
||||
```php title="excel_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Excel Spreadsheet Extraction
|
||||
*
|
||||
* This example demonstrates extracting content from Excel files (.xlsx, .xls).
|
||||
* Excel spreadsheets are automatically converted to tables and text.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
|
||||
echo "Example 1: Basic Excel Extraction\n";
|
||||
echo "=================================\n";
|
||||
|
||||
$kreuzberg = new Kreuzberg();
|
||||
$result = $kreuzberg->extractFile('financial_report.xlsx');
|
||||
|
||||
echo "Content:\n";
|
||||
echo $result->content . "\n\n";
|
||||
|
||||
echo "Metadata:\n";
|
||||
echo "- Title: " . ($result->metadata->title ?? 'N/A') . "\n";
|
||||
echo "- Author: " . (isset($result->metadata->authors) ? implode(', ', $result->metadata->authors) : 'N/A') . "\n";
|
||||
echo "- Created: " . ($result->metadata->createdAt ?? 'N/A') . "\n";
|
||||
echo "- Modified: " . ($result->metadata->modifiedAt ?? 'N/A') . "\n\n";
|
||||
|
||||
echo "Example 2: Extract Excel Tables\n";
|
||||
echo "===============================\n";
|
||||
|
||||
$config2 = new ExtractionConfig(
|
||||
extractTables: true
|
||||
);
|
||||
|
||||
$result2 = (new Kreuzberg($config2))->extractFile('data.xlsx');
|
||||
|
||||
if (count($result2->tables) > 0) {
|
||||
echo "Found " . count($result2->tables) . " table(s)\n\n";
|
||||
|
||||
foreach ($result2->tables as $i => $table) {
|
||||
echo "Table " . ($i + 1) . " (Sheet/Page {$table->pageNumber}):\n";
|
||||
echo $table->markdown . "\n\n";
|
||||
|
||||
echo "Raw data:\n";
|
||||
echo "Rows: " . count($table->cells) . "\n";
|
||||
echo "Columns: " . (count($table->cells) > 0 ? count($table->cells[0]) : 0) . "\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "Example 3: Convert Excel to CSV\n";
|
||||
echo "===============================\n";
|
||||
|
||||
$result3 = $kreuzberg->extractFile('spreadsheet.xlsx');
|
||||
|
||||
// Write every extracted table to its own CSV file (sheet_0.csv, sheet_1.csv, ...).
foreach ($result3->tables as $i => $table) {
    $csvFilename = "sheet_{$i}.csv";

    // fopen() returns false on failure; handing that to fputcsv() would throw a TypeError.
    $fp = fopen($csvFilename, 'w');
    if ($fp === false) {
        echo "Error: could not open {$csvFilename} for writing\n";
        continue;
    }

    try {
        foreach ($table->cells as $row) {
            fputcsv($fp, $row);
        }
    } finally {
        // Close the handle even when writing a row throws.
        fclose($fp);
    }

    echo "Saved: {$csvFilename}\n";
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Example 4: Convert Excel to JSON\n";
|
||||
echo "================================\n";
|
||||
|
||||
$result4 = $kreuzberg->extractFile('data.xlsx');
|
||||
|
||||
foreach ($result4->tables as $i => $table) {
|
||||
$jsonData = [];
|
||||
|
||||
if (count($table->cells) > 0) {
|
||||
$headers = $table->cells[0];
|
||||
|
||||
for ($j = 1; $j < count($table->cells); $j++) {
|
||||
$row = $table->cells[$j];
|
||||
$rowData = [];
|
||||
|
||||
for ($k = 0; $k < count($headers); $k++) {
|
||||
$header = $headers[$k];
|
||||
$value = $row[$k] ?? '';
|
||||
$rowData[$header] = $value;
|
||||
}
|
||||
|
||||
$jsonData[] = $rowData;
|
||||
}
|
||||
}
|
||||
|
||||
$jsonFilename = "sheet_{$i}.json";
|
||||
file_put_contents($jsonFilename, json_encode($jsonData, JSON_PRETTY_PRINT));
|
||||
echo "Saved: {$jsonFilename}\n";
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Example 5: Process Multiple Sheets\n";
|
||||
echo "==================================\n";
|
||||
|
||||
$result5 = $kreuzberg->extractFile('multi_sheet_workbook.xlsx');
|
||||
|
||||
echo "Total sheets/tables: " . count($result5->tables) . "\n\n";
|
||||
|
||||
foreach ($result5->tables as $i => $table) {
|
||||
echo "Sheet " . ($i + 1) . ":\n";
|
||||
echo "- Rows: " . count($table->cells) . "\n";
|
||||
echo "- Columns: " . (count($table->cells) > 0 ? count($table->cells[0]) : 0) . "\n";
|
||||
|
||||
if (count($table->cells) > 1) {
|
||||
$numericColumns = [];
|
||||
|
||||
for ($col = 0; $col < count($table->cells[0]); $col++) {
|
||||
$isNumeric = true;
|
||||
|
||||
for ($row = 1; $row < count($table->cells); $row++) {
|
||||
$value = $table->cells[$row][$col] ?? '';
|
||||
if (!is_numeric(trim($value)) && trim($value) !== '') {
|
||||
$isNumeric = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ($isNumeric) {
|
||||
$numericColumns[] = $col;
|
||||
}
|
||||
}
|
||||
|
||||
if (!empty($numericColumns)) {
|
||||
echo "- Numeric columns: " . count($numericColumns) . "\n";
|
||||
|
||||
$col = $numericColumns[0];
|
||||
$sum = 0;
|
||||
for ($row = 1; $row < count($table->cells); $row++) {
|
||||
$value = $table->cells[$row][$col] ?? '0';
|
||||
$sum += (float) $value;
|
||||
}
|
||||
|
||||
$columnName = $table->cells[0][$col] ?? "Column {$col}";
|
||||
echo "- Sum of '{$columnName}': {$sum}\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
}
|
||||
|
||||
echo "Example 6: Extract Specific Data\n";
|
||||
echo "================================\n";
|
||||
|
||||
$result6 = $kreuzberg->extractFile('budget.xlsx');
|
||||
|
||||
if (count($result6->tables) > 0) {
|
||||
$table = $result6->tables[0];
|
||||
|
||||
echo "Header row:\n";
|
||||
if (count($table->cells) > 0) {
|
||||
print_r($table->cells[0]);
|
||||
}
|
||||
|
||||
echo "\nFirst data row:\n";
|
||||
if (count($table->cells) > 1) {
|
||||
print_r($table->cells[1]);
|
||||
}
|
||||
|
||||
if (count($table->cells) > 1 && count($table->cells[1]) > 2) {
|
||||
$cellValue = $table->cells[1][2];
|
||||
echo "\nCell [1][2]: {$cellValue}\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Example 7: Batch Process Excel Files\n";
|
||||
echo "====================================\n";
|
||||
|
||||
$excelFiles = [
|
||||
'january_sales.xlsx',
|
||||
'february_sales.xlsx',
|
||||
'march_sales.xlsx',
|
||||
];
|
||||
|
||||
$results = $kreuzberg->batchExtractFiles($excelFiles);
|
||||
|
||||
$totalSheets = 0;
|
||||
foreach ($results as $i => $result) {
|
||||
$sheetCount = count($result->tables);
|
||||
$totalSheets += $sheetCount;
|
||||
|
||||
echo "{$excelFiles[$i]}:\n";
|
||||
echo "- Sheets: {$sheetCount}\n";
|
||||
echo "- Text length: " . strlen($result->content) . " characters\n\n";
|
||||
}
|
||||
|
||||
echo "Total sheets across all files: {$totalSheets}\n\n";
|
||||
|
||||
echo "Example 8: Convert Excel to HTML\n";
|
||||
echo "================================\n";
|
||||
|
||||
$result8 = $kreuzberg->extractFile('report.xlsx');
|
||||
|
||||
foreach ($result8->tables as $i => $table) {
|
||||
$html = "<table border='1'>\n";
|
||||
|
||||
foreach ($table->cells as $rowIndex => $row) {
|
||||
$html .= " <tr>\n";
|
||||
|
||||
$tag = $rowIndex === 0 ? 'th' : 'td';
|
||||
|
||||
foreach ($row as $cell) {
|
||||
$escapedCell = htmlspecialchars($cell);
|
||||
$html .= " <{$tag}>{$escapedCell}</{$tag}>\n";
|
||||
}
|
||||
|
||||
$html .= " </tr>\n";
|
||||
}
|
||||
|
||||
$html .= "</table>\n";
|
||||
|
||||
$htmlFilename = "sheet_{$i}.html";
|
||||
file_put_contents($htmlFilename, $html);
|
||||
echo "Saved: {$htmlFilename}\n";
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Example 9: Excel Metadata Extraction\n";
|
||||
echo "====================================\n";
|
||||
|
||||
$result9 = $kreuzberg->extractFile('workbook.xlsx');
|
||||
|
||||
echo "File Metadata:\n";
|
||||
echo "- Title: " . ($result9->metadata->title ?? 'N/A') . "\n";
|
||||
echo "- Subject: " . ($result9->metadata->subject ?? 'N/A') . "\n";
|
||||
echo "- Authors: " . (isset($result9->metadata->authors) ? implode(', ', $result9->metadata->authors) : 'N/A') . "\n";
|
||||
echo "- Created: " . ($result9->metadata->createdAt ?? 'N/A') . "\n";
|
||||
echo "- Modified: " . ($result9->metadata->modifiedAt ?? 'N/A') . "\n";
|
||||
echo "- Created By: " . ($result9->metadata->createdBy ?? 'N/A') . "\n";
|
||||
echo "- Keywords: " . (isset($result9->metadata->keywords) ? implode(', ', $result9->metadata->keywords) : 'N/A') . "\n";
|
||||
|
||||
if (!empty($result9->metadata->custom)) {
|
||||
echo "\nCustom Properties:\n";
|
||||
foreach ($result9->metadata->custom as $key => $value) {
|
||||
echo "- {$key}: {$value}\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\n";
|
||||
|
||||
echo "Example 10: Error Handling\n";
|
||||
echo "=========================\n";
|
||||
|
||||
use Kreuzberg\Exceptions\KreuzbergException;
|
||||
|
||||
try {
|
||||
$result = $kreuzberg->extractFile('protected.xlsx');
|
||||
echo "Success: Extracted " . count($result->tables) . " sheets\n";
|
||||
} catch (KreuzbergException $e) {
|
||||
echo "Error: {$e->getMessage()}\n";
|
||||
echo "Note: Password-protected files may require special handling\n";
|
||||
}
|
||||
|
||||
echo "\n\nSupported Excel Formats:\n";
|
||||
echo "========================\n";
|
||||
echo "- .xlsx (Office Open XML)\n";
|
||||
echo "- .xls (Legacy Excel format)\n";
|
||||
echo "- .xlsm (Macro-enabled)\n";
|
||||
echo "- .xlsb (Binary workbook)\n";
|
||||
echo "- .xltx (Template)\n";
|
||||
|
||||
echo "\n\nBest Practices:\n";
|
||||
echo "===============\n";
|
||||
echo "1. Excel tables are automatically detected as Table objects\n";
|
||||
echo "2. Each sheet becomes a separate table\n";
|
||||
echo "3. Use table->cells for programmatic access to cell data\n";
|
||||
echo "4. Use table->markdown for human-readable output\n";
|
||||
echo "5. First row is often headers - handle accordingly\n";
|
||||
echo "6. Check for numeric columns to perform calculations\n";
|
||||
echo "7. Export to CSV/JSON for database import\n";
|
||||
echo "8. Use batch processing for multiple Excel files\n";
|
||||
```
|
||||
159
docs/snippets/php/extraction/image_extraction.php
Normal file
159
docs/snippets/php/extraction/image_extraction.php
Normal file
@@ -0,0 +1,159 @@
|
||||
```php title="image_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Image Extraction from Documents
|
||||
*
|
||||
* Extract embedded images from PDFs, Office documents, and other formats.
|
||||
* Optionally perform OCR on extracted images.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ImageExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
|
||||
$config = new ExtractionConfig(
|
||||
imageExtraction: new ImageExtractionConfig(
|
||||
extractImages: true,
|
||||
minWidth: 100,
|
||||
minHeight: 100
|
||||
),
|
||||
extractImages: true
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile('presentation.pptx');
|
||||
|
||||
echo "Image Extraction Results:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Images found: " . count($result->images ?? []) . "\n\n";
|
||||
|
||||
foreach ($result->images ?? [] as $image) {
|
||||
$filename = sprintf(
|
||||
'extracted_p%d_i%d_%dx%d.%s',
|
||||
$image->pageNumber,
|
||||
$image->imageIndex,
|
||||
$image->width,
|
||||
$image->height,
|
||||
$image->format
|
||||
);
|
||||
|
||||
file_put_contents($filename, $image->data);
|
||||
echo "Saved: $filename\n";
|
||||
echo " Size: {$image->width}x{$image->height} pixels\n";
|
||||
echo " Format: {$image->format}\n";
|
||||
echo " Data: " . number_format(strlen($image->data)) . " bytes\n\n";
|
||||
}
|
||||
|
||||
$ocrConfig = new ExtractionConfig(
|
||||
imageExtraction: new ImageExtractionConfig(
|
||||
extractImages: true,
|
||||
performOcr: true,
|
||||
minWidth: 200,
|
||||
minHeight: 100
|
||||
),
|
||||
ocr: new OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng'
|
||||
)
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($ocrConfig);
|
||||
$result = $kreuzberg->extractFile('scanned_images.pdf');
|
||||
|
||||
echo "Images with OCR:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
foreach ($result->images ?? [] as $image) {
|
||||
echo "Image {$image->imageIndex} from page {$image->pageNumber}:\n";
|
||||
|
||||
if ($image->ocrResult !== null) {
|
||||
echo " OCR Text: " . substr($image->ocrResult->content, 0, 100) . "...\n";
|
||||
echo " OCR Length: " . strlen($image->ocrResult->content) . " chars\n";
|
||||
} else {
|
||||
echo " No OCR result\n";
|
||||
}
|
||||
echo "\n";
|
||||
}
|
||||
|
||||
$largeImageConfig = new ExtractionConfig(
|
||||
imageExtraction: new ImageExtractionConfig(
|
||||
extractImages: true,
|
||||
minWidth: 500,
|
||||
minHeight: 500
|
||||
),
|
||||
extractImages: true
|
||||
);
|
||||
|
||||
$kreuzberg = new Kreuzberg($largeImageConfig);
|
||||
$result = $kreuzberg->extractFile('photo_album.pdf');
|
||||
|
||||
echo "Large images (>500x500):\n";
|
||||
foreach ($result->images ?? [] as $image) {
|
||||
$filename = "large_image_{$image->imageIndex}.{$image->format}";
|
||||
file_put_contents($filename, $image->data);
|
||||
echo "Saved: $filename ({$image->width}x{$image->height})\n";
|
||||
}
|
||||
|
||||
$result = $kreuzberg->extractFile('document.pdf');
|
||||
|
||||
$imageTypes = [];
|
||||
foreach ($result->images ?? [] as $image) {
|
||||
if (!isset($imageTypes[$image->format])) {
|
||||
$imageTypes[$image->format] = [];
|
||||
}
|
||||
$imageTypes[$image->format][] = $image;
|
||||
}
|
||||
|
||||
echo "\nImages by format:\n";
|
||||
foreach ($imageTypes as $format => $images) {
|
||||
echo " $format: " . count($images) . " images\n";
|
||||
|
||||
$dir = "images_$format";
|
||||
if (!is_dir($dir)) {
|
||||
mkdir($dir, 0755, true);
|
||||
}
|
||||
|
||||
foreach ($images as $index => $image) {
|
||||
$filename = "$dir/image_$index.$format";
|
||||
file_put_contents($filename, $image->data);
|
||||
}
|
||||
echo " Saved to: $dir/\n";
|
||||
}
|
||||
|
||||
// Create 200px-wide thumbnails for PNG/JPEG images; skipped when the GD extension is missing.
if (extension_loaded('gd')) {
    foreach ($result->images ?? [] as $image) {
        // NOTE(review): 'jpeg' is accepted next to 'jpg' in case the extractor reports
        // that spelling — confirm against the values Kreuzberg actually returns.
        if (!in_array($image->format, ['png', 'jpg', 'jpeg'], true)) {
            continue;
        }

        $gdImage = imagecreatefromstring($image->data);
        if ($gdImage === false) {
            continue;
        }

        $width = imagesx($gdImage);
        $height = imagesy($gdImage);
        $thumbWidth = 200;
        // Keep the aspect ratio, but never go below 1px: a very wide image would
        // otherwise truncate to a height of 0, which imagecreatetruecolor() rejects.
        $thumbHeight = max(1, (int) (($height / $width) * $thumbWidth));

        $thumb = imagecreatetruecolor($thumbWidth, $thumbHeight);
        imagecopyresampled(
            $thumb,
            $gdImage,
            0,
            0,
            0,
            0,
            $thumbWidth,
            $thumbHeight,
            $width,
            $height
        );

        $thumbFile = "thumb_{$image->imageIndex}.{$image->format}";
        if ($image->format === 'png') {
            imagepng($thumb, $thumbFile);
        } else {
            imagejpeg($thumb, $thumbFile, 85);
        }

        echo "Created thumbnail: $thumbFile\n";

        // No imagedestroy() calls: it is a no-op since PHP 8.0 and GdImage objects
        // are released automatically once they are no longer referenced.
    }
}
|
||||
```
|
||||
196
docs/snippets/php/extraction/metadata_extraction.php
Normal file
196
docs/snippets/php/extraction/metadata_extraction.php
Normal file
@@ -0,0 +1,196 @@
|
||||
```php title="metadata_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Metadata Extraction
|
||||
*
|
||||
* Extract and process document metadata including title, author,
|
||||
* creation date, keywords, and custom properties.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
$result = extract_file('document.pdf');
|
||||
$metadata = $result->metadata;
|
||||
|
||||
echo "Document Metadata:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Title: " . ($metadata->title ?? 'N/A') . "\n";
|
||||
echo "Authors: " . (isset($metadata->authors) ? implode(', ', $metadata->authors) : 'N/A') . "\n";
|
||||
echo "Subject: " . ($metadata->subject ?? 'N/A') . "\n";
|
||||
echo "Created By: " . ($metadata->createdBy ?? 'N/A') . "\n";
|
||||
echo "Producer: " . ($metadata->producer ?? 'N/A') . "\n";
|
||||
echo "Created: " . ($metadata->createdAt ?? 'N/A') . "\n";
|
||||
echo "Modified: " . ($metadata->modifiedAt ?? 'N/A') . "\n";
|
||||
echo "Page Count: " . ($metadata->pageCount ?? 'N/A') . "\n";
|
||||
echo "Keywords: " . implode(', ', $metadata->keywords ?? []) . "\n";
|
||||
echo "Language: " . ($metadata->language ?? 'N/A') . "\n\n";
|
||||
|
||||
// Collect one metadata summary row for every supported document in documents/.
//
// Each extension is globbed separately because GLOB_BRACE is not defined on every
// platform, and "?: []" turns a failed glob() (which returns false) into "no matches".
$files = [];
foreach (['pdf', 'docx', 'xlsx'] as $extension) {
    $files = array_merge($files, glob("documents/*.{$extension}") ?: []);
}
$metadataCollection = [];

foreach ($files as $file) {
    $result = extract_file($file);
    $metadataCollection[] = [
        'file' => basename($file),
        'title' => $result->metadata->title ?? 'Untitled',
        'author' => isset($result->metadata->authors) ? implode(', ', $result->metadata->authors) : 'Unknown',
        'created' => $result->metadata->createdAt ?? 'Unknown',
        'pages' => $result->metadata->pageCount ?? 0,
        // filesize() returns false on failure; record 0 bytes in that case.
        'size' => filesize($file) ?: 0,
    ];
}
|
||||
|
||||
echo "Metadata Collection:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
foreach ($metadataCollection as $meta) {
|
||||
echo "{$meta['file']}:\n";
|
||||
echo " Title: {$meta['title']}\n";
|
||||
echo " Author: {$meta['author']}\n";
|
||||
echo " Created: {$meta['created']}\n";
|
||||
echo " Pages: {$meta['pages']}\n";
|
||||
echo " Size: " . number_format($meta['size'] / 1024, 2) . " KB\n\n";
|
||||
}
|
||||
|
||||
/**
 * Select the entries whose 'author' field contains the given name.
 *
 * The match is a case-insensitive substring search; the keys of $collection are preserved.
 *
 * @param array  $collection Rows that each carry an 'author' string.
 * @param string $author     Name fragment to look for.
 *
 * @return array Matching rows, keyed as in $collection.
 */
function searchByAuthor(array $collection, string $author): array
{
    $matches = [];

    foreach ($collection as $key => $entry) {
        if (stripos($entry['author'], $author) !== false) {
            $matches[$key] = $entry;
        }
    }

    return $matches;
}
|
||||
|
||||
/**
 * Select the entries whose creation date lies inside an inclusive date range.
 *
 * Only the first ten characters of 'created' are compared, as plain strings, so
 * $start and $end are expected in the same YYYY-MM-DD form. Entries whose
 * 'created' value is 'Unknown' never match. Keys of $collection are preserved.
 *
 * @param array  $collection Rows that each carry a 'created' string.
 * @param string $start      First day of the range (inclusive).
 * @param string $end        Last day of the range (inclusive).
 *
 * @return array Matching rows, keyed as in $collection.
 */
function searchByDateRange(array $collection, string $start, string $end): array
{
    return array_filter(
        $collection,
        static fn ($entry) => $entry['created'] !== 'Unknown'
            && substr($entry['created'], 0, 10) >= $start
            && substr($entry['created'], 0, 10) <= $end
    );
}
|
||||
|
||||
$johnDocs = searchByAuthor($metadataCollection, 'John');
|
||||
echo "Documents by John: " . count($johnDocs) . "\n";
|
||||
|
||||
$recentDocs = searchByDateRange($metadataCollection, '2024-01-01', '2024-12-31');
|
||||
echo "Documents from 2024: " . count($recentDocs) . "\n\n";
|
||||
|
||||
/**
 * Render the metadata collection as a standalone HTML page with one table row per document.
 *
 * Every cell value is passed through htmlspecialchars() before it is embedded.
 *
 * @param array $collection Rows with 'file', 'title', 'author', 'created' and 'pages'.
 *
 * @return string The complete HTML document.
 */
function generateCatalog(array $collection): string
{
    $rows = '';

    foreach ($collection as $entry) {
        $cells = [
            htmlspecialchars($entry['file']),
            htmlspecialchars($entry['title']),
            htmlspecialchars($entry['author']),
            htmlspecialchars($entry['created']),
            // The page count is numeric, so it is cast before escaping.
            htmlspecialchars((string) $entry['pages']),
        ];

        $rows .= '<tr><td>' . implode('</td><td>', $cells) . "</td></tr>\n";
    }

    return "<html><head><title>Document Catalog</title></head><body>\n"
        . "<h1>Document Catalog</h1>\n"
        . "<table border='1'>\n"
        . "<tr><th>File</th><th>Title</th><th>Author</th><th>Created</th><th>Pages</th></tr>\n"
        . $rows
        . "</table>\n</body></html>";
}
|
||||
|
||||
$catalog = generateCatalog($metadataCollection);
|
||||
file_put_contents('catalog.html', $catalog);
|
||||
echo "Catalog saved to: catalog.html\n";
|
||||
|
||||
/**
 * Write the metadata collection to a CSV file, preceded by a header row.
 *
 * @param array  $collection Rows with 'file', 'title', 'author', 'created', 'pages' and 'size' (bytes).
 * @param string $filename   Destination path; it is opened in 'w' mode, so existing content is replaced.
 *
 * @throws \RuntimeException When the destination cannot be opened for writing.
 */
function exportMetadataToCSV(array $collection, string $filename): void
{
    // fopen() returns false on failure; fail with a clear message instead of
    // letting fputcsv() throw a TypeError on the invalid handle.
    $fp = fopen($filename, 'w');
    if ($fp === false) {
        throw new \RuntimeException("Unable to open {$filename} for writing");
    }

    try {
        fputcsv($fp, ['File', 'Title', 'Author', 'Created', 'Pages', 'Size (KB)']);

        foreach ($collection as $meta) {
            fputcsv($fp, [
                $meta['file'],
                $meta['title'],
                $meta['author'],
                $meta['created'],
                $meta['pages'],
                // No thousands separator: "2,000.00" would not be read back as a number.
                number_format($meta['size'] / 1024, 2, '.', ''),
            ]);
        }
    } finally {
        // Release the handle even when a write throws.
        fclose($fp);
    }
}
|
||||
|
||||
exportMetadataToCSV($metadataCollection, 'metadata.csv');
|
||||
echo "Metadata exported to: metadata.csv\n";
|
||||
|
||||
// Aggregate statistics over the whole metadata collection.
$documentCount = count($metadataCollection);
$totalPages = array_sum(array_column($metadataCollection, 'pages'));
$totalSize = array_sum(array_column($metadataCollection, 'size'));
$authors = array_unique(array_column($metadataCollection, 'author'));

// An empty collection would otherwise raise DivisionByZeroError in the average below.
$averagePages = $documentCount > 0 ? $totalPages / $documentCount : 0.0;

echo "\nCollection Statistics:\n";
echo str_repeat('=', 60) . "\n";
echo "Total documents: " . $documentCount . "\n";
echo "Total pages: " . number_format($totalPages) . "\n";
echo "Total size: " . number_format($totalSize / 1024 / 1024, 2) . " MB\n";
echo "Unique authors: " . count($authors) . "\n";
echo "Average pages per document: " . number_format($averagePages, 1) . "\n";
|
||||
|
||||
$byAuthor = [];
|
||||
foreach ($metadataCollection as $meta) {
|
||||
$author = $meta['author'];
|
||||
if (!isset($byAuthor[$author])) {
|
||||
$byAuthor[$author] = [];
|
||||
}
|
||||
$byAuthor[$author][] = $meta;
|
||||
}
|
||||
|
||||
echo "\nDocuments by Author:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
foreach ($byAuthor as $author => $docs) {
|
||||
echo "$author: " . count($docs) . " documents\n";
|
||||
}
|
||||
|
||||
/**
 * List the quality problems found in one metadata row.
 *
 * A field counts as missing when it is empty or still holds its placeholder
 * ('Untitled' for the title, 'Unknown' for author and creation date); a page
 * count of exactly 0 is reported as invalid.
 *
 * @param array $meta Row with 'title', 'author', 'created' and 'pages'.
 *
 * @return array Issue descriptions in check order; empty when nothing is wrong.
 */
function validateMetadata(array $meta): array
{
    // Issue label => whether that issue applies to this row.
    $checks = [
        'Missing title' => empty($meta['title']) || $meta['title'] === 'Untitled',
        'Missing author' => empty($meta['author']) || $meta['author'] === 'Unknown',
        'Missing creation date' => empty($meta['created']) || $meta['created'] === 'Unknown',
        'Invalid page count' => $meta['pages'] === 0,
    ];

    return array_keys(array_filter($checks));
}
|
||||
|
||||
echo "\nMetadata Quality Check:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
|
||||
$incomplete = 0;
|
||||
foreach ($metadataCollection as $meta) {
|
||||
$issues = validateMetadata($meta);
|
||||
if (!empty($issues)) {
|
||||
$incomplete++;
|
||||
echo "{$meta['file']}: " . implode(', ', $issues) . "\n";
|
||||
}
|
||||
}
|
||||
|
||||
echo "\nIncomplete metadata: $incomplete/" . count($metadataCollection) . " documents\n";
|
||||
```
|
||||
282
docs/snippets/php/extraction/multi_format.php
Normal file
282
docs/snippets/php/extraction/multi_format.php
Normal file
@@ -0,0 +1,282 @@
|
||||
```php title="multi_format.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Multi-Format Document Extraction
|
||||
*
|
||||
* Handle various document formats (PDF, DOCX, XLSX, PPTX, images, etc.)
|
||||
* with format-specific processing and unified output.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use function Kreuzberg\extract_file;
|
||||
use function Kreuzberg\detect_mime_type_from_path;
|
||||
|
||||
$formats = [
|
||||
'PDF' => 'document.pdf',
|
||||
'Word' => 'document.docx',
|
||||
'Excel' => 'spreadsheet.xlsx',
|
||||
'PowerPoint' => 'presentation.pptx',
|
||||
'Text' => 'readme.txt',
|
||||
'HTML' => 'page.html',
|
||||
'Markdown' => 'guide.md',
|
||||
'Image' => 'scan.png',
|
||||
];
|
||||
|
||||
echo "Multi-Format Extraction:\n";
|
||||
echo str_repeat('=', 60) . "\n\n";
|
||||
|
||||
$kreuzberg = new Kreuzberg();
|
||||
|
||||
foreach ($formats as $type => $file) {
|
||||
if (!file_exists($file)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
echo "Processing $type ($file):\n";
|
||||
|
||||
$mimeType = detect_mime_type_from_path($file);
|
||||
echo " MIME type: $mimeType\n";
|
||||
|
||||
$result = $kreuzberg->extractFile($file);
|
||||
|
||||
echo " Content length: " . strlen($result->content) . " chars\n";
|
||||
echo " Tables: " . count($result->tables) . "\n";
|
||||
echo " Images: " . count($result->images ?? []) . "\n";
|
||||
echo " Pages: " . ($result->metadata->pageCount ?? 'N/A') . "\n";
|
||||
echo "\n";
|
||||
}
|
||||
|
||||
$mixedFiles = glob('documents/*.*');
|
||||
$byFormat = [];
|
||||
|
||||
foreach ($mixedFiles as $file) {
|
||||
$mimeType = detect_mime_type_from_path($file);
|
||||
$extension = pathinfo($file, PATHINFO_EXTENSION);
|
||||
|
||||
if (!isset($byFormat[$extension])) {
|
||||
$byFormat[$extension] = [];
|
||||
}
|
||||
|
||||
$result = extract_file($file);
|
||||
$byFormat[$extension][] = [
|
||||
'file' => basename($file),
|
||||
'mime' => $mimeType,
|
||||
'size' => strlen($result->content),
|
||||
'tables' => count($result->tables),
|
||||
];
|
||||
}
|
||||
|
||||
echo "Files by Format:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
foreach ($byFormat as $ext => $files) {
|
||||
echo strtoupper($ext) . ": " . count($files) . " files\n";
|
||||
|
||||
$totalSize = array_sum(array_column($files, 'size'));
|
||||
$totalTables = array_sum(array_column($files, 'tables'));
|
||||
|
||||
echo " Total content: " . number_format($totalSize) . " chars\n";
|
||||
echo " Total tables: $totalTables\n\n";
|
||||
}
|
||||
|
||||
$formatConfigs = [
|
||||
'pdf' => new ExtractionConfig(
|
||||
extractTables: true,
|
||||
extractImages: true,
|
||||
pdf: new \Kreuzberg\Config\PdfConfig(
|
||||
extractImages: true,
|
||||
imageQuality: 85
|
||||
)
|
||||
),
|
||||
'docx' => new ExtractionConfig(
|
||||
extractTables: true,
|
||||
preserveFormatting: true
|
||||
),
|
||||
'xlsx' => new ExtractionConfig(
|
||||
extractTables: true
|
||||
),
|
||||
'png' => new ExtractionConfig(
|
||||
ocr: new \Kreuzberg\Config\OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng'
|
||||
)
|
||||
),
|
||||
];
|
||||
|
||||
foreach ($mixedFiles as $file) {
|
||||
$ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));
|
||||
|
||||
if (!isset($formatConfigs[$ext])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$config = $formatConfigs[$ext];
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile($file);
|
||||
|
||||
echo "Processed " . basename($file) . " with $ext config\n";
|
||||
}
|
||||
|
||||
/**
 * Convert a document into a single Markdown string.
 *
 * The output starts with a level-1 heading (the document title, or the file
 * name when the metadata has no title), followed by an optional authors
 * line, the extracted content and one "## Table N" section per table.
 *
 * @param string $inputFile Path of the document to convert.
 *
 * @return string The assembled Markdown document.
 */
function convertToMarkdown(string $inputFile): string
{
    $config = new ExtractionConfig(
        preserveFormatting: true,
        outputFormat: 'markdown',
        extractTables: true
    );

    $result = (new Kreuzberg($config))->extractFile($inputFile);
    $metadata = $result->metadata;

    $markdown = "# " . ($metadata->title ?? basename($inputFile)) . "\n\n";

    // empty() also rejects an empty author list, which isset() would let
    // through and render as a dangling "_Authors: _" line.
    if (!empty($metadata->authors)) {
        $markdown .= "_Authors: " . implode(', ', $metadata->authors) . "_\n\n";
    }

    $markdown .= $result->content . "\n\n";

    foreach ($result->tables as $index => $table) {
        $markdown .= "## Table " . ($index + 1) . "\n\n";
        $markdown .= $table->markdown . "\n\n";
    }

    return $markdown;
}
|
||||
|
||||
echo "\nConverting to Markdown:\n";
echo str_repeat('=', 60) . "\n";

foreach (['document.pdf', 'document.docx'] as $file) {
    if (!file_exists($file)) {
        continue;
    }

    $markdown = convertToMarkdown($file);

    // Keep the source extension in the output name. Both inputs share the
    // stem "document", so stem-only naming would make the second conversion
    // overwrite the first.
    $outputFile = basename($file) . '.md';

    // file_put_contents() returns false on failure; do not report success then.
    if (file_put_contents($outputFile, $markdown) === false) {
        echo "Failed to write: $outputFile\n";
        continue;
    }

    echo "Converted: $file -> $outputFile\n";
}
|
||||
|
||||
/**
 * Run extraction on an archive file and summarise the outcome.
 *
 * @param string $archiveFile Path to the archive.
 *
 * @return array Map with 'archive' (file name), 'listing' (the extraction
 *               result's content) and 'mime' (the result's MIME type).
 */
function extractFromArchive(string $archiveFile): array
{
    $extraction = extract_file($archiveFile);

    $summary = [];
    $summary['archive'] = basename($archiveFile);
    $summary['listing'] = $extraction->content;
    $summary['mime'] = $extraction->mimeType;

    return $summary;
}
|
||||
|
||||
/**
 * Dispatches a file to a format-specific extraction routine based on the
 * MIME type detected from its path, and returns a summary array.
 */
class UniversalExtractor
{
    /** Shared default-configured client used by handlers that need no special options. */
    private Kreuzberg $kreuzberg;

    public function __construct()
    {
        $this->kreuzberg = new Kreuzberg();
    }

    /**
     * Extract a file with the handler registered for its MIME type.
     *
     * @param string $file Path of the file to extract.
     *
     * @return array Handler-specific summary; always has 'type' and 'content'.
     */
    public function extract(string $file): array
    {
        $mimeType = detect_mime_type_from_path($file);

        // Unknown MIME types fall through to the generic handler.
        return match ($mimeType) {
            'application/pdf'
                => $this->handlePDF($file, $mimeType),
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                => $this->handleDOCX($file, $mimeType),
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
                => $this->handleXLSX($file, $mimeType),
            'image/png', 'image/jpeg'
                => $this->handleImage($file, $mimeType),
            default
                => $this->handleGeneric($file, $mimeType),
        };
    }

    /** PDF: extract with tables and images enabled; report counts and page count. */
    private function handlePDF(string $file, string $mimeType): array
    {
        $pdfConfig = new ExtractionConfig(extractTables: true, extractImages: true);
        $extraction = (new Kreuzberg($pdfConfig))->extractFile($file);

        return [
            'type' => 'PDF',
            'content' => $extraction->content,
            'tables' => count($extraction->tables),
            'images' => count($extraction->images ?? []),
            'pages' => $extraction->metadata->pageCount,
        ];
    }

    /** Word document: extract with the shared client; report table count and authors. */
    private function handleDOCX(string $file, string $mimeType): array
    {
        $extraction = $this->kreuzberg->extractFile($file);

        return [
            'type' => 'Word Document',
            'content' => $extraction->content,
            'tables' => count($extraction->tables),
            'authors' => $extraction->metadata->authors,
        ];
    }

    /** Excel spreadsheet: extract with tables enabled; the table count is reported as 'sheets'. */
    private function handleXLSX(string $file, string $mimeType): array
    {
        $sheetConfig = new ExtractionConfig(extractTables: true);
        $extraction = (new Kreuzberg($sheetConfig))->extractFile($file);

        return [
            'type' => 'Excel Spreadsheet',
            'content' => $extraction->content,
            'sheets' => count($extraction->tables),
        ];
    }

    /** Image: extract with a Tesseract/English OCR config; report the text length. */
    private function handleImage(string $file, string $mimeType): array
    {
        $ocrConfig = new ExtractionConfig(
            ocr: new \Kreuzberg\Config\OcrConfig(backend: 'tesseract', language: 'eng')
        );
        $extraction = (new Kreuzberg($ocrConfig))->extractFile($file);

        return [
            'type' => 'Image (OCR)',
            'content' => $extraction->content,
            'ocr_length' => strlen($extraction->content),
        ];
    }

    /** Fallback: extract with the shared client and echo back the detected MIME type. */
    private function handleGeneric(string $file, string $mimeType): array
    {
        $extraction = $this->kreuzberg->extractFile($file);

        return [
            'type' => 'Generic',
            'mime' => $mimeType,
            'content' => $extraction->content,
        ];
    }
}
|
||||
|
||||
$extractor = new UniversalExtractor();

echo "\nUniversal Extraction:\n", str_repeat('=', 60), "\n";

foreach ($mixedFiles as $file) {
    $data = $extractor->extract($file);

    echo basename($file), " ({$data['type']}):\n";
    // Print everything except the (potentially very long) content field.
    print_r(array_diff_key($data, ['content' => true]));
    echo "\n";
}
|
||||
```
|
||||
114
docs/snippets/php/extraction/pdf_extraction.php
Normal file
114
docs/snippets/php/extraction/pdf_extraction.php
Normal file
@@ -0,0 +1,114 @@
|
||||
```php title="pdf_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* PDF Document Extraction
|
||||
*
|
||||
* Extract text, tables, and images from PDF files with various configurations.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\PdfConfig;
|
||||
|
||||
// Default-configured extraction of a single PDF, followed by a short report.
$kreuzberg = new Kreuzberg();
$result = $kreuzberg->extractFile('document.pdf');

echo "PDF Extraction Results:\n", str_repeat('=', 60), "\n";
echo "Content length: ", strlen($result->content), " characters\n";
echo "Tables found: ", count($result->tables), "\n";
echo "Pages: ", $result->metadata->pageCount ?? 'unknown', "\n\n";
|
||||
|
||||
// Extract with table and image support enabled, then print each table and
// save its cells as a CSV file.
$config = new ExtractionConfig(
    extractImages: true,
    extractTables: true,
    pdf: new PdfConfig(
        extractImages: true,
        imageQuality: 85
    )
);

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('report.pdf');

echo "Extracted Tables:\n";
echo str_repeat('=', 60) . "\n";
foreach ($result->tables as $index => $table) {
    echo "Table " . ($index + 1) . " (Page {$table->pageNumber}):\n";
    echo "Rows: " . count($table->cells) . "\n";
    // count() always returns an int, so no further fallback is needed.
    echo "Columns: " . count($table->cells[0] ?? []) . "\n\n";

    echo "Markdown format:\n";
    echo $table->markdown . "\n\n";

    $csvFile = "table_{$index}.csv";
    $fp = fopen($csvFile, 'w');
    if ($fp === false) {
        // fopen() returns false on failure; handing that to fputcsv() would
        // raise a TypeError, so report the problem and move on.
        echo "Could not open $csvFile for writing\n\n";
        continue;
    }

    try {
        foreach ($table->cells as $row) {
            fputcsv($fp, $row);
        }
    } finally {
        // Release the handle even if writing a row throws.
        fclose($fp);
    }

    echo "Saved to: $csvFile\n\n";
}
|
||||
|
||||
echo "Extracted Images:\n", str_repeat('=', 60), "\n";

// Write every extracted image to disk, named after its page and index.
foreach ($result->images ?? [] as $image) {
    $filename = "page_" . sprintf('%d', $image->pageNumber)
        . "_image_" . sprintf('%d', $image->imageIndex)
        . "." . sprintf('%s', $image->format);

    file_put_contents($filename, $image->data);

    echo "Saved: $filename\n";
    echo " Size: {$image->width}x{$image->height}\n";
    echo " Format: {$image->format}\n";
    echo " Data size: ", strlen($image->data), " bytes\n\n";
}
|
||||
|
||||
// Extract with formatting preserved and Markdown output, then save the result.
$formattedConfig = new ExtractionConfig(preserveFormatting: true, outputFormat: 'markdown');

$kreuzberg = new Kreuzberg($formattedConfig);
$result = $kreuzberg->extractFile('formatted.pdf');

file_put_contents('output.md', $result->content);
echo "Saved formatted output to: ", "output.md", "\n";
|
||||
|
||||
// Split the extracted Markdown into sections keyed by heading text.
$result = $kreuzberg->extractFile('document.pdf');
$content = $result->content;

$sections = [];
$lines = explode("\n", $content);
$currentSection = null;
$currentContent = [];

foreach ($lines as $line) {
    // A line made of one or more '#' followed by whitespace starts a new section.
    if (preg_match('/^#+\s+(.+)$/', $line, $matches) === 1) {
        if ($currentSection !== null) {
            $sections[$currentSection] = implode("\n", $currentContent);
        }
        $currentSection = $matches[1];
        // Lines seen before the first heading are discarded here.
        $currentContent = [];
    } else {
        $currentContent[] = $line;
    }
}

// Flush the section that was still open when the input ended.
if ($currentSection !== null) {
    $sections[$currentSection] = implode("\n", $currentContent);
}

echo "\nDocument sections:\n";
// Use a dedicated loop variable so the full document text in $content is
// not overwritten by the last section body.
foreach ($sections as $title => $sectionBody) {
    echo " - $title (" . strlen($sectionBody) . " chars)\n";
}
|
||||
```
|
||||
195
docs/snippets/php/extraction/powerpoint_extraction.php
Normal file
195
docs/snippets/php/extraction/powerpoint_extraction.php
Normal file
@@ -0,0 +1,195 @@
|
||||
```php title="powerpoint_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* PowerPoint Presentation Extraction
|
||||
*
|
||||
* This example demonstrates extracting content from PowerPoint files (.pptx, .ppt),
|
||||
* including text, notes, images, and tables from slides.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\ImageExtractionConfig;
|
||||
use Kreuzberg\Config\PageConfig;
|
||||
|
||||
echo "Example 1: Basic PowerPoint Extraction\n", "======================================\n";

$kreuzberg = new Kreuzberg();
$result = $kreuzberg->extractFile('presentation.pptx');

echo "Content:\n", $result->content, "\n\n";

// Missing metadata fields are shown as "N/A".
$authorLine = 'N/A';
if (isset($result->metadata->authors)) {
    $authorLine = implode(', ', $result->metadata->authors);
}

echo "Metadata:\n";
echo "- Title: ", $result->metadata->title ?? 'N/A', "\n";
echo "- Author: ", $authorLine, "\n";
echo "- Slide Count: ", $result->metadata->pageCount ?? 'N/A', "\n\n";
|
||||
|
||||
echo "Example 2: Extract Per-Slide Content\n", "====================================\n";

// Request per-page results and have a marker inserted for each slide.
$slidePaging = new PageConfig(
    extractPages: true,
    insertPageMarkers: true,
    markerFormat: '--- Slide {page_number} ---'
);
$config2 = new ExtractionConfig(page: $slidePaging);

$result2 = (new Kreuzberg($config2))->extractFile('presentation.pptx');

if ($result2->pages !== null) {
    echo "Total slides: ", count($result2->pages), "\n\n";

    foreach ($result2->pages as $page) {
        $preview = substr($page->content, 0, 100);

        echo "Slide {$page->pageNumber}:\n";
        echo "- Text length: ", strlen($page->content), " characters\n";
        echo "- Tables: ", count($page->tables), "\n";
        echo "- Images: ", count($page->images), "\n";
        echo "- Content preview: ", $preview, "...\n\n";
    }
}
|
||||
|
||||
echo "Example 3: Extract Images from Slides\n", "=====================================\n";

// Extract images with a minimum width and height of 100.
$imageOptions = new ImageExtractionConfig(extractImages: true, minWidth: 100, minHeight: 100);
$config3 = new ExtractionConfig(imageExtraction: $imageOptions);

$result3 = (new Kreuzberg($config3))->extractFile('presentation.pptx');

if ($result3->images !== null) {
    echo "Total images: ", count($result3->images), "\n\n";

    foreach ($result3->images as $i => $image) {
        echo "Image {$i}:\n";
        echo "- Format: {$image->format}\n";
        echo "- Size: {$image->width}x{$image->height}\n";
        echo "- Slide: {$image->pageNumber}\n";

        // The image payload is base64-decoded before being written to disk.
        $filename = "slide_{$image->pageNumber}_image_{$i}.{$image->format}";
        $binary = base64_decode($image->data);
        file_put_contents($filename, $binary);
        echo "- Saved: {$filename}\n\n";
    }
}
|
||||
|
||||
echo "Example 4: Extract Tables from Slides\n", "=====================================\n";

$config4 = new ExtractionConfig(extractTables: true);
$result4 = (new Kreuzberg($config4))->extractFile('data_presentation.pptx');

// Only print the table report when at least one table was found.
$tableCount = count($result4->tables);
if ($tableCount > 0) {
    echo "Found ", $tableCount, " table(s)\n\n";

    foreach ($result4->tables as $i => $table) {
        echo "Table ", $i + 1, " (Slide {$table->pageNumber}):\n";
        echo $table->markdown, "\n\n";
    }
}
|
||||
|
||||
echo "Example 5: Convert PowerPoint to Markdown\n";
echo "=========================================\n";

$config5 = new ExtractionConfig(
    page: new PageConfig(
        extractPages: true,
        insertPageMarkers: true,
        // Double quotes are required here: in a single-quoted PHP string
        // "\n" is a literal backslash followed by "n", not a line break.
        // "{page_number}" is not interpolated because no "$" follows "{".
        markerFormat: "---\n\n## Slide {page_number}\n\n"
    ),
    outputFormat: 'markdown'
);

$result5 = (new Kreuzberg($config5))->extractFile('presentation.pptx');

$markdownContent = $result5->content;
file_put_contents('presentation.md', $markdownContent);

echo "Converted to Markdown\n";
echo "Saved as: presentation.md\n";
echo "Content preview:\n";
echo substr($markdownContent, 0, 500) . "...\n\n";
|
||||
|
||||
echo "Example 6: Generate Presentation Summary\n", "========================================\n";

$config6 = new ExtractionConfig(page: new PageConfig(extractPages: true));
$result6 = (new Kreuzberg($config6))->extractFile('meeting_deck.pptx');

// Fall back to placeholders / derived values when metadata is missing.
$summaryAuthors = 'Unknown';
if (isset($result6->metadata->authors)) {
    $summaryAuthors = implode(', ', $result6->metadata->authors);
}
$slideTotal = $result6->metadata->pageCount ?? count($result6->pages ?? []);

echo "Presentation Summary:\n";
echo "====================\n";
echo "Title: ", $result6->metadata->title ?? 'Untitled', "\n";
echo "Author: ", $summaryAuthors, "\n";
echo "Total Slides: ", $slideTotal, "\n";
echo "Total Text: ", strlen($result6->content), " characters\n";
echo "Tables: ", count($result6->tables), "\n";

if ($result6->pages !== null) {
    echo "\nSlide Breakdown:\n";
    foreach ($result6->pages as $page) {
        $wordCount = str_word_count($page->content);
        $slideTables = count($page->tables);
        echo "- Slide {$page->pageNumber}: {$wordCount} words, {$slideTables} tables\n";
    }
}

echo "\n";
|
||||
|
||||
echo "Example 7: Search Content in Slides\n";
echo "===================================\n";

$config7 = new ExtractionConfig(
    page: new PageConfig(extractPages: true)
);

$result7 = (new Kreuzberg($config7))->extractFile('presentation.pptx');

$searchTerm = "revenue";

if ($result7->pages !== null) {
    echo "Searching for '{$searchTerm}':\n\n";

    foreach ($result7->pages as $page) {
        // Locate the term once and reuse the offset for the context window.
        // The multibyte functions work on characters rather than bytes, so
        // the excerpt cannot start or end in the middle of a UTF-8 character.
        $pos = mb_stripos($page->content, $searchTerm);
        if ($pos === false) {
            continue;
        }

        echo "Found in Slide {$page->pageNumber}:\n";

        // Up to 50 characters before the match, 150 characters in total.
        $context = mb_substr($page->content, max(0, $pos - 50), 150);
        echo "- Context: ...{$context}...\n\n";
    }
}
|
||||
|
||||
// Static reference output: supported formats followed by numbered tips.
echo "\nSupported PowerPoint Formats:\n";
echo "=============================\n";
foreach (
    [
        '.pptx (PowerPoint 2007+)',
        '.ppt (PowerPoint 97-2003)',
        '.pptm (Macro-enabled)',
        '.potx (Template)',
    ] as $formatLine
) {
    echo "- {$formatLine}\n";
}

echo "\n\nBest Practices:\n";
echo "===============\n";
foreach (
    [
        'Use page extraction to process individual slides',
        'Extract images for visual content analysis',
        'Extract tables for data analysis',
        'Use metadata for presentation information',
        'Convert to Markdown for documentation',
        'Search across slides for specific content',
        'Generate summaries for presentation overviews',
    ] as $tipIndex => $tipText
) {
    echo ($tipIndex + 1) . ". {$tipText}\n";
}
|
||||
```
|
||||
217
docs/snippets/php/extraction/table_extraction.php
Normal file
217
docs/snippets/php/extraction/table_extraction.php
Normal file
@@ -0,0 +1,217 @@
|
||||
```php title="table_extraction.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Table Extraction and Processing
|
||||
*
|
||||
* Extract tables from PDFs and other documents, process them,
|
||||
* and export to various formats (CSV, JSON, HTML).
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Config\TesseractConfig;
|
||||
|
||||
// Extract tables and print each one as Markdown, as dimensions, and as HTML.
$config = new ExtractionConfig(extractTables: true);
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('financial_report.pdf');

echo "Table Extraction:\n", str_repeat('=', 60), "\n";
echo "Tables found: ", count($result->tables), "\n\n";

foreach ($result->tables as $index => $table) {
    echo "Table ", $index + 1, " (Page {$table->pageNumber}):\n";
    echo str_repeat('-', 60), "\n";

    echo "Markdown:\n", $table->markdown, "\n\n";

    echo "Array format:\n";
    echo "Rows: ", count($table->cells), "\n";
    echo "Columns: ", count($table->cells[0] ?? []), "\n\n";

    echo "HTML:\n";
    echo "<table>\n";
    foreach ($table->cells as $rowIndex => $row) {
        // The first row is rendered as header cells, all others as data cells.
        $tag = 'td';
        if ($rowIndex === 0) {
            $tag = 'th';
        }

        echo " <tr>\n";
        foreach ($row as $cell) {
            echo " <$tag>", htmlspecialchars($cell), "</$tag>\n";
        }
        echo " </tr>\n";
    }
    echo "</table>\n\n";
}
|
||||
|
||||
// Export every table to its own CSV file, named after its position and page.
foreach ($result->tables as $index => $table) {
    $filename = "table_" . ($index + 1) . "_page_" . $table->pageNumber . ".csv";
    $fp = fopen($filename, 'w');
    if ($fp === false) {
        // fopen() returns false on failure; handing that to fputcsv() would
        // raise a TypeError, so report the problem and skip this table.
        echo "Could not open $filename for writing\n";
        continue;
    }

    try {
        foreach ($table->cells as $row) {
            fputcsv($fp, $row);
        }
    } finally {
        // Release the handle even if writing a row throws.
        fclose($fp);
    }

    echo "Exported to: $filename\n";
}
echo "\n";
|
||||
|
||||
// Table extraction through Tesseract OCR with table detection enabled.
$tesseractOptions = new TesseractConfig(enableTableDetection: true, psm: 6);
$ocrOptions = new OcrConfig(
    backend: 'tesseract',
    language: 'eng',
    tesseractConfig: $tesseractOptions
);
$ocrConfig = new ExtractionConfig(extractTables: true, ocr: $ocrOptions);

$kreuzberg = new Kreuzberg($ocrConfig);
$result = $kreuzberg->extractFile('scanned_table.pdf');

echo "OCR Table Extraction:\n", str_repeat('=', 60), "\n";
echo "Tables with OCR: ", count($result->tables), "\n\n";
|
||||
|
||||
/**
 * Turn a table given as rows of cells into a list of header-keyed records.
 *
 * The first row supplies the keys. Every following row becomes one
 * associative array; cells missing from a short row default to ''.
 *
 * @param array $cells Table rows, header row first.
 *
 * @return array List of records, one per data row.
 */
function processTable(array $cells): array
{
    $headerRow = array_shift($cells);

    $toRecord = static function ($dataRow) use ($headerRow): array {
        $record = [];
        foreach ($headerRow as $position => $columnName) {
            $record[$columnName] = $dataRow[$position] ?? '';
        }

        return $record;
    };

    // array_values() keeps the result a plain 0-based list.
    return array_values(array_map($toRecord, $cells));
}
|
||||
|
||||
// Print every extracted table as pretty-printed JSON records.
foreach ($result->tables as $table) {
    $structured = processTable($table->cells);

    echo "Structured table data:\n", json_encode($structured, JSON_PRETTY_PRINT), "\n\n";
}
|
||||
|
||||
/**
 * Select the tables in which at least one cell contains the keyword.
 *
 * Matching is a case-insensitive substring search over every cell.
 *
 * @param array  $tables  Table objects exposing a `cells` property (rows of cells).
 * @param string $keyword Text to look for.
 *
 * @return array List of matching tables, in their original order.
 */
function findTablesWithKeyword(array $tables, string $keyword): array
{
    $mentionsKeyword = static function ($table) use ($keyword): bool {
        foreach ($table->cells as $cellsInRow) {
            foreach ($cellsInRow as $cellText) {
                if (stripos($cellText, $keyword) !== false) {
                    // One hit is enough; stop scanning this table.
                    return true;
                }
            }
        }

        return false;
    };

    // array_filter() preserves keys; re-index to return a plain list.
    return array_values(array_filter($tables, $mentionsKeyword));
}
|
||||
|
||||
// Count the tables that mention "sales" anywhere in their cells.
$salesTables = findTablesWithKeyword($result->tables, 'sales');
$salesTableCount = count($salesTables);
echo "Tables containing 'sales': {$salesTableCount}\n";
|
||||
|
||||
/**
 * Convert a table into a list of records keyed by the header row.
 *
 * The first row of `$table->cells` supplies the keys. Cells missing from a
 * short row are filled with null. A table without cells yields [].
 *
 * @param \Kreuzberg\Types\Table $table Table whose cells are converted.
 *
 * @return array List of associative arrays, one per data row.
 */
function tableToAssociativeArray(\Kreuzberg\Types\Table $table): array
{
    $rows = $table->cells;
    if (empty($rows)) {
        return [];
    }

    $columnNames = array_shift($rows);

    $toRecord = static function ($values) use ($columnNames): array {
        $record = [];
        foreach ($columnNames as $position => $columnName) {
            $record[$columnName] = $values[$position] ?? null;
        }

        return $record;
    };

    // array_values() keeps the result a plain 0-based list.
    return array_values(array_map($toRecord, $rows));
}
|
||||
|
||||
$result = $kreuzberg->extractFile('quarterly_report.pdf');

// For every table, sum each column's numeric cells and print the totals.
foreach ($result->tables as $index => $table) {
    $data = tableToAssociativeArray($table);

    echo "\nTable ", $index + 1, " data:\n";

    $totals = [];
    foreach ($data as $row) {
        foreach ($row as $key => $value) {
            if (!is_numeric($value)) {
                continue;
            }
            // Start at 0 the first time a column contributes a number.
            $totals[$key] = ($totals[$key] ?? 0) + floatval($value);
        }
    }

    if ($totals !== []) {
        echo "Column totals:\n";
        foreach ($totals as $column => $total) {
            echo " $column: ", number_format($total, 2), "\n";
        }
    }
}
|
||||
|
||||
// Build a JSON-serialisable summary of every table and write it to disk.
$allTablesJson = array_map(function ($table) {
    return [
        'page' => $table->pageNumber,
        'rows' => count($table->cells),
        'columns' => count($table->cells[0] ?? []),
        'data' => tableToAssociativeArray($table),
        'markdown' => $table->markdown,
    ];
}, $result->tables);

// JSON_THROW_ON_ERROR makes an encoding failure (e.g. invalid UTF-8 in
// extracted text) raise a JsonException. Without it json_encode() returns
// false, an empty tables.json is written, and success is still reported.
$encodedTables = json_encode($allTablesJson, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR);

file_put_contents('tables.json', $encodedTables);
echo "\nAll tables exported to: tables.json\n";
|
||||
|
||||
/**
 * Merge the data rows of several tables that share the same header row.
 *
 * The header row is taken from the first table in the array, whatever its
 * key, so results of array_filter() and other non-zero-indexed arrays work.
 * The first row of every table is treated as its header and left out of
 * the merged data.
 *
 * @param array $tables Table objects exposing a `cells` property (rows of cells).
 *
 * @return array [] when no tables are given; otherwise
 *               ['headers' => array, 'data' => list of rows].
 */
function mergeTables(array $tables): array
{
    if (empty($tables)) {
        return [];
    }

    // reset() returns the first element regardless of its key. $tables is a
    // by-value copy, so moving its internal pointer does not affect callers.
    $firstTable = reset($tables);
    $headers = $firstTable->cells[0] ?? [];

    $merged = [];
    foreach ($tables as $table) {
        // Skip each table's header row and append the remaining rows.
        foreach (array_slice($table->cells, 1) as $row) {
            $merged[] = $row;
        }
    }

    return ['headers' => $headers, 'data' => $merged];
}
|
||||
|
||||
// Merge all tables that mention "Quarter" and report the combined size.
$reportTables = findTablesWithKeyword($result->tables, 'Quarter');
if ($reportTables !== []) {
    $merged = mergeTables($reportTables);

    echo "\nMerged ", count($reportTables), " tables\n";
    echo "Total rows: ", count($merged['data']), "\n";
}
|
||||
```
|
||||
13
docs/snippets/php/getting-started/basic_usage.md
Normal file
13
docs/snippets/php/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// Extract a document with the default configuration and print its content.
$config = new ExtractionConfig();
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

echo "Content:\n", $result->getContent();
|
||||
```
|
||||
14
docs/snippets/php/getting-started/extract_file.md
Normal file
14
docs/snippets/php/getting-started/extract_file.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// Extract a document and print its content, MIME type and table count.
$config = new ExtractionConfig();
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

echo "Content: ", $result->getContent(), "\n";
echo "MIME Type: ", $result->getMimeType(), "\n";
echo "Tables: ", count($result->getTables()), "\n";
|
||||
```
|
||||
25
docs/snippets/php/getting-started/extract_with_ocr.md
Normal file
25
docs/snippets/php/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\OcrConfig;
|
||||
|
||||
// Configure the Tesseract backend for English text.
$ocrConfig = new OcrConfig();
$ocrConfig->setBackend('tesseract');
$ocrConfig->setLanguage('eng');

// Force OCR on the extraction and attach the OCR settings.
$config = new ExtractionConfig();
$config->setForceOcr(true);
$config->setOcr($ocrConfig);

$result = Kreuzberg::extractFileSync('scanned.pdf', null, $config);

echo "Content:\n", $result->getContent();

// Detected languages are only printed when the result provides them.
$detectedLanguages = $result->getDetectedLanguages();
if ($detectedLanguages !== null) {
    echo "Detected Languages: ", implode(', ', $detectedLanguages), "\n";
}
|
||||
```
|
||||
9
docs/snippets/php/getting-started/hello_world.md
Normal file
9
docs/snippets/php/getting-started/hello_world.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
// Extract with no MIME type hint and no configuration, then greet with the
// first 50 bytes of the content.
$result = Kreuzberg::extractFileSync('document.pdf', null, null);
$greeting = substr($result->getContent(), 0, 50);
echo "Hello, {$greeting}\n";
|
||||
```
|
||||
10
docs/snippets/php/getting-started/install_verify.md
Normal file
10
docs/snippets/php/getting-started/install_verify.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
// Report whether the native kreuzberg extension is loaded.
echo extension_loaded('kreuzberg')
    ? "Kreuzberg extension loaded successfully.\n"
    : "Kreuzberg extension not loaded.\n";
|
||||
```
|
||||
24
docs/snippets/php/getting-started/read_content.md
Normal file
24
docs/snippets/php/getting-started/read_content.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
|
||||
// Enable chunking with its default settings and extract the document.
$config = new ExtractionConfig();
$config->setChunking(new ChunkingConfig());
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

echo "Total content length: ", strlen($result->getContent()), "\n";

// getChunks() may return null; treat that as "no chunks".
foreach ($result->getChunks() ?? [] as $chunk) {
    echo "Chunk: ", $chunk->getContent(), "\n";
}

foreach ($result->getTables() as $table) {
    $rowCount = count($table->getRows());
    echo "Table with {$rowCount} rows\n";
}
|
||||
```
|
||||
30
docs/snippets/php/installation/composer_install.php
Normal file
30
docs/snippets/php/installation/composer_install.php
Normal file
@@ -0,0 +1,30 @@
|
||||
```php title="composer_install.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Installing Kreuzberg via Composer
|
||||
*
|
||||
* This snippet shows how to install the Kreuzberg PHP package using Composer.
|
||||
* The package provides the object-oriented and procedural APIs, while the
|
||||
* native extension (kreuzberg.so/.dll) must be installed separately.
|
||||
*/
|
||||
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
// The Composer package only works once the native extension is loaded,
// so stop with an error code when it is missing.
$extensionLoaded = extension_loaded('kreuzberg');
if ($extensionLoaded === false) {
    echo "Error: kreuzberg extension is not loaded\n",
        "Please add 'extension=kreuzberg.so' (or .dll on Windows) to your php.ini\n";
    exit(1);
}

echo "Kreuzberg extension is loaded successfully!\n";
echo "Version: ", Kreuzberg::version(), "\n";

// Constructing the client confirms the package itself is usable.
$kreuzberg = new Kreuzberg();
echo "Kreuzberg client initialized successfully!\n";
|
||||
```
|
||||
50
docs/snippets/php/installation/extension_setup.php
Normal file
50
docs/snippets/php/installation/extension_setup.php
Normal file
@@ -0,0 +1,50 @@
|
||||
```php title="extension_setup.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Setting up the Kreuzberg PHP Extension
|
||||
*
|
||||
* The Kreuzberg native extension must be installed and loaded before using the library.
|
||||
* This snippet shows how to check for the extension and provides guidance for installation.
|
||||
*/
|
||||
|
||||
// Without the extension nothing else can work: print setup steps and stop.
if (!extension_loaded('kreuzberg')) {
    echo "Kreuzberg extension not found!\n\n",
        "Installation steps:\n",
        "1. Download the extension for your platform from:\n",
        " https://github.com/kreuzberg-dev/kreuzberg/releases\n\n",
        "2. Copy the extension to your PHP extensions directory:\n",
        " - Linux/macOS: kreuzberg.so\n",
        " - Windows: kreuzberg.dll\n\n",
        "3. Add to your php.ini:\n",
        " extension=kreuzberg.so ; Linux/macOS\n",
        " extension=kreuzberg.dll ; Windows\n\n",
        "4. Restart PHP/PHP-FPM/Apache\n\n",
        "5. Verify with: php -m | grep kreuzberg\n";
    exit(1);
}

echo "Kreuzberg Extension Information:\n";
echo "================================\n";
echo "Status: Loaded\n";

// Each probe function is only called when it exists; a missing probe
// counts as "not available".
$tesseract_available = false;
if (function_exists('kreuzberg_has_tesseract')) {
    $tesseract_available = kreuzberg_has_tesseract();
}

$onnx_available = false;
if (function_exists('kreuzberg_has_onnx')) {
    $onnx_available = kreuzberg_has_onnx();
}

$tesseractStatus = $tesseract_available ? "Available" : "Not available";
$onnxStatus = $onnx_available ? "Available" : "Not available";

echo "Tesseract OCR: {$tesseractStatus}\n";
echo "ONNX Runtime: {$onnxStatus}\n";

if (!$tesseract_available) {
    echo "\nTo enable OCR functionality, install Tesseract:\n",
        " macOS: brew install tesseract\n",
        " Ubuntu/Debian: sudo apt install tesseract-ocr\n";
}

if (!$onnx_available) {
    echo "\nTo enable embeddings, install ONNX Runtime:\n",
        " macOS: brew install onnxruntime\n",
        " Ubuntu/Debian: sudo apt install libonnxruntime\n";
}
|
||||
```
|
||||
86
docs/snippets/php/installation/pie_install.php
Normal file
86
docs/snippets/php/installation/pie_install.php
Normal file
@@ -0,0 +1,86 @@
|
||||
```php title="pie_install.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Installing Kreuzberg PHP Extension using PIE
|
||||
*
|
||||
* PIE (PHP Installer for Extensions) is a modern tool for installing PHP extensions.
|
||||
* This snippet shows how to install the Kreuzberg extension using PIE.
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
echo "Kreuzberg Extension Installation Check\n";
echo "========================================\n\n";

if (extension_loaded('kreuzberg')) {
    echo "✓ Kreuzberg extension is loaded\n";
    echo " Version: " . Kreuzberg::version() . "\n\n";

    // Capture the phpinfo() module listing instead of printing it, then
    // check whether the extension is mentioned in it.
    ob_start();
    phpinfo(INFO_MODULES);
    $phpinfo = ob_get_clean();

    if (preg_match('/kreuzberg/i', $phpinfo)) {
        echo "✓ Extension info available via phpinfo()\n\n";
    }

    // Constructing the client is the final check; report any exception.
    try {
        $kreuzberg = new Kreuzberg();
        echo "✓ Kreuzberg client initialized successfully\n\n";

        echo "Installation complete!\n";
        echo "You can now use Kreuzberg in your PHP applications.\n";
    } catch (Exception $e) {
        echo "✗ Error initializing Kreuzberg: {$e->getMessage()}\n";
    }
} else {
    echo "✗ Kreuzberg extension is not loaded\n\n";

    echo "Troubleshooting:\n";
    echo "================\n";
    echo "1. Make sure PIE installation completed successfully\n";
    echo "2. Check that extension is enabled in php.ini\n";
    echo "3. Restart your web server/PHP-FPM\n";
    echo "4. Run: php -m | grep kreuzberg\n";
    echo "5. Check error logs for loading issues\n\n";

    echo "Manual Installation:\n";
    echo "===================\n";
    echo "If PIE installation fails, try manual installation:\n";
    echo "1. Download extension from GitHub releases\n";
    echo "2. Copy .so/.dll file to PHP extension directory\n";
    echo "3. Add 'extension=kreuzberg.so' to php.ini\n";
    echo "4. Restart PHP\n";
}

echo "\n\nPIE Commands Reference:\n";
echo "=======================\n";
echo "Install extension: pie install kreuzberg/kreuzberg-ext\n";
echo "Install specific version: pie install kreuzberg/kreuzberg-ext:4.2.7\n";
echo "List installed: pie list\n";
echo "Update extension: pie update kreuzberg/kreuzberg-ext\n";
echo "Uninstall: pie uninstall kreuzberg/kreuzberg-ext\n";
echo "Show info: pie info kreuzberg/kreuzberg-ext\n";

echo "\n\nNext Steps:\n";
echo "===========\n";
echo "1. Install Composer package: composer require kreuzberg/kreuzberg\n";
echo "2. Install optional dependencies:\n";
echo " - Tesseract OCR: brew install tesseract (macOS) or apt install tesseract-ocr (Linux)\n";
echo " - ONNX Runtime: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux)\n";
echo "3. Start extracting documents!\n";
|
||||
```
|
||||
81
docs/snippets/php/installation/requirements_check.php
Normal file
81
docs/snippets/php/installation/requirements_check.php
Normal file
@@ -0,0 +1,81 @@
|
||||
```php title="requirements_check.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* System Requirements Check
|
||||
*
|
||||
* Verify that your system meets all requirements for running Kreuzberg.
|
||||
*/
|
||||
|
||||
// Report header.
echo "Kreuzberg System Requirements Check\n";
echo "====================================\n\n";

// Becomes false as soon as any mandatory requirement is not satisfied.
$requirements_met = true;

// PHP runtime: 8.1.0 or newer is mandatory.
$phpVersionOk = version_compare(PHP_VERSION, '8.1.0', '>=');
echo "PHP Version: " . PHP_VERSION . ($phpVersionOk ? " ✓" : " ✗") . " (>= 8.1.0 required)\n";
if (!$phpVersionOk) {
    $requirements_met = false;
}

// Mandatory extensions, reported one per line in this order.
foreach (['json', 'mbstring', 'kreuzberg'] as $extension) {
    $isLoaded = extension_loaded($extension);
    echo "Extension '$extension': " . ($isLoaded ? "✓ Loaded\n" : "✗ Missing\n");
    $requirements_met = $requirements_met && $isLoaded;
}

// Memory limit is advisory: a low value prints a warning marker but does not
// fail the overall check.
$memory_limit = ini_get('memory_limit');
echo "\nMemory Limit: $memory_limit";
$recommendedBytes = 128 * 1024 * 1024;
echo return_bytes($memory_limit) >= $recommendedBytes
    ? " ✓ (>= 128M recommended)\n"
    : " ! (>= 128M recommended for large documents)\n";

// Final verdict; a failed check ends the script with a non-zero exit status.
echo "\n";
if (!$requirements_met) {
    echo "✗ Some requirements are not met. Please install missing components.\n";
    exit(1);
}

echo "✓ All requirements met! You're ready to use Kreuzberg.\n";
|
||||
|
||||
/**
 * Convert a php.ini shorthand size value to a number of bytes.
 *
 * Accepts a plain integer ("1024") or an integer followed by a
 * case-insensitive K, M or G suffix ("512k", "128M", "1G"). Surrounding
 * whitespace is ignored.
 *
 * Special cases:
 *  - "-1" is PHP's notation for "no memory limit" and is reported as
 *    PHP_INT_MAX so that ">= N bytes" comparisons treat it as sufficient.
 *  - An empty string yields 0 without emitting a string-offset warning.
 *
 * Deliberately avoids PHP 8-only syntax so the requirements script still
 * parses on the older runtimes it is meant to diagnose.
 *
 * @param string $val Raw ini value, e.g. the result of ini_get('memory_limit').
 *
 * @return int Size in bytes.
 */
function return_bytes(string $val): int
{
    $val = trim($val);
    if ($val === '') {
        return 0;
    }

    // The integer cast reads the leading digits and ignores any unit suffix.
    $number = (int) $val;

    // memory_limit = -1 means "unlimited", not "minus one byte".
    if ($number === -1) {
        return PHP_INT_MAX;
    }

    $multipliers = [
        'k' => 1024,
        'm' => 1024 * 1024,
        'g' => 1024 * 1024 * 1024,
    ];
    $suffix = strtolower(substr($val, -1));

    // Any other trailing character (typically a digit) means plain bytes.
    return $number * ($multipliers[$suffix] ?? 1);
}
|
||||
```
|
||||
54
docs/snippets/php/llm/structured_extraction.md
Normal file
54
docs/snippets/php/llm/structured_extraction.md
Normal file
@@ -0,0 +1,54 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```php title="PHP"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\LlmConfig;
|
||||
use Kreuzberg\StructuredExtractionConfig;
|
||||
|
||||
// JSON Schema for the structured output: a title, a list of author names and
// a date, all required, with no additional properties allowed.
$schemaDefinition = [
    'type' => 'object',
    'properties' => [
        'title' => ['type' => 'string'],
        'authors' => ['type' => 'array', 'items' => ['type' => 'string']],
        'date' => ['type' => 'string'],
    ],
    'required' => ['title', 'authors', 'date'],
    'additionalProperties' => false,
];

// LLM settings; every optional field is passed explicitly as null.
$llm = new LlmConfig(
    model: 'openai/gpt-4o-mini',
    apiKey: null,
    baseUrl: null,
    timeoutSecs: null,
    maxRetries: null,
    temperature: null,
    maxTokens: null,
);

// The structured-extraction config is built from a JSON document.
// Only the model name from $llm is forwarded here.
$structuredSettings = [
    'schema' => $schemaDefinition,
    'schema_name' => 'paper_metadata',
    'strict' => true,
    'llm' => [
        'model' => $llm->model,
    ],
];
$structured = StructuredExtractionConfig::from_json(
    json_encode($structuredSettings, JSON_THROW_ON_ERROR),
);

$config = new ExtractionConfig();
$config->structured_extraction = $structured;

$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('paper.pdf');

// Print the structured output only when the extraction produced one.
$structuredOutput = $result->structured_output;
if ($structuredOutput !== null) {
    echo $structuredOutput, "\n";
}
|
||||
```
|
||||
39
docs/snippets/php/mcp/mcp_custom_client.md
Normal file
39
docs/snippets/php/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
// Pipes for the child process: we write to its stdin (0) and read from its
// stdout (1) and stderr (2).
$descriptors = [
    0 => ['pipe', 'r'],
    1 => ['pipe', 'w'],
    2 => ['pipe', 'w'],
];

$process = proc_open(['kreuzberg', 'mcp'], $descriptors, $pipes);
if (!is_resource($process)) {
    throw new RuntimeException('failed to spawn kreuzberg mcp');
}

[$stdin, $stdout, $stderr] = $pipes;

// MCP's stdio transport exchanges newline-delimited JSON-RPC 2.0 messages.
$send = static function (array $message) use ($stdin): void {
    fwrite($stdin, json_encode($message, JSON_THROW_ON_ERROR) . "\n");
    fflush($stdin);
};

try {
    // 1. Lifecycle handshake: the server must be initialized before any
    //    tool call is accepted.
    $send([
        'jsonrpc' => '2.0',
        'id' => 1,
        'method' => 'initialize',
        'params' => [
            'protocolVersion' => '2024-11-05',
            // stdClass so the value is encoded as a JSON object ({}), not [].
            'capabilities' => new stdClass(),
            'clientInfo' => [
                'name' => 'kreuzberg-php-client',
                'version' => '4.2.7',
            ],
        ],
    ]);

    // NOTE(review): assumes the first line on stdout is the initialize
    // response (no unsolicited notifications first) — confirm against the server.
    if (fgets($stdout) === false) {
        throw new RuntimeException('kreuzberg mcp closed the connection during initialization');
    }

    // Notifications carry no "id" and receive no response.
    $send([
        'jsonrpc' => '2.0',
        'method' => 'notifications/initialized',
    ]);

    // 2. The actual tool call. Requests need "jsonrpc" and a unique "id" so
    //    the server can correlate its response.
    $send([
        'jsonrpc' => '2.0',
        'id' => 2,
        'method' => 'tools/call',
        'params' => [
            'name' => 'extract_file',
            'arguments' => [
                'path' => 'document.pdf',
                'async' => true,
            ],
        ],
    ]);

    // stdin stays open until the response has been read: closing it early
    // signals EOF and may make the server exit before answering.
    $response = fgets($stdout);
    if ($response !== false) {
        echo $response;
    }
} finally {
    // Always release the pipes and reap the child, even on failure.
    fclose($stdin);
    fclose($stdout);
    fclose($stderr);
    proc_close($process);
}
|
||||
```
|
||||
132
docs/snippets/php/mcp/mcp_http_client.md
Normal file
132
docs/snippets/php/mcp/mcp_http_client.md
Normal file
@@ -0,0 +1,132 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Exception\GuzzleException;
|
||||
|
||||
/**
 * Minimal HTTP client for a Kreuzberg MCP server.
 *
 * Requires the MCP server to be running with HTTP transport:
 *   kreuzberg mcp --transport http --port 3000
 *
 * Every call is a JSON POST; response bodies are decoded into PHP arrays.
 */
final readonly class KreuzbergMcpClient
{
    private Client $http;

    /**
     * @param string $baseUrl Base URI that all endpoint paths are resolved against.
     */
    public function __construct(
        private string $baseUrl = 'http://localhost:3000',
    ) {
        $jsonHeaders = [
            'Content-Type' => 'application/json',
            'Accept' => 'application/json',
        ];

        $this->http = new Client([
            'base_uri' => $this->baseUrl,
            'timeout' => 30.0,
            'headers' => $jsonHeaders,
        ]);
    }

    /**
     * Send the initialize payload (protocol version, capabilities, client info).
     *
     * @return array<string, mixed> Decoded response body.
     * @throws GuzzleException
     */
    public function initialize(): array
    {
        $payload = [
            'protocolVersion' => '2024-11-05',
            'capabilities' => [],
            'clientInfo' => [
                'name' => 'kreuzberg-php-client',
                'version' => '4.2.7',
            ],
        ];

        return $this->postAndDecode('/initialize', ['json' => $payload]);
    }

    /**
     * Fetch the tool list; yields an empty array when the response has no "tools" key.
     *
     * @return array<int, array{name: string, description: string, inputSchema: array<string, mixed>}>
     * @throws GuzzleException
     */
    public function listTools(): array
    {
        $decoded = $this->postAndDecode('/tools/list');

        return $decoded['tools'] ?? [];
    }

    /**
     * Invoke a named tool with the given arguments.
     *
     * @param array<string, mixed> $arguments
     * @return array<string, mixed> Decoded response body.
     * @throws GuzzleException
     */
    public function callTool(string $toolName, array $arguments): array
    {
        $payload = [
            'name' => $toolName,
            'arguments' => $arguments,
        ];

        return $this->postAndDecode('/tools/call', ['json' => $payload]);
    }

    /**
     * Call the "extract_file" tool for a single path.
     *
     * @param array<string, mixed>|null $config Forwarded as-is, including null.
     * @return array<string, mixed>
     * @throws GuzzleException
     */
    public function extractFile(string $path, ?array $config = null): array
    {
        $arguments = ['path' => $path, 'config' => $config];

        return $this->callTool('extract_file', $arguments);
    }

    /**
     * Call the "batch_extract_files" tool for several paths.
     *
     * @param array<int, string> $paths
     * @param array<string, mixed>|null $config Forwarded as-is, including null.
     * @return array<string, mixed>
     * @throws GuzzleException
     */
    public function batchExtractFiles(array $paths, ?array $config = null): array
    {
        $arguments = ['paths' => $paths, 'config' => $config];

        return $this->callTool('batch_extract_files', $arguments);
    }

    /**
     * POST to an endpoint and JSON-decode the response body.
     *
     * @param array<string, mixed> $options Guzzle request options.
     * @return mixed Decoded JSON, with objects as associative arrays.
     * @throws GuzzleException
     * @throws \JsonException When the body is not valid JSON.
     */
    private function postAndDecode(string $endpoint, array $options = []): mixed
    {
        $body = $this->http->post($endpoint, $options)->getBody()->getContents();

        return json_decode($body, true, 512, JSON_THROW_ON_ERROR);
    }
}
|
||||
|
||||
// Example usage of KreuzbergMcpClient against a local server.
$client = new KreuzbergMcpClient('http://localhost:3000');

// Handshake: print the server name from the initialize response.
$serverInfo = $client->initialize();
$serverName = $serverInfo['serverInfo']['name'];
echo "Connected to: {$serverName}\n";

// Discover the tools the server exposes.
$toolNames = array_column($client->listTools(), 'name');
$toolList = implode(', ', $toolNames);
echo "Available tools: {$toolList}\n";

// Single-file extraction.
// NOTE(review): assumes the response has a top-level 'content' string — confirm.
$result = $client->extractFile('document.pdf');
$contentLength = strlen($result['content']);
echo "Extracted content length: {$contentLength}\n";

// Batch extraction.
// NOTE(review): count() counts the top-level entries of the decoded response;
// verify that this equals the number of files.
$batchPaths = ['file1.pdf', 'file2.docx', 'file3.md'];
$results = $client->batchExtractFiles($batchPaths);
$extractedCount = count($results);
echo "Batch extracted {$extractedCount} files\n";
|
||||
```
|
||||
217
docs/snippets/php/mcp/mcp_laravel_integration.md
Normal file
217
docs/snippets/php/mcp/mcp_laravel_integration.md
Normal file
@@ -0,0 +1,217 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Services;
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Exception\GuzzleException;
|
||||
use Illuminate\Support\Facades\Cache;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
use Psr\Log\LoggerInterface;
|
||||
|
||||
/**
 * Laravel service wrapping HTTP calls to a Kreuzberg MCP server.
 *
 * Register in AppServiceProvider:
 *   $this->app->singleton(KreuzbergMcpService::class);
 *
 * Then inject it into controllers or jobs.
 */
final class KreuzbergMcpService
{
    private Client $http;

    /**
     * @param LoggerInterface $logger Receives info/error entries for every extraction.
     * @param string $mcpUrl Base URI of the MCP server.
     */
    public function __construct(
        private readonly LoggerInterface $logger,
        private readonly string $mcpUrl = 'http://localhost:3000',
    ) {
        $jsonHeaders = [
            'Content-Type' => 'application/json',
            'Accept' => 'application/json',
        ];

        $this->http = new Client([
            'base_uri' => $this->mcpUrl,
            'timeout' => config('kreuzberg.timeout', 30),
            'headers' => $jsonHeaders,
        ]);
    }

    /**
     * Extract content from a file via the MCP server.
     *
     * Results are cached for 24 hours under a key derived from the path and
     * the config only, so a changed file at the same path is served from
     * cache until the entry expires.
     *
     * @param array<string, mixed>|null $config
     * @return array{content: string, mimeType: string, metadata: array<string, mixed>}
     * @throws GuzzleException
     */
    public function extractFile(string $path, ?array $config = null): array
    {
        $cacheKey = 'kreuzberg_extract_' . md5($path . json_encode($config));

        // Runs only on a cache miss.
        $extract = function () use ($path, $config): array {
            $this->logger->info('Extracting file via MCP', [
                'path' => $path,
                'config' => $config,
            ]);

            try {
                $result = $this->callTool('extract_file', [
                    'path' => $path,
                    'config' => $config,
                ]);

                $this->logger->info('Extraction successful', [
                    'path' => $path,
                    'content_length' => strlen($result['content'] ?? ''),
                ]);

                return $result;
            } catch (GuzzleException $e) {
                // Log for observability, then let the caller handle the failure.
                $this->logger->error('MCP extraction failed', [
                    'path' => $path,
                    'error' => $e->getMessage(),
                ]);

                throw $e;
            }
        };

        return Cache::remember($cacheKey, now()->addHours(24), $extract);
    }

    /**
     * Extract multiple files in one batch call (not cached).
     *
     * @param array<int, string> $paths
     * @param array<string, mixed>|null $config
     * @return array<int, array{content: string, mimeType: string}>
     * @throws GuzzleException
     */
    public function batchExtractFiles(array $paths, ?array $config = null): array
    {
        $this->logger->info('Batch extracting files via MCP', [
            'count' => count($paths),
        ]);

        try {
            $results = $this->callTool('batch_extract_files', [
                'paths' => $paths,
                'config' => $config,
            ]);

            $this->logger->info('Batch extraction successful', [
                'count' => count($results),
            ]);

            return $results;
        } catch (GuzzleException $e) {
            $this->logger->error('MCP batch extraction failed', [
                'count' => count($paths),
                'error' => $e->getMessage(),
            ]);

            throw $e;
        }
    }

    /**
     * Check whether the MCP server answers GET /health with status 200.
     *
     * Uses a 2-second timeout; any Guzzle failure is reported as unhealthy.
     */
    public function healthCheck(): bool
    {
        try {
            $status = $this->http->get('/health', ['timeout' => 2])->getStatusCode();
        } catch (GuzzleException) {
            return false;
        }

        return $status === 200;
    }

    /**
     * POST a tool invocation to /tools/call and JSON-decode the response body.
     *
     * @param array<string, mixed> $arguments
     * @return mixed Decoded JSON, with objects as associative arrays.
     * @throws GuzzleException
     * @throws \JsonException When the body is not valid JSON.
     */
    private function callTool(string $name, array $arguments): mixed
    {
        $response = $this->http->post('/tools/call', [
            'json' => [
                'name' => $name,
                'arguments' => $arguments,
            ],
        ]);

        return json_decode(
            $response->getBody()->getContents(),
            true,
            512,
            JSON_THROW_ON_ERROR
        );
    }
}
|
||||
|
||||
// Usage in a Laravel controller
|
||||
namespace App\Http\Controllers;
|
||||
|
||||
use App\Services\KreuzbergMcpService;
|
||||
use Illuminate\Http\JsonResponse;
|
||||
use Illuminate\Http\Request;
|
||||
|
||||
/**
 * HTTP endpoints that expose KreuzbergMcpService extraction as JSON.
 *
 * NOTE(review): file paths come straight from the request and are forwarded
 * to the extractor after only a "string" validation. Unless the MCP server or
 * a middleware restricts which paths are readable, clients could request
 * arbitrary server-side files — confirm and constrain to an allowed directory.
 */
final class DocumentController extends Controller
{
    public function __construct(
        private readonly KreuzbergMcpService $kreuzberg,
    ) {}

    /**
     * Extract a single file; table extraction defaults to on, images to off.
     */
    public function extract(Request $request): JsonResponse
    {
        $input = $request->validate([
            'file_path' => 'required|string',
            'extract_tables' => 'boolean',
            'extract_images' => 'boolean',
        ]);

        $extractTables = $input['extract_tables'] ?? true;
        $extractImages = $input['extract_images'] ?? false;

        $result = $this->kreuzberg->extractFile($input['file_path'], [
            'extractTables' => $extractTables,
            'extractImages' => $extractImages,
        ]);

        $payload = [
            'success' => true,
            'data' => $result,
        ];

        return response()->json($payload);
    }

    /**
     * Extract several files in one call and report how many results came back.
     */
    public function batchExtract(Request $request): JsonResponse
    {
        $input = $request->validate([
            'file_paths' => 'required|array',
            'file_paths.*' => 'required|string',
        ]);

        $results = $this->kreuzberg->batchExtractFiles($input['file_paths']);

        $payload = [
            'success' => true,
            'data' => $results,
            'count' => count($results),
        ];

        return response()->json($payload);
    }

    /**
     * Health probe: 200 when the MCP server is reachable, 503 otherwise.
     */
    public function health(): JsonResponse
    {
        $healthy = $this->kreuzberg->healthCheck();
        $status = $healthy ? 200 : 503;

        return response()->json([
            'healthy' => $healthy,
            'service' => 'kreuzberg-mcp',
        ], $status);
    }
}
|
||||
```
|
||||
33
docs/snippets/php/mcp/mcp_server_start.md
Normal file
33
docs/snippets/php/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use Symfony\Component\Process\Process;
|
||||
|
||||
// Launch `kreuzberg mcp` asynchronously so this script keeps running.
$process = new Process(['kreuzberg', 'mcp']);
$process->start();

echo 'MCP server started with PID: ' . $process->getPid() . "\n";

// Give the server a moment to come up before checking on it.
sleep(1);

$serverIsUp = $process->isRunning();
if (!$serverIsUp) {
    echo "Server failed to start\n";
    echo 'Error: ' . $process->getErrorOutput() . "\n";
} else {
    echo "Server is running, listening for connections\n";
    echo 'Server output: ' . $process->getOutput() . "\n";
}

// Stop the child when this script ends, if it is still alive.
register_shutdown_function(function () use ($process): void {
    if (!$process->isRunning()) {
        return;
    }

    $process->stop();
    echo "MCP server stopped\n";
});
|
||||
```
|
||||
30
docs/snippets/php/metadata/language_detection.md
Normal file
30
docs/snippets/php/metadata/language_detection.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\LanguageDetectionConfig;
|
||||
|
||||
// Enable language detection: single language, minimum confidence 0.7.
$config = new ExtractionConfig();
$config->language_detection = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.7,
    detectMultiple: false,
);

$result = Kreuzberg::extract_file_sync("document.pdf", null, $config);

// Print each detected language; the confidence line appears only when a
// confidence value is present.
if (!empty($result->languages)) {
    foreach ($result->languages as $detected) {
        echo "Detected language: {$detected->code}\n";

        $confidence = $detected->confidence;
        if ($confidence !== null) {
            echo "Confidence: {$confidence}\n";
        }
    }
}
|
||||
?>
|
||||
```
|
||||
@@ -0,0 +1,37 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\LanguageDetectionConfig;
|
||||
|
||||
// Enable detection of multiple languages with a minimum confidence of 0.6.
$config = new ExtractionConfig();
$config->language_detection = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.6,
    detectMultiple: true,
);

$result = Kreuzberg::extract_file_sync("multilingual_document.pdf", null, $config);

// Report every detected language, or say so when there are none.
if (empty($result->languages)) {
    echo "No languages detected\n";
} else {
    $languageCount = count($result->languages);
    echo "Detected {$languageCount} language(s):\n";

    foreach ($result->languages as $detected) {
        echo "Language: {$detected->code}\n";

        // Confidence is multiplied by 100 and shown as a percentage.
        if ($detected->confidence !== null) {
            printf(" Confidence: %.1f%%\n", $detected->confidence * 100);
        }

        if ($detected->name !== null) {
            echo " Name: {$detected->name}\n";
        }
    }
}
|
||||
?>
|
||||
```
|
||||
63
docs/snippets/php/metadata/metadata.md
Normal file
63
docs/snippets/php/metadata/metadata.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// --- PDF metadata ---------------------------------------------------------
$result = Kreuzberg::extract_file_sync("document.pdf", null, new ExtractionConfig());

if ($result->metadata?->pdf) {
    $pdfMeta = $result->metadata->pdf;

    // Each field is printed only when it is set.
    $pdfFields = [
        'Pages' => $pdfMeta->page_count,
        'Author' => $pdfMeta->author,
        'Title' => $pdfMeta->title,
    ];
    foreach ($pdfFields as $label => $value) {
        if ($value !== null) {
            echo "{$label}: {$value}\n";
        }
    }
}

// --- HTML metadata --------------------------------------------------------
$htmlResult = Kreuzberg::extract_file_sync("page.html", null, new ExtractionConfig());

if ($htmlResult->metadata?->html) {
    $htmlMeta = $htmlResult->metadata->html;

    if ($htmlMeta->title !== null) {
        echo "Title: {$htmlMeta->title}\n";
    }
    if ($htmlMeta->description !== null) {
        echo "Description: {$htmlMeta->description}\n";
    }

    // Keywords are always printed; a missing list prints as an empty string.
    $keywordList = implode(", ", $htmlMeta->keywords ?? []);
    echo "Keywords: {$keywordList}\n";

    // Canonical URL.
    if ($htmlMeta->canonical_url !== null) {
        echo "Canonical: {$htmlMeta->canonical_url}\n";
    }

    // Open Graph: only the "image" entry is shown.
    if (!empty($htmlMeta->open_graph) && isset($htmlMeta->open_graph["image"])) {
        echo "OG Image: " . $htmlMeta->open_graph["image"] . "\n";
    }

    // Document language.
    if ($htmlMeta->language !== null) {
        echo "Language: {$htmlMeta->language}\n";
    }

    // Headers with their level.
    if (!empty($htmlMeta->headers)) {
        foreach ($htmlMeta->headers as $heading) {
            echo "Header (level {$heading->level}): {$heading->text}\n";
        }
    }
}
|
||||
?>
|
||||
```
|
||||
33
docs/snippets/php/metadata/metadata.php
Normal file
33
docs/snippets/php/metadata/metadata.php
Normal file
@@ -0,0 +1,33 @@
|
||||
```php title="metadata.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Document Metadata Access
|
||||
*
|
||||
* Extract and access metadata from different document types including
|
||||
* PDFs, HTML, and other formats.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
// NOTE(review): this snippet reads metadata with array access ($meta['key']),
// while the companion metadata.md snippet uses property access
// ($meta->key) — confirm which form the current bindings expose.

// PDF metadata: print each field, or "N/A" when it is absent.
$result = extract_file('document.pdf');

if (isset($result->metadata->pdf)) {
    $pdfMeta = $result->metadata->pdf;

    $pdfLabels = [
        'Pages' => 'page_count',
        'Author' => 'author',
        'Title' => 'title',
    ];
    foreach ($pdfLabels as $label => $key) {
        echo "{$label}: " . ($pdfMeta[$key] ?? 'N/A') . "\n";
    }
}

// HTML metadata: same pattern for title and description.
$htmlResult = extract_file('page.html');

if (isset($htmlResult->metadata->html)) {
    $htmlMeta = $htmlResult->metadata->html;

    $htmlLabels = [
        'Title' => 'title',
        'Description' => 'description',
    ];
    foreach ($htmlLabels as $label => $key) {
        echo "{$label}: " . ($htmlMeta[$key] ?? 'N/A') . "\n";
    }
}
|
||||
```
|
||||
29
docs/snippets/php/metadata/page_boundaries.md
Normal file
29
docs/snippets/php/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\PageConfig;
|
||||
|
||||
// Turn on per-page extraction and inline page markers with a custom format.
$pageConfig = new PageConfig(
    extractPages: true,
    insertPageMarkers: true,
    markerFormat: "\n\n=== PAGE {page_num} ===\n\n",
);

$config = new ExtractionConfig();
$config->pages = $pageConfig;

$result = Kreuzberg::extract_file_sync("document.pdf", null, $config);

// Whole-document content, with the page markers inline.
echo "Full content with markers:\n";
echo "{$result->content}\n\n";

// Alternatively, walk the pages one by one when they are available.
if ($result->pages !== null) {
    foreach ($result->pages as $pageEntry) {
        echo "--- Page {$pageEntry->page_number} (boundary) ---\n";
        echo "{$pageEntry->content}\n";
    }
}
|
||||
?>
|
||||
```
|
||||
37
docs/snippets/php/metadata/page_boundaries.php
Normal file
37
docs/snippets/php/metadata/page_boundaries.php
Normal file
@@ -0,0 +1,37 @@
|
||||
```php title="page_boundaries.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Page Boundary Tracking
|
||||
*
|
||||
* Access page boundary information to extract content from specific pages
|
||||
* using byte offsets in the extracted content.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
$result = extract_file('document.pdf');

// Show the first three pages by slicing the extracted content with each
// page's byte range. !empty() also covers the "not set" case.
if (!empty($result->metadata->pages->boundaries)) {
    $boundaries = $result->metadata->pages->boundaries;
    $contentBytes = $result->content;

    $pagesToShow = array_slice($boundaries, 0, 3);

    foreach ($pagesToShow as $boundary) {
        // byteStart/byteEnd are BYTE offsets, so the slice must use the
        // byte-wise substr(). mb_substr() counts characters and would return
        // the wrong range as soon as the text contains multibyte characters.
        $pageContent = substr(
            $contentBytes,
            $boundary->byteStart,
            $boundary->byteEnd - $boundary->byteStart
        );

        echo "Page {$boundary->pageNumber}:\n";
        echo "  Byte range: {$boundary->byteStart}-{$boundary->byteEnd}\n";
        // The preview is user-visible text, so it is truncated by characters.
        echo "  Preview: " . mb_substr($pageContent, 0, 100) . "...\n\n";
    }
}
|
||||
```
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user