This commit is contained in:
19
docs/snippets/php/api/batch_extract_bytes_sync.md
Normal file
19
docs/snippets/php/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\BatchBytesItem;
|
||||
|
||||
// Default-constructed configuration shared by every item in the batch.
$extractionConfig = new ExtractionConfig();

// In-memory payloads, each given alongside its MIME type string.
$payloads = [
    new BatchBytesItem('Hello, world!', 'text/plain'),
    new BatchBytesItem("# Heading\n\nParagraph text.", 'text/markdown'),
];

// Extract the whole batch in a single call.
$extractions = Kreuzberg::batchExtractBytesSync($payloads, $extractionConfig);

// Print the strlen() of each item's extracted content, keyed by its position.
foreach ($extractions as $index => $extraction) {
    $length = strlen($extraction->getContent());
    echo "Item {$index}: {$length} chars\n";
}
|
||||
```
|
||||
20
docs/snippets/php/api/batch_extract_files_sync.md
Normal file
20
docs/snippets/php/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\BatchFileItem;
|
||||
|
||||
// Default-constructed configuration shared by every file in the batch.
$extractionConfig = new ExtractionConfig();

// Files to extract, given by path.
$files = [
    new BatchFileItem('doc1.pdf'),
    new BatchFileItem('doc2.docx'),
    new BatchFileItem('report.pdf'),
];

// Extract the whole batch in a single call.
$extractions = Kreuzberg::batchExtractFilesSync($files, $extractionConfig);

// Print the strlen() of each file's extracted content, keyed by its position.
foreach ($extractions as $index => $extraction) {
    $length = strlen($extraction->getContent());
    echo "File {$index}: {$length} chars\n";
}
|
||||
```
|
||||
37
docs/snippets/php/api/client_chunk_text.md
Normal file
37
docs/snippets/php/api/client_chunk_text.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
|
||||
$client = new Client();
$filePath = 'document.pdf';

// file_get_contents() returns false on failure; stop here rather than passing
// false on as the upload contents.
$fileContent = file_get_contents($filePath);
if ($fileContent === false) {
    throw new RuntimeException("Unable to read {$filePath}");
}

try {
    // Upload the document together with a JSON-encoded chunking configuration.
    $response = $client->post('http://localhost:8000/extract', [
        'multipart' => [
            [
                'name' => 'file',
                'contents' => $fileContent,
                'filename' => basename($filePath),
                'headers' => ['Content-Type' => 'application/pdf'],
            ],
            [
                'name' => 'chunking',
                'contents' => json_encode(
                    ['max_characters' => 800, 'overlap' => 100],
                    JSON_THROW_ON_ERROR
                ),
            ],
        ],
    ]);

    // JSON_THROW_ON_ERROR turns a malformed response body into a JsonException,
    // which the catch below reports, instead of silently decoding to null.
    $result = json_decode((string) $response->getBody(), true, 512, JSON_THROW_ON_ERROR);

    // Only report chunks when the response actually carries a chunk list.
    if (isset($result['chunks']) && is_array($result['chunks'])) {
        echo count($result['chunks']) . " chunks\n";
        foreach ($result['chunks'] as $chunk) {
            echo "  " . strlen($chunk['content'] ?? '') . " chars\n";
        }
    }
} catch (Exception $e) {
    // Covers HTTP client failures as well as JSON encode/decode errors.
    echo "Request failed: " . $e->getMessage() . "\n";
}
|
||||
```
|
||||
28
docs/snippets/php/api/client_extract_single_file.md
Normal file
28
docs/snippets/php/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
|
||||
$client = new Client();
$filePath = 'document.pdf';

// file_get_contents() returns false on failure; stop here rather than passing
// false on as the upload contents.
$fileContent = file_get_contents($filePath);
if ($fileContent === false) {
    throw new RuntimeException("Unable to read {$filePath}");
}

try {
    // Upload the document as a single multipart file part.
    $response = $client->post('http://localhost:8000/extract', [
        'multipart' => [
            [
                'name' => 'file',
                'contents' => $fileContent,
                'filename' => basename($filePath),
                'headers' => ['Content-Type' => 'application/pdf'],
            ],
        ],
    ]);

    // JSON_THROW_ON_ERROR turns a malformed response body into a JsonException,
    // which the catch below reports, instead of silently decoding to null.
    $result = json_decode((string) $response->getBody(), true, 512, JSON_THROW_ON_ERROR);

    // Print the extracted text, or nothing when the response has no 'content' key.
    echo $result['content'] ?? '';
} catch (Exception $e) {
    // Covers HTTP client failures as well as JSON decode errors.
    echo "Request failed: " . $e->getMessage() . "\n";
}
|
||||
```
|
||||
80
docs/snippets/php/api/combining_all_features.md
Normal file
80
docs/snippets/php/api/combining_all_features.md
Normal file
@@ -0,0 +1,80 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\OcrConfig;
|
||||
use Kreuzberg\ChunkingConfig;
|
||||
use Kreuzberg\ChunkSizing;
|
||||
use Kreuzberg\ImageExtractionConfig;
|
||||
use Kreuzberg\OutputFormat;
|
||||
|
||||
// Base extraction settings; OCR, chunking, and image options are attached below.
$config = new ExtractionConfig(
    null,                   // caching
    false,                  // force_ocr
    null,                   // max_concurrent_extractions
    null,                   // cache_dir
    OutputFormat::Markdown, // output_format
    true,                   // include_document_structure
    true,                   // enable_quality_processing
    true,                   // use_cache
    null,                   // use_diffs
    null,                   // keep_empty_chunks
);

// OCR: Tesseract backend, English language.
$config->setOcr(new OcrConfig(
    'tesseract', // backend
    'eng',       // language
    null,        // page_count_hint
    null,        // psm_mode
    null,        // use_gpu
    null,        // languages
    null,        // fast_mode
    null,        // fast_weight
    null,        // min_confidence
));

// Chunking: Markdown chunker, up to 800 characters per chunk with 100 overlap.
$config->setChunking(new ChunkingConfig(
    800,        // max_characters
    100,        // overlap
    true,       // trim
    'Markdown', // chunker_type
    null,       // preset
    true,       // prepend_heading_context
    null,       // topic_threshold
));

// Image extraction: enabled, all other options left unset.
$config->setImages(new ImageExtractionConfig(
    true, // extract_images
    null, // image_min_width
    null, // image_min_height
    null, // image_output_format
    null, // image_compression_level
));

$extraction = Kreuzberg::extractFileSync('report.pdf', null, $config);

// Content length followed by a 200-byte preview.
echo 'Content (', strlen($extraction->getContent()), " chars):\n";
echo substr($extraction->getContent(), 0, 200), "\n\n";

// Chunks, detected languages, and extraction method may be null, so each is
// checked before it is printed.
if ($extraction->getChunks() !== null) {
    echo 'Chunks: ', count($extraction->getChunks()), "\n";
}
echo 'Tables: ', count($extraction->getTables()), "\n";

if ($extraction->getDetectedLanguages() !== null) {
    echo 'Languages: ', implode(', ', $extraction->getDetectedLanguages()), "\n";
}

if ($extraction->getExtractionMethod() !== null) {
    echo 'Extraction method: ', $extraction->getExtractionMethod(), "\n";
}
|
||||
```
|
||||
19
docs/snippets/php/api/error_handling.md
Normal file
19
docs/snippets/php/api/error_handling.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\KreuzbergException;
|
||||
|
||||
$extractionConfig = new ExtractionConfig();

try {
    $extraction = Kreuzberg::extractFileSync('document.pdf', null, $extractionConfig);
    echo $extraction->getContent();
} catch (KreuzbergException $failure) {
    // Extraction failures arrive as KreuzbergException; the error context is
    // carried in the exception message.
    echo 'Extraction failed: ', $failure->getMessage(), "\n";
    echo 'Error code: ', $failure->getCode(), "\n";
}
|
||||
```
|
||||
31
docs/snippets/php/api/error_handling_extract.md
Normal file
31
docs/snippets/php/api/error_handling_extract.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
use Kreuzberg\KreuzbergException;
|
||||
|
||||
/**
 * Extract the content of an in-memory document with a default-constructed config.
 *
 * @param string $bytes     Raw document bytes.
 * @param string $mime_type MIME type handed to the extractor along with the bytes.
 *
 * @return string The content reported by the extraction result.
 */
function extract_text(string $bytes, string $mime_type): string {
    $extractionConfig = new ExtractionConfig();

    return Kreuzberg::extractBytesSync($bytes, $mime_type, $extractionConfig)->getContent();
}
|
||||
|
||||
// Read the document up front. file_get_contents() returns false on failure, and a
// `?: ''` fallback would also discard a readable file whose content is falsy
// (e.g. "0"), so test for false explicitly instead of extracting an empty string.
$bytes = file_get_contents('document.pdf');
if ($bytes === false) {
    // Not a Kreuzberg error: the file never reached the extractor.
    echo "Error: unable to read document.pdf\n";
    exit(1);
}

try {
    $text = extract_text($bytes, 'application/pdf');
    echo "Extracted " . strlen($text) . " chars\n";
} catch (KreuzbergException $e) {
    // All Kreuzberg errors are KreuzbergException; the message text is the only
    // discriminator, so branch on substrings of it.
    $message = $e->getMessage();
    if (str_contains($message, 'not supported')) {
        echo "Format not supported\n";
    } elseif (str_contains($message, 'OCR')) {
        echo "OCR failed: " . $message . "\n";
    } else {
        echo "Error: " . $message . "\n";
    }
}
|
||||
```
|
||||
21
docs/snippets/php/api/extract_bytes_async.md
Normal file
21
docs/snippets/php/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,21 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// PHP does not have native async/await. The ext-php-rs binding blocks internally
|
||||
// using tokio::task::block_on. For concurrent operations, use batchExtractBytesSync
|
||||
// or batchExtractBytesAsync with multiple items instead.
|
||||
|
||||
// file_get_contents() returns false on failure, which is not valid document
// bytes; fail with a clear message before calling the extractor.
$content = file_get_contents('document.pdf');
if ($content === false) {
    throw new RuntimeException('Unable to read document.pdf');
}

$config = new ExtractionConfig();
// Note: This is labeled "async" in the API but blocks in PHP like the sync version
$result = Kreuzberg::extractBytesAsync($content, 'application/pdf', $config);

// Print the extracted content, then the number of tables found.
echo $result->getContent();
echo 'Tables: ' . count($result->getTables()) . "\n";
|
||||
```
|
||||
14
docs/snippets/php/api/extract_bytes_sync.md
Normal file
14
docs/snippets/php/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// file_get_contents() returns false on failure, which is not valid document
// bytes; fail with a clear message before calling the extractor.
$content = file_get_contents('document.pdf');
if ($content === false) {
    throw new RuntimeException('Unable to read document.pdf');
}

$config = new ExtractionConfig();
$result = Kreuzberg::extractBytesSync($content, 'application/pdf', $config);

// Print the extracted content, then the number of tables found.
echo $result->getContent();
echo 'Tables: ' . count($result->getTables()) . "\n";
|
||||
```
|
||||
20
docs/snippets/php/api/extract_file_async.md
Normal file
20
docs/snippets/php/api/extract_file_async.md
Normal file
@@ -0,0 +1,20 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
// PHP does not have native async/await. The ext-php-rs binding blocks internally
|
||||
// using tokio::task::block_on. This behaves like the sync version in PHP.
|
||||
|
||||
$extractionConfig = new ExtractionConfig();

// Note: the call is named "async" in the API, but in PHP it blocks like the sync version.
$extraction = Kreuzberg::extractFileAsync('document.pdf', null, $extractionConfig);

// Print the extracted content, then the reported MIME type and table count.
echo $extraction->getContent();
echo 'MIME type: ', $extraction->getMimeType(), "\n";
echo 'Tables: ', count($extraction->getTables()), "\n";
|
||||
```
|
||||
14
docs/snippets/php/api/extract_file_sync.md
Normal file
14
docs/snippets/php/api/extract_file_sync.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```php title="PHP"
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\ExtractionConfig;
|
||||
|
||||
$extractionConfig = new ExtractionConfig();

// Extract straight from a path; null is passed as the second argument.
$extraction = Kreuzberg::extractFileSync('document.pdf', null, $extractionConfig);

// Print the extracted content, then the reported MIME type and table count.
echo $extraction->getContent();
echo 'MIME type: ', $extraction->getMimeType(), "\n";
echo 'Tables: ', count($extraction->getTables()), "\n";
|
||||
```
|
||||
Reference in New Issue
Block a user