Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,19 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\BatchBytesItem;
// Extract several in-memory payloads in a single batch call.
$extractionConfig = new ExtractionConfig();

// Each batch item pairs the raw bytes with the MIME type describing them.
$batch = [
    new BatchBytesItem('Hello, world!', 'text/plain'),
    new BatchBytesItem("# Heading\n\nParagraph text.", 'text/markdown'),
];

$extracted = Kreuzberg::batchExtractBytesSync($batch, $extractionConfig);

// Report the length of every extracted content string (strlen() counts bytes).
foreach ($extracted as $index => $entry) {
    $length = strlen($entry->getContent());
    echo "Item $index: " . $length . " chars\n";
}
```

View File

@@ -0,0 +1,20 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\BatchFileItem;
// Extract several files from disk in a single batch call.
$extractionConfig = new ExtractionConfig();

// One batch item per file path.
$batch = [
    new BatchFileItem('doc1.pdf'),
    new BatchFileItem('doc2.docx'),
    new BatchFileItem('report.pdf'),
];

$extracted = Kreuzberg::batchExtractFilesSync($batch, $extractionConfig);

// Report the length of every extracted content string (strlen() counts bytes).
foreach ($extracted as $index => $entry) {
    $length = strlen($entry->getContent());
    echo "File $index: " . $length . " chars\n";
}
```

View File

@@ -0,0 +1,37 @@
```php title="PHP"
<?php
declare(strict_types=1);
use GuzzleHttp\Client;
// Upload a PDF to the local extraction endpoint with a chunking configuration
// and print the size of every returned chunk.
$client = new Client();
$filePath = 'document.pdf';

// file_get_contents() returns false when the file cannot be read; stop here
// instead of handing a non-string body to the multipart request.
$fileContent = file_get_contents($filePath);
if ($fileContent === false) {
    throw new RuntimeException("Unable to read {$filePath}");
}

try {
    $response = $client->post('http://localhost:8000/extract', [
        'multipart' => [
            // The document itself.
            [
                'name' => 'file',
                'contents' => $fileContent,
                'filename' => basename($filePath),
                'headers' => ['Content-Type' => 'application/pdf'],
            ],
            // Chunking options, sent as a JSON-encoded form field.
            [
                'name' => 'chunking',
                'contents' => json_encode(['max_characters' => 800, 'overlap' => 100]),
            ],
        ],
    ]);

    // Decode the response body into an associative array.
    $result = json_decode((string)$response->getBody(), true);

    // Only report chunks when the response actually carries a chunk list.
    if (isset($result['chunks']) && is_array($result['chunks'])) {
        echo count($result['chunks']) . " chunks\n";
        foreach ($result['chunks'] as $chunk) {
            echo "  " . strlen($chunk['content'] ?? '') . " chars\n";
        }
    }
} catch (Exception $e) {
    echo "Request failed: " . $e->getMessage() . "\n";
}
```

View File

@@ -0,0 +1,28 @@
```php title="PHP"
<?php
declare(strict_types=1);
use GuzzleHttp\Client;
// Upload a PDF to the local extraction endpoint and print the extracted content.
$client = new Client();
$filePath = 'document.pdf';

// file_get_contents() returns false when the file cannot be read; stop here
// instead of handing a non-string body to the multipart request.
$fileContent = file_get_contents($filePath);
if ($fileContent === false) {
    throw new RuntimeException("Unable to read {$filePath}");
}

try {
    $response = $client->post('http://localhost:8000/extract', [
        'multipart' => [
            [
                'name' => 'file',
                'contents' => $fileContent,
                'filename' => basename($filePath),
                'headers' => ['Content-Type' => 'application/pdf'],
            ],
        ],
    ]);

    // Decode the response body into an associative array; print nothing when
    // the "content" key is absent.
    $result = json_decode((string)$response->getBody(), true);
    echo $result['content'] ?? '';
} catch (Exception $e) {
    echo "Request failed: " . $e->getMessage() . "\n";
}
```

View File

@@ -0,0 +1,80 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\OcrConfig;
use Kreuzberg\ChunkingConfig;
use Kreuzberg\ChunkSizing;
use Kreuzberg\ImageExtractionConfig;
use Kreuzberg\OutputFormat;
// Configure extraction with OCR, chunking, and image extraction, then run it
// against a single file. All constructor arguments are positional, so each one
// carries a trailing label naming the parameter it fills.
$extraction = new ExtractionConfig(
    null,                   // caching
    false,                  // force_ocr
    null,                   // max_concurrent_extractions
    null,                   // cache_dir
    OutputFormat::Markdown, // output_format
    true,                   // include_document_structure
    true,                   // enable_quality_processing
    true,                   // use_cache
    null,                   // use_diffs
    null,                   // keep_empty_chunks
);

// OCR: Tesseract backend, English language.
$extraction->setOcr(new OcrConfig(
    'tesseract', // backend
    'eng',       // language
    null,        // page_count_hint
    null,        // psm_mode
    null,        // use_gpu
    null,        // languages
    null,        // fast_mode
    null,        // fast_weight
    null,        // min_confidence
));

// Chunking: Markdown chunker, at most 800 characters per chunk, 100 overlap.
$extraction->setChunking(new ChunkingConfig(
    800,        // max_characters
    100,        // overlap
    true,       // trim
    'Markdown', // chunker_type
    null,       // preset
    true,       // prepend_heading_context
    null,       // topic_threshold
));

// Image extraction: enabled, all other options left unset.
$extraction->setImages(new ImageExtractionConfig(
    true, // extract_images
    null, // image_min_width
    null, // image_min_height
    null, // image_output_format
    null, // image_compression_level
));

$result = Kreuzberg::extractFileSync('report.pdf', null, $extraction);

// Content length (strlen() counts bytes) followed by a 200-byte preview.
$contentLength = strlen($result->getContent());
echo "Content (" . $contentLength . " chars):\n";
$preview = substr($result->getContent(), 0, 200);
echo $preview . "\n\n";

// Chunks, detected languages, and extraction method may be null, so each is
// checked before it is printed.
if ($result->getChunks() !== null) {
    $chunkCount = count($result->getChunks());
    echo "Chunks: " . $chunkCount . "\n";
}

$tableCount = count($result->getTables());
echo "Tables: " . $tableCount . "\n";

if ($result->getDetectedLanguages() !== null) {
    $languageList = implode(', ', $result->getDetectedLanguages());
    echo "Languages: " . $languageList . "\n";
}

if ($result->getExtractionMethod() !== null) {
    $method = $result->getExtractionMethod();
    echo "Extraction method: " . $method . "\n";
}
```

View File

@@ -0,0 +1,19 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\KreuzbergException;
// Extract a file and report any KreuzbergException raised along the way.
$extractionConfig = new ExtractionConfig();

try {
    $document = Kreuzberg::extractFileSync('document.pdf', null, $extractionConfig);
    echo $document->getContent();
} catch (KreuzbergException $error) {
    // The exception message carries the error context; the numeric code is
    // printed alongside it.
    $reason = $error->getMessage();
    echo "Extraction failed: " . $reason . "\n";

    $code = $error->getCode();
    echo "Error code: " . $code . "\n";
}
```

View File

@@ -0,0 +1,31 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\KreuzbergException;
/**
 * Extract the content of an in-memory document using a default configuration.
 *
 * Nothing is caught here, so any error raised by the extractor propagates to
 * the caller.
 *
 * @param string $bytes     Raw document bytes.
 * @param string $mime_type MIME type passed to the extractor with the bytes.
 *
 * @return string Content taken from the extraction result.
 */
function extract_text(string $bytes, string $mime_type): string
{
    return Kreuzberg::extractBytesSync($bytes, $mime_type, new ExtractionConfig())->getContent();
}
// Read the document; on a failed read fall back to an empty payload so the
// extractor reports the problem. The check is against false specifically:
// a truthiness test would also discard a file whose content is exactly "0".
$bytes = file_get_contents('document.pdf');
if ($bytes === false) {
    $bytes = '';
}

try {
    $text = extract_text($bytes, 'application/pdf');
    echo "Extracted " . strlen($text) . " chars\n";
} catch (KreuzbergException $e) {
    // All Kreuzberg errors arrive as KreuzbergException, so the message text
    // is inspected to tell the error types apart.
    $message = $e->getMessage();
    if (str_contains($message, 'not supported')) {
        echo "Format not supported\n";
    } elseif (str_contains($message, 'OCR')) {
        echo "OCR failed: " . $message . "\n";
    } else {
        echo "Error: " . $message . "\n";
    }
}
```

View File

@@ -0,0 +1,21 @@
<!-- snippet:syntax-only -->
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
// PHP does not have native async/await. The ext-php-rs binding blocks
// internally until the extraction has finished, so this call behaves like the
// sync version. For concurrent operations, use batchExtractBytesSync or
// batchExtractBytesAsync with multiple items instead.

// file_get_contents() returns false when the file cannot be read; fail with a
// clear message instead of passing a non-string value as the document bytes.
$content = file_get_contents('document.pdf');
if ($content === false) {
    throw new RuntimeException('Unable to read document.pdf');
}

$config = new ExtractionConfig();

// Note: This is labeled "async" in the API but blocks in PHP like the sync version
$result = Kreuzberg::extractBytesAsync($content, 'application/pdf', $config);

echo $result->getContent();
echo 'Tables: ' . count($result->getTables()) . "\n";
```

View File

@@ -0,0 +1,14 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
// Extract a document that is already in memory and print its content and
// table count.

// file_get_contents() returns false when the file cannot be read; fail with a
// clear message instead of passing a non-string value as the document bytes.
$content = file_get_contents('document.pdf');
if ($content === false) {
    throw new RuntimeException('Unable to read document.pdf');
}

$config = new ExtractionConfig();
$result = Kreuzberg::extractBytesSync($content, 'application/pdf', $config);

echo $result->getContent();
echo 'Tables: ' . count($result->getTables()) . "\n";
```

View File

@@ -0,0 +1,20 @@
<!-- snippet:syntax-only -->
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
// PHP has no native async/await. The ext-php-rs binding blocks internally, so
// the "async" entry point below behaves like the sync version in PHP.
$extractionConfig = new ExtractionConfig();

// Despite the "async" label, this call returns only once extraction is done.
$document = Kreuzberg::extractFileAsync('document.pdf', null, $extractionConfig);

// Print the content, then the MIME type and table count.
echo $document->getContent();

$mimeType = $document->getMimeType();
echo 'MIME type: ' . $mimeType . "\n";

$tableCount = count($document->getTables());
echo 'Tables: ' . $tableCount . "\n";
```

View File

@@ -0,0 +1,14 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
// Extract a file from disk with a default configuration.
$extractionConfig = new ExtractionConfig();
$document = Kreuzberg::extractFileSync('document.pdf', null, $extractionConfig);

// Print the content, then the MIME type and table count.
echo $document->getContent();

$mimeType = $document->getMimeType();
echo 'MIME type: ' . $mimeType . "\n";

$tableCount = count($document->getTables());
echo 'Tables: ' . $tableCount . "\n";
```