Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/php/api/batch_extract_bytes_sync.md
+++ b/docs/snippets/php/api/batch_extract_bytes_sync.md
@@ -0,0 +1,22 @@
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+use Kreuzberg\BatchBytesItem;
+
+// Each item is built from an in-memory payload and a MIME type string.
+$payloads = [
+    new BatchBytesItem('Hello, world!', 'text/plain'),
+    new BatchBytesItem("# Heading\n\nParagraph text.", 'text/markdown'),
+];
+
+// A single call processes the whole batch with a default configuration.
+$extractions = Kreuzberg::batchExtractBytesSync($payloads, new ExtractionConfig());
+
+// strlen() reports the length of each result's content in bytes.
+foreach ($extractions as $index => $extraction) {
+    printf("Item %s: %d chars\n", $index, strlen($extraction->getContent()));
+}
+```
--- a/docs/snippets/php/api/batch_extract_files_sync.md
+++ b/docs/snippets/php/api/batch_extract_files_sync.md
@@ -0,0 +1,22 @@
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+use Kreuzberg\BatchFileItem;
+
+// Wrap every path in a BatchFileItem so the whole list goes out in one call.
+$paths = ['doc1.pdf', 'doc2.docx', 'report.pdf'];
+$batch = array_map(
+    static fn (string $path): BatchFileItem => new BatchFileItem($path),
+    $paths,
+);
+
+$extractions = Kreuzberg::batchExtractFilesSync($batch, new ExtractionConfig());
+
+// Report the byte length of the content extracted for each entry.
+foreach ($extractions as $index => $extraction) {
+    printf("File %s: %d chars\n", $index, strlen($extraction->getContent()));
+}
+```
--- a/docs/snippets/php/api/client_chunk_text.md
+++ b/docs/snippets/php/api/client_chunk_text.md
@@ -0,0 +1,48 @@
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use GuzzleHttp\Client;
+
+// Uploads a PDF to a local /extract endpoint together with chunking options,
+// then prints how many chunks came back and the byte length of each one.
+$client = new Client();
+$filePath = 'document.pdf';
+
+try {
+    // file_get_contents() returns false on failure; stop here instead of
+    // sending a request without the document.
+    $fileContent = file_get_contents($filePath);
+    if ($fileContent === false) {
+        throw new RuntimeException("Unable to read {$filePath}");
+    }
+
+    $response = $client->post('http://localhost:8000/extract', [
+        'multipart' => [
+            [
+                'name' => 'file',
+                'contents' => $fileContent,
+                'filename' => basename($filePath),
+                'headers' => ['Content-Type' => 'application/pdf'],
+            ],
+            [
+                // Chunking options travel as a JSON-encoded form field.
+                'name' => 'chunking',
+                'contents' => json_encode(['max_characters' => 800, 'overlap' => 100]),
+            ],
+        ],
+    ]);
+
+    // Decode to an associative array; report chunks only when the response has a chunk list.
+    $result = json_decode((string)$response->getBody(), true);
+    if (isset($result['chunks']) && is_array($result['chunks'])) {
+        echo count($result['chunks']) . " chunks\n";
+        foreach ($result['chunks'] as $chunk) {
+            echo "  " . strlen($chunk['content'] ?? '') . " chars\n";
+        }
+    }
+} catch (Exception $e) {
+    // Covers the read failure above as well as any exception thrown by the request.
+    echo "Request failed: " . $e->getMessage() . "\n";
+}
+```
--- a/docs/snippets/php/api/client_extract_single_file.md
+++ b/docs/snippets/php/api/client_extract_single_file.md
@@ -0,0 +1,38 @@
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use GuzzleHttp\Client;
+
+// Uploads a single PDF to a local /extract endpoint and prints the "content" field of the reply.
+$client = new Client();
+$filePath = 'document.pdf';
+
+try {
+    // file_get_contents() returns false on failure; stop here instead of
+    // sending a request without the document.
+    $fileContent = file_get_contents($filePath);
+    if ($fileContent === false) {
+        throw new RuntimeException("Unable to read {$filePath}");
+    }
+
+    $response = $client->post('http://localhost:8000/extract', [
+        'multipart' => [
+            [
+                'name' => 'file',
+                'contents' => $fileContent,
+                'filename' => basename($filePath),
+                'headers' => ['Content-Type' => 'application/pdf'],
+            ],
+        ],
+    ]);
+
+    // Decode to an associative array and fall back to an empty string when
+    // the response has no "content" key.
+    $result = json_decode((string)$response->getBody(), true);
+    echo $result['content'] ?? '';
+} catch (Exception $e) {
+    // Covers the read failure above as well as any exception thrown by the request.
+    echo "Request failed: " . $e->getMessage() . "\n";
+}
+```
--- a/docs/snippets/php/api/combining_all_features.md
+++ b/docs/snippets/php/api/combining_all_features.md
@@ -0,0 +1,83 @@
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+use Kreuzberg\OcrConfig;
+use Kreuzberg\ChunkingConfig;
+use Kreuzberg\ChunkSizing;
+use Kreuzberg\ImageExtractionConfig;
+use Kreuzberg\OutputFormat;
+
+// Base extraction settings; OCR, chunking and image options are attached below.
+$config = new ExtractionConfig(
+    null,                                    // caching
+    false,                                   // force_ocr
+    null,                                    // max_concurrent_extractions
+    null,                                    // cache_dir
+    OutputFormat::Markdown,                  // output_format
+    true,                                    // include_document_structure
+    true,                                    // enable_quality_processing
+    true,                                    // use_cache
+    null,                                    // use_diffs
+    null,                                    // keep_empty_chunks
+);
+
+// OCR: Tesseract backend, English language
+$config->setOcr(new OcrConfig(
+    'tesseract',                             // backend
+    'eng',                                   // language
+    null,                                    // page_count_hint
+    null,                                    // psm_mode
+    null,                                    // use_gpu
+    null,                                    // languages
+    null,                                    // fast_mode
+    null,                                    // fast_weight
+    null,                                    // min_confidence
+));
+
+// Chunking: Markdown chunker, up to 800 characters per chunk with a 100-character overlap
+$config->setChunking(new ChunkingConfig(
+    800,                                     // max_characters
+    100,                                     // overlap
+    true,                                    // trim
+    'Markdown',                              // chunker_type
+    null,                                    // preset
+    true,                                    // prepend_heading_context
+    null,                                    // topic_threshold
+));
+
+// Images: extraction switched on, every other option left at null
+$config->setImages(new ImageExtractionConfig(
+    true,                                    // extract_images
+    null,                                    // image_min_width
+    null,                                    // image_min_height
+    null,                                    // image_output_format
+    null,                                    // image_compression_level
+));
+
+$result = Kreuzberg::extractFileSync('report.pdf', null, $config);
+
+// Print the content length followed by a preview of its first 200 bytes.
+$content = $result->getContent();
+printf("Content (%d chars):\n%s\n\n", strlen($content), substr($content, 0, 200));
+
+// Chunks, detected languages and the extraction method are each reported
+// only when the result provides them (non-null).
+$chunks = $result->getChunks();
+if ($chunks !== null) {
+    printf("Chunks: %d\n", count($chunks));
+}
+printf("Tables: %d\n", count($result->getTables()));
+
+$languages = $result->getDetectedLanguages();
+if ($languages !== null) {
+    echo "Languages: " . implode(', ', $languages) . "\n";
+}
+
+$method = $result->getExtractionMethod();
+if ($method !== null) {
+    echo "Extraction method: {$method}\n";
+}
+```
--- a/docs/snippets/php/api/error_handling.md
+++ b/docs/snippets/php/api/error_handling.md
@@ -0,0 +1,20 @@
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+use Kreuzberg\KreuzbergException;
+
+$settings = new ExtractionConfig();
+
+try {
+    $extraction = Kreuzberg::extractFileSync('document.pdf', null, $settings);
+    echo $extraction->getContent();
+} catch (KreuzbergException $error) {
+    // The extension reports failures as KreuzbergException; the error
+    // context is carried in the exception message.
+    printf("Extraction failed: %s\n", $error->getMessage());
+    printf("Error code: %s\n", $error->getCode());
+}
+```
--- a/docs/snippets/php/api/error_handling_extract.md
+++ b/docs/snippets/php/api/error_handling_extract.md
@@ -0,0 +1,46 @@
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+use Kreuzberg\KreuzbergException;
+
+/**
+ * Extract the text content of an in-memory document using a default configuration.
+ *
+ * @param string $bytes     Raw document bytes.
+ * @param string $mime_type MIME type handed to the extractor, e.g. "application/pdf".
+ *
+ * @return string The content of the extraction result.
+ *
+ * @throws KreuzbergException Propagated from the extraction call; handled by the caller below.
+ */
+function extract_text(string $bytes, string $mime_type): string {
+    $config = new ExtractionConfig();
+    $result = Kreuzberg::extractBytesSync($bytes, $mime_type, $config);
+    return $result->getContent();
+}
+
+// file_get_contents() returns false on failure. Check for it explicitly rather
+// than by truthiness (a payload of "0" is falsy too) and stop before an
+// unreadable document reaches the extractor.
+$bytes = file_get_contents('document.pdf');
+if ($bytes === false) {
+    throw new RuntimeException('Unable to read document.pdf');
+}
+
+try {
+    $text = extract_text($bytes, 'application/pdf');
+    echo "Extracted " . strlen($text) . " chars\n";
+} catch (KreuzbergException $e) {
+    // All Kreuzberg errors are KreuzbergException, so the message is the
+    // only place to tell error types apart; the first matching arm wins.
+    $message = $e->getMessage();
+    echo match (true) {
+        str_contains($message, 'not supported') => "Format not supported\n",
+        str_contains($message, 'OCR') => "OCR failed: " . $message . "\n",
+        default => "Error: " . $message . "\n",
+    };
+}
+```
--- a/docs/snippets/php/api/extract_bytes_async.md
+++ b/docs/snippets/php/api/extract_bytes_async.md
@@ -0,0 +1,27 @@
+<!-- snippet:syntax-only -->
+
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+
+// PHP does not have native async/await. The ext-php-rs binding blocks internally
+// on its Tokio runtime until extraction finishes. For concurrent operations, use
+// batchExtractBytesSync or batchExtractBytesAsync with multiple items instead.
+
+// file_get_contents() returns false on failure; stop early so a failed read is
+// never handed to the extractor in place of the document bytes.
+$content = file_get_contents('document.pdf');
+if ($content === false) {
+    throw new RuntimeException('Unable to read document.pdf');
+}
+
+$config = new ExtractionConfig();
+// Note: This is labeled "async" in the API but blocks in PHP like the sync version
+$result = Kreuzberg::extractBytesAsync($content, 'application/pdf', $config);
+
+echo $result->getContent();
+echo 'Tables: ' . count($result->getTables()) . "\n";
+```
--- a/docs/snippets/php/api/extract_bytes_sync.md
+++ b/docs/snippets/php/api/extract_bytes_sync.md
@@ -0,0 +1,14 @@
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+
+$content = file_get_contents('document.pdf');
+$config = new ExtractionConfig();
+$result = Kreuzberg::extractBytesSync($content, 'application/pdf', $config);
+
+echo $result->getContent();
+echo 'Tables: ' . count($result->getTables()) . "\n";
+```
--- a/docs/snippets/php/api/extract_file_async.md
+++ b/docs/snippets/php/api/extract_file_async.md
@@ -0,0 +1,17 @@
+<!-- snippet:syntax-only -->
+
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+
+// PHP has no native async/await. Although the API labels this call "async",
+// the ext-php-rs binding blocks internally (via Tokio) until it returns, so
+// in PHP it behaves like the sync version.
+$extraction = Kreuzberg::extractFileAsync('document.pdf', null, new ExtractionConfig());
+
+echo $extraction->getContent();
+printf("MIME type: %s\nTables: %d\n", $extraction->getMimeType(), count($extraction->getTables()));
+```
--- a/docs/snippets/php/api/extract_file_sync.md
+++ b/docs/snippets/php/api/extract_file_sync.md
@@ -0,0 +1,18 @@
+```php title="PHP"
+<?php
+declare(strict_types=1);
+
+use Kreuzberg\Kreuzberg;
+use Kreuzberg\ExtractionConfig;
+
+// Extract straight from a path on disk with a default configuration.
+$settings = new ExtractionConfig();
+$document = Kreuzberg::extractFileSync('document.pdf', null, $settings);
+
+echo $document->getContent();
+
+// Follow the content with the reported MIME type and the number of tables.
+$mimeType = $document->getMimeType();
+$tableCount = count($document->getTables());
+echo "MIME type: {$mimeType}\nTables: {$tableCount}\n";
+```