This commit is contained in:
47
docs/snippets/php/config/advanced_config.md
Normal file
47
docs/snippets/php/config/advanced_config.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\LanguageDetectionConfig;
use Kreuzberg\OcrConfig;
use Kreuzberg\PostProcessorConfig;
use Kreuzberg\TokenReductionOptions;

// Advanced configuration combining multiple features: caching, quality
// processing, OCR, chunking, language detection, token reduction and
// post-processing, all in a single ExtractionConfig.
$config = new ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
    chunking: new ChunkingConfig(
        maxCharacters: 1000,
        overlap: 200,
    ),
    languageDetection: new LanguageDetectionConfig(
        enabled: true,
        minConfidence: 0.8,
        detectMultiple: false,
    ),
    tokenReduction: new TokenReductionOptions(
        mode: 'moderate',
        preserveImportantWords: true,
    ),
    postprocessor: new PostProcessorConfig(
        enabled: true,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

echo "Content length: " . strlen($result->getContent()) . " characters\n";

// Read the detected languages once and reuse the value for both the
// emptiness check and the printed list.
$detectedLanguages = $result->getDetectedLanguages();
if ($detectedLanguages) {
    echo "Languages: " . implode(', ', $detectedLanguages) . "\n";
}
?>
|
||||
```
|
||||
56
docs/snippets/php/config/chunking_config.md
Normal file
56
docs/snippets/php/config/chunking_config.md
Normal file
@@ -0,0 +1,56 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Basic chunking: chunks of up to 1000 characters with an overlap of 200.
$chunkingOptions = new ChunkingConfig(maxCharacters: 1000, overlap: 200);
$config = new ExtractionConfig(chunking: $chunkingOptions);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Report the total number of chunks, then the size of each one.
printf("Number of chunks: %d\n", count($result->getChunks()));
foreach ($result->getChunks() as $piece) {
    printf("Chunk size: %d characters\n", strlen($piece->getContent()));
}
?>
|
||||
```
|
||||
|
||||
```php title="PHP - Markdown with Heading Context"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Markdown chunker with heading context prepended to each chunk.
$chunkingOptions = new ChunkingConfig(
    maxCharacters: 500,
    overlap: 50,
    chunkerType: 'markdown',
    prependHeadingContext: true,
);
$config = new ExtractionConfig(chunking: $chunkingOptions);

$result = Kreuzberg::extractFileSync('document.md', null, $config);

foreach ($result->getChunks() as $chunk) {
    $chunkMeta = $chunk->getMetadata();

    // Only list headings when the chunk has metadata with a heading context.
    $hasHeadingContext = $chunkMeta && $chunkMeta->getHeadingContext();
    if ($hasHeadingContext) {
        foreach ($chunkMeta->getHeadingContext()->getHeadings() as $heading) {
            printf("Heading L%s: %s\n", $heading->getLevel(), $heading->getText());
        }
    }

    // Print the first 100 bytes of the chunk as a preview.
    $preview = substr($chunk->getContent(), 0, 100);
    echo "Content: {$preview}...\n";
}
?>
|
||||
```
|
||||
17
docs/snippets/php/config/config_basic.md
Normal file
17
docs/snippets/php/config/config_basic.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Minimal configuration: caching and quality processing switched on.
$config = new ExtractionConfig(useCache: true, enableQualityProcessing: true);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Print the extracted content.
$content = $result->getContent();
echo $content;
?>
|
||||
```
|
||||
13
docs/snippets/php/config/config_discover.md
Normal file
13
docs/snippets/php/config/config_discover.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Discover configuration from the file system; fall back to a default
// ExtractionConfig when discover() returns null.
$config = ExtractionConfig::discover() ?? new ExtractionConfig();
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

echo $result->getContent();
?>
|
||||
```
|
||||
21
docs/snippets/php/config/config_ocr.md
Normal file
21
docs/snippets/php/config/config_ocr.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\OcrConfig;

// OCR via the 'tesseract' backend with language 'eng'.
$ocrSettings = new OcrConfig(backend: 'tesseract', language: 'eng');
$config = new ExtractionConfig(ocr: $ocrSettings);

$result = Kreuzberg::extractFileSync('scanned.pdf', null, $config);

// Report the content length, then the number of tables in the result.
$contentLength = strlen($result->getContent());
echo "Content length: {$contentLength} characters\n";

$tableCount = count($result->getTables());
echo "Tables detected: {$tableCount}\n";
?>
|
||||
```
|
||||
29
docs/snippets/php/config/config_programmatic.md
Normal file
29
docs/snippets/php/config/config_programmatic.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\OcrConfig;
use Kreuzberg\TesseractConfig;

// Build each nested option object first, then assemble the full config.
$tesseractOptions = new TesseractConfig(psm: 6);
$ocrOptions = new OcrConfig(
    backend: 'tesseract',
    language: 'eng+deu',
    tesseractConfig: $tesseractOptions,
);
$chunkingOptions = new ChunkingConfig(maxCharacters: 1000, overlap: 200);

$config = new ExtractionConfig(
    useCache: true,
    ocr: $ocrOptions,
    chunking: $chunkingOptions,
    enableQualityProcessing: true,
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Content length: %d characters\n", strlen($result->getContent()));
?>
|
||||
```
|
||||
16
docs/snippets/php/config/document_structure_config.md
Normal file
16
docs/snippets/php/config/document_structure_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```php title="Document Structure Config (PHP)"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Request the document structure as part of the extraction result.
$config = new ExtractionConfig(includeDocumentStructure: true);

// The second positional argument is left as null; the config is passed third.
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// The document is null-checked before its nodes are iterated.
if ($result->document !== null) {
    foreach ($result->document->nodes as $node) {
        echo "[{$node->content->nodeType}]\n";
    }
}
?>
|
||||
```
|
||||
42
docs/snippets/php/config/element_based_output.md
Normal file
42
docs/snippets/php/config/element_based_output.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```php title="Element-Based Output (PHP)"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Configure element-based output
$config = new ExtractionConfig();
$config->setOutputFormat('element_based');

// Extract document (second positional argument left as null; config is passed third)
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Access elements
foreach ($result->getElements() as $element) {
    echo "Type: " . $element->getElementType() . "\n";
    echo "Text: " . substr($element->getText(), 0, 100) . "\n";

    // Fetch the metadata once per element and reuse it below.
    $metadata = $element->getMetadata();

    $pageNumber = $metadata->getPageNumber();
    if ($pageNumber) {
        echo "Page: " . $pageNumber . "\n";
    }

    $coords = $metadata->getCoordinates();
    if ($coords) {
        echo sprintf(
            "Coords: (%s, %s) - (%s, %s)\n",
            $coords->getLeft(),
            $coords->getTop(),
            $coords->getRight(),
            $coords->getBottom(),
        );
    }

    echo "---\n";
}

// Filter by element type (array_filter preserves the original keys)
$titles = array_filter(
    $result->getElements(),
    static fn ($element): bool => $element->getElementType() === 'title',
);

foreach ($titles as $title) {
    // Fall back to 'unknown' when the additional metadata has no 'level' entry.
    $level = $title->getMetadata()->getAdditional()['level'] ?? 'unknown';
    echo "[{$level}] {$title->getText()}\n";
}
?>
|
||||
```
|
||||
27
docs/snippets/php/config/embedding_config.md
Normal file
27
docs/snippets/php/config/embedding_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ChunkingConfig;
use Kreuzberg\EmbeddingConfig;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Embedding options are attached to the chunking configuration.
$embeddingOptions = new EmbeddingConfig(
    model: 'balanced',
    batchSize: 16,
    normalize: true,
    showDownloadProgress: true,
);
$chunkingOptions = new ChunkingConfig(
    maxCharacters: 1000,
    overlap: 200,
    embedding: $embeddingOptions,
);
$config = new ExtractionConfig(chunking: $chunkingOptions);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Chunks with embeddings: %d\n", count($result->getChunks()));
?>
|
||||
```
|
||||
21
docs/snippets/php/config/html_output.md
Normal file
21
docs/snippets/php/config/html_output.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\HtmlOutputConfig;
use Kreuzberg\Kreuzberg;

// Select the 'html' result format and the 'github' theme.
$htmlOptions = new HtmlOutputConfig(theme: 'github');
$config = new ExtractionConfig(resultFormat: 'html', htmlOutput: $htmlOptions);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Print the HTML output (with kb-* CSS classes)
$html = $result->getContent();
echo $html;
?>
|
||||
```
|
||||
26
docs/snippets/php/config/keyword_extraction_config.md
Normal file
26
docs/snippets/php/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\KeywordConfig;
use Kreuzberg\Kreuzberg;

// Keyword extraction options: 'yake' algorithm, up to 10 keywords,
// minimum score 0.1, language 'en'.
$keywordOptions = new KeywordConfig(
    algorithm: 'yake',
    maxKeywords: 10,
    minScore: 0.1,
    language: 'en',
);
$config = new ExtractionConfig(keywords: $keywordOptions);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Print one keyword per line when any keywords were returned.
if ($result->getKeywords()) {
    foreach ($result->getKeywords() as $term) {
        echo "{$term}\n";
    }
}
?>
|
||||
```
|
||||
22
docs/snippets/php/config/language_detection_config.md
Normal file
22
docs/snippets/php/config/language_detection_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\LanguageDetectionConfig;

// Enable language detection with a minimum confidence of 0.8 and
// multiple-language detection switched on.
$detectionOptions = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: true,
);
$config = new ExtractionConfig(languageDetection: $detectionOptions);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Detected language: %s\n", $result->getLanguage());
printf("Confidence: %s\n", $result->getLanguageConfidence());
?>
|
||||
```
|
||||
24
docs/snippets/php/config/ocr_dpi_config.md
Normal file
24
docs/snippets/php/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\ImageExtractionConfig;
use Kreuzberg\Kreuzberg;

// Image extraction with DPI settings: target 300, auto-adjust enabled
// with min 150 and max 600, and a maximum image dimension of 4096.
$imageOptions = new ImageExtractionConfig(
    extractImages: true,
    targetDpi: 300,
    maxImageDimension: 4096,
    autoAdjustDpi: true,
    minDpi: 150,
    maxDpi: 600,
);
$config = new ExtractionConfig(images: $imageOptions);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Extracted images: %d\n", count($result->getImages()));
?>
|
||||
```
|
||||
33
docs/snippets/php/config/pdf_config.md
Normal file
33
docs/snippets/php/config/pdf_config.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

require_once __DIR__ . '/vendor/autoload.php';

use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PdfConfig;

use function Kreuzberg\extract_file;

/**
 * PDF configuration with hierarchy detection
 */
$hierarchyOptions = [
    'enabled' => true,
    'k_clusters' => 6,
    'include_bbox' => true,
    'ocr_coverage_threshold' => 0.5,
];

$pdfOptions = new PdfConfig(
    extractImages: true,
    extractMetadata: true,
    passwords: ['password1', 'password2'],
    hierarchy: $hierarchyOptions,
);

$config = new ExtractionConfig(pdf: $pdfOptions);

$result = extract_file('document.pdf', config: $config);

printf("Content length: %d characters\n", strlen($result->content));

// List the metadata keys by casting the metadata to an array.
$metadataKeys = array_keys((array) $result->metadata);
echo "Metadata: " . implode(', ', $metadataKeys) . "\n";
|
||||
```
|
||||
26
docs/snippets/php/config/pdf_hierarchy_config.md
Normal file
26
docs/snippets/php/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\HierarchyConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\PdfConfig;

// Hierarchy detection options, nested inside the PDF options.
$hierarchyOptions = new HierarchyConfig(
    enabled: true,
    detectionThreshold: 0.75,
    ocrCoverageThreshold: 0.8,
    minLevel: 1,
    maxLevel: 5,
);
$pdfSettings = new PdfConfig(hierarchy: $hierarchyOptions);
$config = new ExtractionConfig(pdfOptions: $pdfSettings);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Hierarchy levels: %d\n", count($result->getHierarchy()));
?>
|
||||
```
|
||||
23
docs/snippets/php/config/postprocessor_config.md
Normal file
23
docs/snippets/php/config/postprocessor_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\PostProcessorConfig;

// Enable post-processing with an explicit list of processors.
$processorNames = ['whitespace_normalizer', 'unicode_normalizer'];
$postProcessing = new PostProcessorConfig(
    enabled: true,
    enabledProcessors: $processorNames,
);
$config = new ExtractionConfig(postprocessor: $postProcessing);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Show the first 100 bytes of the processed content.
$preview = substr($result->getContent(), 0, 100);
echo "Processed content: {$preview}...\n";
?>
|
||||
```
|
||||
20
docs/snippets/php/config/quality_processing_config.md
Normal file
20
docs/snippets/php/config/quality_processing_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Quality processing and caching both enabled.
$config = new ExtractionConfig(enableQualityProcessing: true, useCache: true);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Quality score: %s\n", $result->getQualityScore());

// The processing time is printed only when the getter returns a truthy value.
if ($result->getProcessingTime()) {
    printf("Processing time: %sms\n", $result->getProcessingTime());
}
?>
|
||||
```
|
||||
25
docs/snippets/php/config/tesseract_config.md
Normal file
25
docs/snippets/php/config/tesseract_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\OcrConfig;
use Kreuzberg\TesseractConfig;

// Tesseract-specific options (psm 6, oem 3) nested inside the OCR config.
$tesseractOptions = new TesseractConfig(psm: 6, oem: 3);
$ocrOptions = new OcrConfig(
    backend: 'tesseract',
    language: 'eng+deu',
    tesseractConfig: $tesseractOptions,
);
$config = new ExtractionConfig(ocr: $ocrOptions);

$result = Kreuzberg::extractFileSync('scanned.pdf', null, $config);

// Show the first 100 bytes of the OCR output.
$preview = substr($result->getContent(), 0, 100);
echo "OCR text: {$preview}...\n";
?>
|
||||
```
|
||||
21
docs/snippets/php/config/token_reduction_config.md
Normal file
21
docs/snippets/php/config/token_reduction_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```php title="PHP"
|
||||
<?php

declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
use Kreuzberg\TokenReductionOptions;

// Token reduction in 'moderate' mode with preserveImportantWords enabled.
$reductionOptions = new TokenReductionOptions(
    mode: 'moderate',
    preserveImportantWords: true,
);
$config = new ExtractionConfig(tokenReduction: $reductionOptions);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

printf("Original token count: %s\n", $result->getTokenCount());

// Show the first 100 bytes of the reduced content.
$preview = substr($result->getContent(), 0, 100);
echo "Reduced content: {$preview}...\n";
?>
|
||||
```
|
||||
Reference in New Issue
Block a user