Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,47 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\OcrConfig;
use Kreuzberg\ChunkingConfig;
use Kreuzberg\LanguageDetectionConfig;
use Kreuzberg\TokenReductionOptions;
use Kreuzberg\PostProcessorConfig;
use Kreuzberg\EmbeddingConfig;
// Advanced configuration combining multiple features in one ExtractionConfig.
$config = new ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
    chunking: new ChunkingConfig(
        maxCharacters: 1000,
        overlap: 200,
    ),
    languageDetection: new LanguageDetectionConfig(
        enabled: true,
        minConfidence: 0.8,
        detectMultiple: false,
    ),
    tokenReduction: new TokenReductionOptions(
        mode: 'moderate',
        preserveImportantWords: true,
    ),
    postprocessor: new PostProcessorConfig(
        enabled: true,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// mb_strlen counts characters; strlen counts bytes and would over-report
// multi-byte text. NOTE(review): assumes the extracted content is UTF-8.
echo "Content length: " . mb_strlen($result->getContent(), 'UTF-8') . " characters\n";

// Fetch the detected languages once; print them only when the list is non-empty.
$languages = $result->getDetectedLanguages();
if ($languages) {
    echo "Languages: " . implode(', ', $languages) . "\n";
}
?>
```

View File

@@ -0,0 +1,56 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
// Basic chunking
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 1000,
        overlap: 200,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Read the chunk list once and reuse it for the count and the loop.
$chunks = $result->getChunks();
echo "Number of chunks: " . count($chunks) . "\n";

foreach ($chunks as $chunk) {
    // mb_strlen reports characters (matching the label); strlen would
    // report bytes. NOTE(review): assumes chunk content is UTF-8.
    echo "Chunk size: " . mb_strlen($chunk->getContent(), 'UTF-8') . " characters\n";
}
?>
```
```php title="PHP - Markdown with Heading Context"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
// Chunk a Markdown document and ask for heading context on each chunk.
$config = new ExtractionConfig(
    chunking: new ChunkingConfig(
        maxCharacters: 500,
        overlap: 50,
        chunkerType: 'markdown',
        prependHeadingContext: true,
    ),
);

$result = Kreuzberg::extractFileSync('document.md', null, $config);

foreach ($result->getChunks() as $chunk) {
    // Metadata may be absent; the nullsafe call yields null in that case
    // and the heading loop is skipped.
    $headingContext = $chunk->getMetadata()?->getHeadingContext();
    if ($headingContext) {
        foreach ($headingContext->getHeadings() as $heading) {
            echo "Heading L" . $heading->getLevel() . ": " . $heading->getText() . "\n";
        }
    }

    // mb_substr cuts on character boundaries; a byte-wise substr could
    // split a multi-byte character and print invalid UTF-8.
    // NOTE(review): assumes chunk content is UTF-8.
    echo "Content: " . mb_substr($chunk->getContent(), 0, 100, 'UTF-8') . "...\n";
}
?>
```

View File

@@ -0,0 +1,17 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
// Minimal configuration: only useCache and enableQualityProcessing are set.
$extractionConfig = new ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
);

// Run the extraction, then print the extracted content.
$extraction = Kreuzberg::extractFileSync('document.pdf', null, $extractionConfig);
$text = $extraction->getContent();
echo $text;
?>
```

View File

@@ -0,0 +1,13 @@
```php title="PHP"
<?php
declare(strict_types=1);
// ExtractionConfig must be imported; without this line the unqualified
// name resolves to a global \ExtractionConfig class.
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Discover configuration from the file system; fall back to a default
// ExtractionConfig when discover() returns null.
$config = ExtractionConfig::discover() ?? new ExtractionConfig();

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);
echo $result->getContent();
?>
```

View File

@@ -0,0 +1,21 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\OcrConfig;
// OCR configuration using the 'tesseract' backend with language 'eng'.
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng',
    ),
);

$result = Kreuzberg::extractFileSync('scanned.pdf', null, $config);

// mb_strlen counts characters (as the label says); strlen counts bytes.
// NOTE(review): assumes the extracted content is UTF-8.
echo "Content length: " . mb_strlen($result->getContent(), 'UTF-8') . " characters\n";
echo "Tables detected: " . count($result->getTables()) . "\n";
?>
```

View File

@@ -0,0 +1,29 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\OcrConfig;
use Kreuzberg\ChunkingConfig;
use Kreuzberg\TesseractConfig;
// Combined configuration: cache, OCR (with a nested TesseractConfig),
// chunking and quality processing.
$config = new ExtractionConfig(
    useCache: true,
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+deu',
        tesseractConfig: new TesseractConfig(psm: 6),
    ),
    chunking: new ChunkingConfig(
        maxCharacters: 1000,
        overlap: 200,
    ),
    enableQualityProcessing: true,
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// mb_strlen counts characters (as the label says); strlen counts bytes.
// NOTE(review): assumes the extracted content is UTF-8.
echo "Content length: " . mb_strlen($result->getContent(), 'UTF-8') . " characters\n";
?>
```

View File

@@ -0,0 +1,16 @@
```php title="Document Structure Config (PHP)"
<?php
declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Request the document structure alongside the extraction result.
$config = new ExtractionConfig(includeDocumentStructure: true);

// NOTE(review): the config is passed as the third argument with null in
// the second position — presumably an optional MIME type hint; confirm
// against the API reference.
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// The structure is only walked when the result carries one.
if ($result->document !== null) {
    foreach ($result->document->nodes as $node) {
        echo "[{$node->content->nodeType}]\n";
    }
}
?>
```

View File

@@ -0,0 +1,42 @@
```php title="Element-Based Output (PHP)"
<?php
declare(strict_types=1);

use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Configure element-based output
$config = new ExtractionConfig();
$config->setOutputFormat('element_based');

// Extract document
// NOTE(review): the config is passed as the third argument with null in
// the second position — presumably an optional MIME type hint; confirm
// against the API reference.
$result = Kreuzberg::extractFileSync('document.pdf', null, $config);
$elements = $result->getElements();

// Access elements
foreach ($elements as $element) {
    $metadata = $element->getMetadata();

    echo "Type: " . $element->getElementType() . "\n";
    // mb_substr cuts on character boundaries; a byte-wise substr could
    // split a multi-byte character. NOTE(review): assumes UTF-8 text.
    echo "Text: " . mb_substr($element->getText(), 0, 100, 'UTF-8') . "\n";

    // Page number and coordinates are printed only when they are truthy.
    $pageNumber = $metadata->getPageNumber();
    if ($pageNumber) {
        echo "Page: " . $pageNumber . "\n";
    }

    $coords = $metadata->getCoordinates();
    if ($coords) {
        printf(
            "Coords: (%s, %s) - (%s, %s)\n",
            $coords->getLeft(),
            $coords->getTop(),
            $coords->getRight(),
            $coords->getBottom(),
        );
    }

    echo "---\n";
}

// Filter by element type
$titles = array_filter(
    $elements,
    static fn ($element): bool => $element->getElementType() === 'title',
);

foreach ($titles as $title) {
    // 'level' is read from the additional metadata; 'unknown' when absent.
    $level = $title->getMetadata()->getAdditional()['level'] ?? 'unknown';
    echo "[{$level}] {$title->getText()}\n";
}
?>
```

View File

@@ -0,0 +1,27 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ChunkingConfig;
use Kreuzberg\EmbeddingConfig;
// Embedding settings that are attached to the chunking configuration below.
$embedding = new EmbeddingConfig(
    model: 'balanced',
    batchSize: 16,
    normalize: true,
    showDownloadProgress: true,
);

// Chunking settings carrying the embedding configuration.
$chunking = new ChunkingConfig(
    maxCharacters: 1000,
    overlap: 200,
    embedding: $embedding,
);

$config = new ExtractionConfig(chunking: $chunking);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Report how many chunks the result contains.
$chunkCount = count($result->getChunks());
echo "Chunks with embeddings: " . $chunkCount . "\n";
?>
```

View File

@@ -0,0 +1,21 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\HtmlOutputConfig;
// HTML output settings; 'github' is the theme value handed to HtmlOutputConfig.
$htmlSettings = new HtmlOutputConfig(theme: 'github');

$config = new ExtractionConfig(
    resultFormat: 'html',
    htmlOutput: $htmlSettings,
);

$document = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Output HTML with kb-* CSS classes
$html = $document->getContent();
echo $html;
?>
```

View File

@@ -0,0 +1,26 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\KeywordConfig;
// Keyword extraction settings passed to ExtractionConfig below.
$keywordSettings = new KeywordConfig(
    algorithm: 'yake',
    maxKeywords: 10,
    minScore: 0.1,
    language: 'en',
);

$config = new ExtractionConfig(keywords: $keywordSettings);

$extraction = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Print one keyword per line, but only when the keyword list is truthy.
if ($extraction->getKeywords()) {
    foreach ($extraction->getKeywords() as $term) {
        echo $term . "\n";
    }
}
?>
```

View File

@@ -0,0 +1,22 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\LanguageDetectionConfig;
// Language detection settings passed to ExtractionConfig below.
$detection = new LanguageDetectionConfig(
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: true,
);

$config = new ExtractionConfig(languageDetection: $detection);

$extraction = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Print the language and confidence values reported by the result.
$language = $extraction->getLanguage();
echo "Detected language: " . $language . "\n";

$confidence = $extraction->getLanguageConfidence();
echo "Confidence: " . $confidence . "\n";
?>
```

View File

@@ -0,0 +1,24 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\ImageExtractionConfig;
// Image extraction settings passed to ExtractionConfig below.
$imageSettings = new ImageExtractionConfig(
    extractImages: true,
    targetDpi: 300,
    maxImageDimension: 4096,
    autoAdjustDpi: true,
    minDpi: 150,
    maxDpi: 600,
);

$config = new ExtractionConfig(images: $imageSettings);

$extraction = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Report how many images the result contains.
$imageCount = count($extraction->getImages());
echo "Extracted images: " . $imageCount . "\n";
?>
```

View File

@@ -0,0 +1,33 @@
```php title="PHP"
<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use function Kreuzberg\extract_file;
use Kreuzberg\Config\ExtractionConfig;
use Kreuzberg\Config\PdfConfig;
/**
 * PDF configuration with hierarchy detection
 */
$config = new ExtractionConfig(
    pdf: new PdfConfig(
        extractImages: true,
        extractMetadata: true,
        passwords: ['password1', 'password2'],
        hierarchy: [
            'enabled' => true,
            'k_clusters' => 6,
            'include_bbox' => true,
            'ocr_coverage_threshold' => 0.5,
        ],
    ),
);

$result = extract_file('document.pdf', config: $config);

// mb_strlen counts characters (as the label says); strlen counts bytes.
// NOTE(review): assumes the extracted content is UTF-8.
echo "Content length: " . mb_strlen($result->content, 'UTF-8') . " characters\n";

// Cast the metadata to an array so its keys can be listed.
echo "Metadata: " . implode(', ', array_keys((array) $result->metadata)) . "\n";
```

View File

@@ -0,0 +1,26 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\PdfConfig;
use Kreuzberg\HierarchyConfig;
// Hierarchy settings, wrapped in a PdfConfig and then in the ExtractionConfig.
$hierarchy = new HierarchyConfig(
    enabled: true,
    detectionThreshold: 0.75,
    ocrCoverageThreshold: 0.8,
    minLevel: 1,
    maxLevel: 5,
);

$pdfOptions = new PdfConfig(hierarchy: $hierarchy);
$config = new ExtractionConfig(pdfOptions: $pdfOptions);

$extraction = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Report the number of entries returned by getHierarchy().
$levelCount = count($extraction->getHierarchy());
echo "Hierarchy levels: " . $levelCount . "\n";
?>
```

View File

@@ -0,0 +1,23 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\PostProcessorConfig;
// Enable post-processing and name the processors to run.
$config = new ExtractionConfig(
    postprocessor: new PostProcessorConfig(
        enabled: true,
        enabledProcessors: [
            'whitespace_normalizer',
            'unicode_normalizer',
        ],
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

// Preview the first 100 characters. mb_substr cuts on character
// boundaries; a byte-wise substr could split a multi-byte character and
// print invalid UTF-8. NOTE(review): assumes the content is UTF-8.
echo "Processed content: " . mb_substr($result->getContent(), 0, 100, 'UTF-8') . "...\n";
?>
```

View File

@@ -0,0 +1,20 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
// Configuration with enableQualityProcessing and useCache both set to true.
$qualityConfig = new ExtractionConfig(
    enableQualityProcessing: true,
    useCache: true,
);

$extraction = Kreuzberg::extractFileSync('document.pdf', null, $qualityConfig);

// Print the quality score reported by the result.
$score = $extraction->getQualityScore();
echo "Quality score: " . $score . "\n";

// The processing time is printed only when the getter returns a truthy value.
if ($extraction->getProcessingTime()) {
    $elapsed = $extraction->getProcessingTime();
    echo "Processing time: " . $elapsed . "ms\n";
}
?>
```

View File

@@ -0,0 +1,25 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\OcrConfig;
use Kreuzberg\TesseractConfig;
// OCR configuration with a nested TesseractConfig (psm and oem values).
$config = new ExtractionConfig(
    ocr: new OcrConfig(
        backend: 'tesseract',
        language: 'eng+deu',
        tesseractConfig: new TesseractConfig(
            psm: 6,
            oem: 3,
        ),
    ),
);

$result = Kreuzberg::extractFileSync('scanned.pdf', null, $config);

// Preview the first 100 characters. mb_substr cuts on character
// boundaries; a byte-wise substr could split a multi-byte character
// (likely with 'deu' text such as umlauts) and print invalid UTF-8.
// NOTE(review): assumes the content is UTF-8.
echo "OCR text: " . mb_substr($result->getContent(), 0, 100, 'UTF-8') . "...\n";
?>
```

View File

@@ -0,0 +1,21 @@
```php title="PHP"
<?php
declare(strict_types=1);
use Kreuzberg\Kreuzberg;
use Kreuzberg\ExtractionConfig;
use Kreuzberg\TokenReductionOptions;
// Token reduction configuration: 'moderate' mode, preserving important words.
$config = new ExtractionConfig(
    tokenReduction: new TokenReductionOptions(
        mode: 'moderate',
        preserveImportantWords: true,
    ),
);

$result = Kreuzberg::extractFileSync('document.pdf', null, $config);

echo "Original token count: " . $result->getTokenCount() . "\n";

// Preview the first 100 characters. mb_substr cuts on character
// boundaries; a byte-wise substr could split a multi-byte character and
// print invalid UTF-8. NOTE(review): assumes the content is UTF-8.
echo "Reduced content: " . mb_substr($result->getContent(), 0, 100, 'UTF-8') . "...\n";
?>
```