198 lines
5.4 KiB
PHP
198 lines
5.4 KiB
PHP
|
|
```php title="language_detection.php"
|
||
|
|
<?php
|
||
|
|
|
||
|
|
declare(strict_types=1);
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Language Detection
|
||
|
|
*
|
||
|
|
* Automatically detect the language of extracted document content.
|
||
|
|
* Useful for routing documents to language-specific processing pipelines.
|
||
|
|
*/
|
||
|
|
|
||
|
|
require_once __DIR__ . '/vendor/autoload.php';
|
||
|
|
|
||
|
|
use Kreuzberg\Kreuzberg;
|
||
|
|
use Kreuzberg\Config\ExtractionConfig;
|
||
|
|
use Kreuzberg\Config\LanguageDetectionConfig;
|
||
|
|
|
||
|
|
$config = new ExtractionConfig(
|
||
|
|
languageDetection: new LanguageDetectionConfig(
|
||
|
|
enabled: true,
|
||
|
|
minConfidence: 0.9,
|
||
|
|
detectMultiple: true
|
||
|
|
)
|
||
|
|
);
|
||
|
|
|
||
|
|
$kreuzberg = new Kreuzberg($config);
|
||
|
|
$result = $kreuzberg->extractFile('document.pdf');
|
||
|
|
|
||
|
|
echo "Language Detection Results:\n";
|
||
|
|
echo str_repeat('=', 60) . "\n";
|
||
|
|
echo "Document: document.pdf\n";
|
||
|
|
echo "Content length: " . strlen($result->content) . " characters\n\n";
|
||
|
|
|
||
|
|
$detectedLanguages = $result->detectedLanguages ?? [];
|
||
|
|
|
||
|
|
if (!empty($detectedLanguages)) {
|
||
|
|
echo "Detected languages: " . implode(', ', $detectedLanguages) . "\n";
|
||
|
|
|
||
|
|
$primaryLanguage = $detectedLanguages[0];
|
||
|
|
echo "Primary language: $primaryLanguage\n\n";
|
||
|
|
|
||
|
|
if (isset($result->metadata['language_confidence'])) {
|
||
|
|
echo "Language confidence scores:\n";
|
||
|
|
foreach ($result->metadata['language_confidence'] as $lang => $confidence) {
|
||
|
|
echo sprintf(" %-10s: %.1f%%\n", $lang, $confidence * 100);
|
||
|
|
}
|
||
|
|
echo "\n";
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
echo "No language detected or confidence too low.\n";
|
||
|
|
echo "Try lowering minConfidence threshold.\n\n";
|
||
|
|
}
|
||
|
|
|
||
|
|
if (!empty($detectedLanguages)) {
|
||
|
|
$primaryLanguage = $detectedLanguages[0];
|
||
|
|
|
||
|
|
match ($primaryLanguage) {
|
||
|
|
'en', 'eng' => print("Processing as English document...\n"),
|
||
|
|
'es', 'spa' => print("Processing as Spanish document...\n"),
|
||
|
|
'fr', 'fra' => print("Processing as French document...\n"),
|
||
|
|
'de', 'deu' => print("Processing as German document...\n"),
|
||
|
|
'zh', 'zho' => print("Processing as Chinese document...\n"),
|
||
|
|
default => print("Processing as $primaryLanguage document...\n"),
|
||
|
|
};
|
||
|
|
}
|
||
|
|
|
||
|
|
echo "\n" . str_repeat('=', 60) . "\n";
|
||
|
|
echo "Testing Different Confidence Thresholds:\n";
|
||
|
|
echo str_repeat('=', 60) . "\n";
|
||
|
|
|
||
|
|
$thresholds = [0.5, 0.7, 0.9, 0.95];
|
||
|
|
|
||
|
|
foreach ($thresholds as $threshold) {
|
||
|
|
$thresholdConfig = new ExtractionConfig(
|
||
|
|
languageDetection: new LanguageDetectionConfig(
|
||
|
|
enabled: true,
|
||
|
|
minConfidence: $threshold,
|
||
|
|
detectMultiple: true
|
||
|
|
)
|
||
|
|
);
|
||
|
|
|
||
|
|
$kreuzberg = new Kreuzberg($thresholdConfig);
|
||
|
|
$result = $kreuzberg->extractFile('document.pdf');
|
||
|
|
|
||
|
|
$languages = $result->detectedLanguages ?? [];
|
||
|
|
|
||
|
|
echo sprintf("Threshold %.2f: ", $threshold);
|
||
|
|
if (!empty($languages)) {
|
||
|
|
echo implode(', ', $languages) . "\n";
|
||
|
|
} else {
|
||
|
|
echo "No languages detected\n";
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
function getLanguageName(string $code): string
|
||
|
|
{
|
||
|
|
$languageNames = [
|
||
|
|
'en' => 'English',
|
||
|
|
'es' => 'Spanish',
|
||
|
|
'fr' => 'French',
|
||
|
|
'de' => 'German',
|
||
|
|
'it' => 'Italian',
|
||
|
|
'pt' => 'Portuguese',
|
||
|
|
'ru' => 'Russian',
|
||
|
|
'zh' => 'Chinese',
|
||
|
|
'ja' => 'Japanese',
|
||
|
|
'ko' => 'Korean',
|
||
|
|
'ar' => 'Arabic',
|
||
|
|
'hi' => 'Hindi',
|
||
|
|
'nl' => 'Dutch',
|
||
|
|
'pl' => 'Polish',
|
||
|
|
'tr' => 'Turkish',
|
||
|
|
];
|
||
|
|
|
||
|
|
return $languageNames[$code] ?? ucfirst($code);
|
||
|
|
}
|
||
|
|
|
||
|
|
echo "\n" . str_repeat('=', 60) . "\n";
|
||
|
|
echo "Detected Languages (Full Names):\n";
|
||
|
|
echo str_repeat('=', 60) . "\n";
|
||
|
|
|
||
|
|
if (!empty($detectedLanguages)) {
|
||
|
|
foreach ($detectedLanguages as $langCode) {
|
||
|
|
echo " - " . getLanguageName($langCode) . " ($langCode)\n";
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
echo "No languages detected.\n";
|
||
|
|
}
|
||
|
|
|
||
|
|
$documents = [
|
||
|
|
'english_doc.pdf',
|
||
|
|
'spanish_doc.pdf',
|
||
|
|
'german_doc.pdf',
|
||
|
|
];
|
||
|
|
|
||
|
|
echo "\n" . str_repeat('=', 60) . "\n";
|
||
|
|
echo "Batch Language Detection:\n";
|
||
|
|
echo str_repeat('=', 60) . "\n";
|
||
|
|
|
||
|
|
$detectionConfig = new ExtractionConfig(
|
||
|
|
languageDetection: new LanguageDetectionConfig(
|
||
|
|
enabled: true,
|
||
|
|
minConfidence: 0.8,
|
||
|
|
detectMultiple: false
|
||
|
|
)
|
||
|
|
);
|
||
|
|
|
||
|
|
$kreuzberg = new Kreuzberg($detectionConfig);
|
||
|
|
|
||
|
|
foreach ($documents as $document) {
|
||
|
|
if (!file_exists($document)) {
|
||
|
|
echo basename($document) . ": File not found\n";
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
$result = $kreuzberg->extractFile($document);
|
||
|
|
$languages = $result->detectedLanguages ?? [];
|
||
|
|
|
||
|
|
echo basename($document) . ": ";
|
||
|
|
|
||
|
|
if (!empty($languages)) {
|
||
|
|
$primaryLang = $languages[0];
|
||
|
|
echo getLanguageName($primaryLang) . " ($primaryLang)\n";
|
||
|
|
} else {
|
||
|
|
echo "Language not detected\n";
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
function routeDocumentByLanguage(string $filePath, array $detectedLanguages): string
|
||
|
|
{
|
||
|
|
if (empty($detectedLanguages)) {
|
||
|
|
return 'default_queue';
|
||
|
|
}
|
||
|
|
|
||
|
|
$primaryLanguage = $detectedLanguages[0];
|
||
|
|
|
||
|
|
return match ($primaryLanguage) {
|
||
|
|
'en', 'eng' => 'english_processing_queue',
|
||
|
|
'es', 'spa' => 'spanish_processing_queue',
|
||
|
|
'fr', 'fra' => 'french_processing_queue',
|
||
|
|
'de', 'deu' => 'german_processing_queue',
|
||
|
|
'zh', 'zho', 'ja', 'jpn', 'ko', 'kor' => 'cjk_processing_queue',
|
||
|
|
'ar', 'ara', 'he', 'heb' => 'rtl_processing_queue',
|
||
|
|
default => 'multilingual_queue',
|
||
|
|
};
|
||
|
|
}
|
||
|
|
|
||
|
|
echo "\n" . str_repeat('=', 60) . "\n";
|
||
|
|
echo "Document Routing Based on Language:\n";
|
||
|
|
echo str_repeat('=', 60) . "\n";
|
||
|
|
|
||
|
|
if (!empty($detectedLanguages)) {
|
||
|
|
$queue = routeDocumentByLanguage('document.pdf', $detectedLanguages);
|
||
|
|
echo "Document routed to: $queue\n";
|
||
|
|
}
|
||
|
|
```
|