This commit is contained in:
313
docs/snippets/php/advanced/error_handling.php
Normal file
313
docs/snippets/php/advanced/error_handling.php
Normal file
@@ -0,0 +1,313 @@
|
||||
```php title="error_handling.php"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
|
||||
* Error Handling
|
||||
*
|
||||
* Robust error handling for document extraction operations.
|
||||
* Handle failures gracefully and implement retry strategies.
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/vendor/autoload.php';
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Exceptions\KreuzbergException;
|
||||
use function Kreuzberg\extract_file;
|
||||
|
||||
try {
|
||||
$result = extract_file('document.pdf');
|
||||
echo "Extraction successful!\n";
|
||||
echo "Content length: " . strlen($result->content) . "\n";
|
||||
} catch (KreuzbergException $e) {
|
||||
echo "Error: " . $e->getMessage() . "\n";
|
||||
echo "Code: " . $e->getCode() . "\n";
|
||||
error_log("Kreuzberg extraction failed: " . $e->getMessage());
|
||||
}
|
||||
|
||||
function safeExtract(string $filePath): ?string
|
||||
{
|
||||
if (!file_exists($filePath)) {
|
||||
error_log("File not found: $filePath");
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!is_readable($filePath)) {
|
||||
error_log("File not readable: $filePath");
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
$result = extract_file($filePath);
|
||||
return $result->content;
|
||||
} catch (KreuzbergException $e) {
|
||||
error_log("Extraction error for $filePath: " . $e->getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
$content = safeExtract('document.pdf');
|
||||
if ($content !== null) {
|
||||
echo "Successfully extracted document\n";
|
||||
} else {
|
||||
echo "Failed to extract document\n";
|
||||
}
|
||||
|
||||
function extractWithRetry(
|
||||
string $filePath,
|
||||
int $maxRetries = 3,
|
||||
int $initialDelay = 1000
|
||||
): ?string {
|
||||
$attempt = 0;
|
||||
$delay = $initialDelay;
|
||||
|
||||
while ($attempt < $maxRetries) {
|
||||
try {
|
||||
$result = extract_file($filePath);
|
||||
return $result->content;
|
||||
} catch (KreuzbergException $e) {
|
||||
$attempt++;
|
||||
if ($attempt >= $maxRetries) {
|
||||
error_log("Max retries exceeded for $filePath: " . $e->getMessage());
|
||||
return null;
|
||||
}
|
||||
|
||||
echo "Attempt $attempt failed, retrying in {$delay}ms...\n";
|
||||
usleep($delay * 1000);
|
||||
$delay *= 2;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
$content = extractWithRetry('potentially_corrupt.pdf');
|
||||
if ($content !== null) {
|
||||
echo "Document extracted after retry\n";
|
||||
}
|
||||
|
||||
function validateExtractionResult(string $filePath): bool
|
||||
{
|
||||
try {
|
||||
$result = extract_file($filePath);
|
||||
|
||||
if (empty($result->content)) {
|
||||
error_log("Empty content extracted from $filePath");
|
||||
return false;
|
||||
}
|
||||
|
||||
$minExpectedChars = 100;
|
||||
if (strlen($result->content) < $minExpectedChars) {
|
||||
error_log("Content too short from $filePath: " . strlen($result->content) . " chars");
|
||||
return false;
|
||||
}
|
||||
|
||||
$nonPrintableRatio = (strlen($result->content) - strlen(preg_replace('/[^\x20-\x7E\x0A\x0D]/', '', $result->content))) / strlen($result->content);
|
||||
if ($nonPrintableRatio > 0.5) {
|
||||
error_log("High non-printable character ratio in $filePath");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
} catch (KreuzbergException $e) {
|
||||
error_log("Validation failed for $filePath: " . $e->getMessage());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (validateExtractionResult('document.pdf')) {
|
||||
echo "Extraction result validated successfully\n";
|
||||
} else {
|
||||
echo "Extraction result validation failed\n";
|
||||
}
|
||||
|
||||
$files = glob('documents/*.pdf');
|
||||
$successful = [];
|
||||
$failed = [];
|
||||
|
||||
foreach ($files as $file) {
|
||||
try {
|
||||
$result = extract_file($file);
|
||||
$successful[] = [
|
||||
'file' => $file,
|
||||
'content_length' => strlen($result->content),
|
||||
'tables' => count($result->tables),
|
||||
];
|
||||
} catch (KreuzbergException $e) {
|
||||
$failed[] = [
|
||||
'file' => $file,
|
||||
'error' => $e->getMessage(),
|
||||
'code' => $e->getCode(),
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
echo "\nBatch Processing Results:\n";
|
||||
echo str_repeat('=', 60) . "\n";
|
||||
echo "Successful: " . count($successful) . "\n";
|
||||
echo "Failed: " . count($failed) . "\n\n";
|
||||
|
||||
if (!empty($failed)) {
|
||||
echo "Failed files:\n";
|
||||
foreach ($failed as $failure) {
|
||||
echo " - {$failure['file']}: {$failure['error']}\n";
|
||||
}
|
||||
}
|
||||
|
||||
function extractWithFallback(string $filePath): ?string
|
||||
{
|
||||
try {
|
||||
$result = extract_file($filePath);
|
||||
if (!empty($result->content)) {
|
||||
return $result->content;
|
||||
}
|
||||
} catch (KreuzbergException $e) {
|
||||
echo "Normal extraction failed, trying fallback strategies...\n";
|
||||
}
|
||||
|
||||
try {
|
||||
$config = new ExtractionConfig(
|
||||
ocr: new \Kreuzberg\Config\OcrConfig(
|
||||
backend: 'tesseract',
|
||||
language: 'eng'
|
||||
)
|
||||
);
|
||||
$kreuzberg = new Kreuzberg($config);
|
||||
$result = $kreuzberg->extractFile($filePath);
|
||||
if (!empty($result->content)) {
|
||||
echo "Fallback: OCR extraction succeeded\n";
|
||||
return $result->content;
|
||||
}
|
||||
} catch (KreuzbergException $e) {
|
||||
echo "OCR fallback failed: " . $e->getMessage() . "\n";
|
||||
}
|
||||
|
||||
try {
|
||||
$content = file_get_contents($filePath);
|
||||
if (!empty($content)) {
|
||||
echo "Fallback: Reading as plain text\n";
|
||||
return $content;
|
||||
}
|
||||
} catch (\Exception $e) {
|
||||
echo "Plain text fallback failed: " . $e->getMessage() . "\n";
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
$content = extractWithFallback('problematic_file.pdf');
|
||||
if ($content !== null) {
|
||||
echo "Successfully extracted with fallback\n";
|
||||
}
|
||||
|
||||
function extractWithTimeout(string $filePath, int $timeoutSeconds = 30): ?string
|
||||
{
|
||||
$startTime = time();
|
||||
|
||||
try {
|
||||
set_time_limit($timeoutSeconds);
|
||||
|
||||
$result = extract_file($filePath);
|
||||
$elapsed = time() - $startTime;
|
||||
|
||||
if ($elapsed > $timeoutSeconds) {
|
||||
error_log("Extraction exceeded timeout for $filePath");
|
||||
return null;
|
||||
}
|
||||
|
||||
return $result->content;
|
||||
} catch (KreuzbergException $e) {
|
||||
error_log("Extraction error: " . $e->getMessage());
|
||||
return null;
|
||||
} finally {
|
||||
set_time_limit(0);
|
||||
}
|
||||
}
|
||||
|
||||
class DocumentExtractionException extends \Exception
|
||||
{
|
||||
public function __construct(
|
||||
string $message,
|
||||
public readonly string $filePath,
|
||||
public readonly ?string $mimeType = null,
|
||||
?\Throwable $previous = null
|
||||
) {
|
||||
parent::__construct($message, 0, $previous);
|
||||
}
|
||||
}
|
||||
|
||||
function extractOrThrow(string $filePath): string
|
||||
{
|
||||
try {
|
||||
$result = extract_file($filePath);
|
||||
|
||||
if (empty($result->content)) {
|
||||
throw new DocumentExtractionException(
|
||||
"No content extracted",
|
||||
$filePath,
|
||||
$result->mimeType
|
||||
);
|
||||
}
|
||||
|
||||
return $result->content;
|
||||
} catch (KreuzbergException $e) {
|
||||
throw new DocumentExtractionException(
|
||||
"Extraction failed: " . $e->getMessage(),
|
||||
$filePath,
|
||||
previous: $e
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
$content = extractOrThrow('document.pdf');
|
||||
echo "Content: " . substr($content, 0, 100) . "...\n";
|
||||
} catch (DocumentExtractionException $e) {
|
||||
echo "Failed to extract {$e->filePath}\n";
|
||||
echo "Reason: {$e->getMessage()}\n";
|
||||
if ($e->mimeType) {
|
||||
echo "MIME type: {$e->mimeType}\n";
|
||||
}
|
||||
}
|
||||
|
||||
class LoggingKreuzberg
|
||||
{
|
||||
public function __construct(
|
||||
private Kreuzberg $kreuzberg,
|
||||
private \Psr\Log\LoggerInterface $logger
|
||||
) {}
|
||||
|
||||
public function extractFile(string $filePath, ?string $mimeType = null): ?\Kreuzberg\Types\ExtractionResult
|
||||
{
|
||||
$this->logger->info("Starting extraction", ['file' => $filePath]);
|
||||
$startTime = microtime(true);
|
||||
|
||||
try {
|
||||
$result = $this->kreuzberg->extractFile($filePath, $mimeType);
|
||||
$elapsed = microtime(true) - $startTime;
|
||||
|
||||
$this->logger->info("Extraction successful", [
|
||||
'file' => $filePath,
|
||||
'duration' => $elapsed,
|
||||
'content_length' => strlen($result->content),
|
||||
'tables' => count($result->tables),
|
||||
]);
|
||||
|
||||
return $result;
|
||||
} catch (KreuzbergException $e) {
|
||||
$elapsed = microtime(true) - $startTime;
|
||||
|
||||
$this->logger->error("Extraction failed", [
|
||||
'file' => $filePath,
|
||||
'duration' => $elapsed,
|
||||
'error' => $e->getMessage(),
|
||||
'code' => $e->getCode(),
|
||||
]);
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user