This commit is contained in:
77
docs/snippets/php/plugins/plugin_extractor.md
Normal file
77
docs/snippets/php/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,77 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
class CustomXmlExtractor implements DocumentExtractor {
|
||||
public function name(): string {
|
||||
return "custom-xml-extractor";
|
||||
}
|
||||
|
||||
public function version(): string {
|
||||
return "1.0.0";
|
||||
}
|
||||
|
||||
public function initialize(): void {
|
||||
// Initialize XML parser resources
|
||||
}
|
||||
|
||||
public function shutdown(): void {
|
||||
// Cleanup resources
|
||||
}
|
||||
|
||||
public function extractBytes(string $content, string $mimeType, object $config): object {
|
||||
try {
|
||||
$xml = simplexml_load_string($content);
|
||||
$text = $this->extractTextFromXml($xml);
|
||||
|
||||
return (object)[
|
||||
'content' => $text,
|
||||
'mime_type' => 'application/xml',
|
||||
'metadata' => [
|
||||
'root_element' => $xml->getName(),
|
||||
'extraction_method' => 'custom-xml-extractor'
|
||||
],
|
||||
'tables' => [],
|
||||
'detected_languages' => null,
|
||||
'chunks' => null,
|
||||
'images' => null,
|
||||
];
|
||||
} catch (Exception $e) {
|
||||
throw new Exception("XML parsing failed: " . $e->getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
public function supportedMimeTypes(): array {
|
||||
return [
|
||||
"application/xml",
|
||||
"text/xml",
|
||||
"application/xhtml+xml"
|
||||
];
|
||||
}
|
||||
|
||||
public function priority(): int {
|
||||
return 75;
|
||||
}
|
||||
|
||||
private function extractTextFromXml($xml): string {
|
||||
$text = "";
|
||||
|
||||
// Extract text from all elements
|
||||
foreach ($xml->children() as $child) {
|
||||
$childText = (string)$child;
|
||||
if (!empty(trim($childText))) {
|
||||
$text .= trim($childText) . "\n";
|
||||
}
|
||||
}
|
||||
|
||||
return $text ?: (string)$xml;
|
||||
}
|
||||
}
|
||||
|
||||
// Register the XML extractor
|
||||
$extractor = new CustomXmlExtractor();
|
||||
Kreuzberg::registerDocumentExtractor($extractor);
|
||||
|
||||
echo "XML extractor registered\n";
|
||||
```
|
||||
Reference in New Issue
Block a user