Files
fil/docs/snippets/php/plugins/plugin_extractor.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

2.0 KiB

<?php declare(strict_types=1);

use Kreuzberg\Kreuzberg;

class CustomXmlExtractor implements DocumentExtractor {
    public function name(): string {
        return "custom-xml-extractor";
    }

    public function version(): string {
        return "1.0.0";
    }

    public function initialize(): void {
        // Initialize XML parser resources
    }

    public function shutdown(): void {
        // Cleanup resources
    }

    public function extractBytes(string $content, string $mimeType, object $config): object {
        try {
            $xml = simplexml_load_string($content);
            $text = $this->extractTextFromXml($xml);

            return (object)[
                'content' => $text,
                'mime_type' => 'application/xml',
                'metadata' => [
                    'root_element' => $xml->getName(),
                    'extraction_method' => 'custom-xml-extractor'
                ],
                'tables' => [],
                'detected_languages' => null,
                'chunks' => null,
                'images' => null,
            ];
        } catch (Exception $e) {
            throw new Exception("XML parsing failed: " . $e->getMessage());
        }
    }

    public function supportedMimeTypes(): array {
        return [
            "application/xml",
            "text/xml",
            "application/xhtml+xml"
        ];
    }

    public function priority(): int {
        return 75;
    }

    private function extractTextFromXml($xml): string {
        $text = "";

        // Extract text from all elements
        foreach ($xml->children() as $child) {
            $childText = (string)$child;
            if (!empty(trim($childText))) {
                $text .= trim($childText) . "\n";
            }
        }

        return $text ?: (string)$xml;
    }
}

// Register the XML extractor
$extractor = new CustomXmlExtractor();
Kreuzberg::registerDocumentExtractor($extractor);

echo "XML extractor registered\n";