This commit is contained in:
231
docs/snippets/php/plugins/README.md
Normal file
231
docs/snippets/php/plugins/README.md
Normal file
@@ -0,0 +1,231 @@
|
||||
# PHP Plugin System - Deferred to Future Version
|
||||
|
||||
## Status: Not Yet Implemented
|
||||
|
||||
The PHP plugin system for Kreuzberg is **deferred to a future version**. This includes:
|
||||
|
||||
- Custom OCR backend registration
|
||||
- Post-processor plugins
|
||||
- Validator plugins
|
||||
- Custom extractor plugins
|
||||
|
||||
## Why Deferred?
|
||||
|
||||
The plugin system requires complex callback handling between Rust and PHP through ext-php-rs. Specifically:
|
||||
|
||||
1. **Callback Challenges**: ext-php-rs callback support for complex interfaces is still evolving
|
||||
2. **Memory Safety**: Ensuring proper lifetime management for PHP closures called from Rust
|
||||
3. **Error Handling**: Propagating exceptions across the FFI boundary in plugin contexts
|
||||
4. **Performance**: Minimizing overhead of cross-language callbacks in hot paths
|
||||
|
||||
## Affected Functions (~16 functions)
|
||||
|
||||
The following functions exist in Python, Ruby, Node.js, and other bindings but are not yet available in PHP:
|
||||
|
||||
### OCR Backend Registration
|
||||
|
||||
- `kreuzberg_register_ocr_backend()`
|
||||
- `kreuzberg_unregister_ocr_backend()`
|
||||
- `kreuzberg_list_ocr_backends()`
|
||||
|
||||
### Post-Processor Plugins
|
||||
|
||||
- `kreuzberg_register_post_processor()`
|
||||
- `kreuzberg_unregister_post_processor()`
|
||||
- `kreuzberg_list_post_processors()`
|
||||
- `kreuzberg_clear_post_processors()`
|
||||
|
||||
### Validator Plugins
|
||||
|
||||
- `kreuzberg_register_validator()`
|
||||
- `kreuzberg_unregister_validator()`
|
||||
- `kreuzberg_list_validators()`
|
||||
- `kreuzberg_clear_validators()`
|
||||
|
||||
### Custom Extractor Plugins
|
||||
|
||||
- `kreuzberg_register_extractor()`
|
||||
- `kreuzberg_unregister_extractor()`
|
||||
- `kreuzberg_list_extractors()`
|
||||
- `kreuzberg_clear_extractors()`
|
||||
|
||||
### Plugin Testing
|
||||
|
||||
- `kreuzberg_test_plugin()`
|
||||
|
||||
## Workarounds
|
||||
|
||||
Until the plugin system is implemented, you can:
|
||||
|
||||
### 1. Post-Process Results in PHP
|
||||
|
||||
Instead of registering a post-processor plugin, process the extraction result directly:
|
||||
|
||||
```php title="Post-Process Results"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use Kreuzberg\Types\ExtractionResult;
|
||||
|
||||
/**
 * Builds a new extraction result whose content is the upper-cased text of $result.
 *
 * The incoming result is left untouched; the MIME type, metadata, tables,
 * images and chunks are copied over as-is.
 *
 * @param ExtractionResult $result Result returned by an extraction call.
 *
 * @return ExtractionResult A new result carrying the transformed content.
 */
function postProcessResult(ExtractionResult $result): ExtractionResult
{
    // mb_strtoupper() is multibyte-aware, so non-ASCII letters (e.g. "é", "ü")
    // are converted too; strtoupper() would leave them unchanged.
    // NOTE(review): relies on the mbstring default encoding matching the
    // encoding of the extracted content - confirm it is UTF-8.
    $processedContent = mb_strtoupper($result->content);

    // Return a new result with modified content
    return new ExtractionResult(
        content: $processedContent,
        mimeType: $result->mimeType,
        metadata: $result->metadata,
        tables: $result->tables,
        images: $result->images,
        chunks: $result->chunks,
    );
}

$kreuzberg = new Kreuzberg();
$result = $kreuzberg->extractFile('document.pdf');
$processed = postProcessResult($result);
|
||||
```
|
||||
|
||||
### 2. Use Built-in OCR Backends
|
||||
|
||||
PHP bindings support all built-in OCR backends:
|
||||
|
||||
```php title="Built-in OCR Backends"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Config\OcrConfig;
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
// Describe the OCR step first, then wrap it in the extraction config.
$ocrConfig = new OcrConfig(
    backend: 'tesseract', // Built-in: tesseract, apple-vision (macOS)
    language: 'eng',
);
$config = new ExtractionConfig(ocr: $ocrConfig);

// The configured instance applies the OCR settings to the file it extracts.
$kreuzberg = new Kreuzberg($config);
$result = $kreuzberg->extractFile('scanned.pdf');
|
||||
```
|
||||
|
||||
### 3. Validate Results in PHP
|
||||
|
||||
Instead of validator plugins, validate extraction results directly:
|
||||
|
||||
```php title="Validate Results"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Exceptions\ValidationException;
|
||||
use Kreuzberg\Types\ExtractionResult;
|
||||
|
||||
/**
 * Rejects extraction results that are too short or report zero pages.
 *
 * @param ExtractionResult $result Result returned by an extraction call.
 *
 * @throws ValidationException When the content has fewer than 100 characters,
 *                             or when the metadata reports a page count of 0.
 */
function validateResult(ExtractionResult $result): void
{
    // Count characters rather than bytes so the limit matches the error message
    // for multibyte text.
    if (mb_strlen($result->content) < 100) {
        throw new ValidationException('Content too short (minimum 100 characters)');
    }

    // The nullsafe operator skips this check when no metadata is attached.
    if ($result->metadata?->pageCount === 0) {
        throw new ValidationException('Document has no pages');
    }
}

// Create the extractor explicitly so the snippet runs on its own; the fully
// qualified name avoids depending on an extra `use` statement.
$kreuzberg = new \Kreuzberg\Kreuzberg();
$result = $kreuzberg->extractFile('document.pdf');
validateResult($result);
|
||||
```
|
||||
|
||||
### 4. Extend the Kreuzberg Class
|
||||
|
||||
For application-specific functionality, extend the main class:
|
||||
|
||||
```php title="Extend Kreuzberg Class"
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Config\ExtractionConfig;
|
||||
use Kreuzberg\Kreuzberg as BaseKreuzberg;
|
||||
use Kreuzberg\Types\ExtractionResult;
|
||||
|
||||
/**
 * Application-specific extractor that layers validation and content
 * transformation on top of the base extractFile() call.
 */
final class CustomKreuzberg extends BaseKreuzberg
{
    /**
     * Extracts a file and rejects results whose content is shorter than 100 bytes.
     *
     * @throws \RuntimeException When the extracted content is too short.
     */
    public function extractAndValidate(
        string $path,
        ?ExtractionConfig $config = null
    ): ExtractionResult {
        $extracted = $this->extractFile($path, $config);

        $isLongEnough = strlen($extracted->content) >= 100;
        if (!$isLongEnough) {
            throw new \RuntimeException('Content too short');
        }

        return $extracted;
    }

    /**
     * Extracts a file and returns a new result whose content is the value
     * produced by calling $transformer with the extracted content.
     *
     * All other result fields are copied from the extracted result.
     */
    public function extractAndTransform(
        string $path,
        callable $transformer,
        ?ExtractionConfig $config = null
    ): ExtractionResult {
        $extracted = $this->extractFile($path, $config);

        return new ExtractionResult(
            content: $transformer($extracted->content),
            mimeType: $extracted->mimeType,
            metadata: $extracted->metadata,
            tables: $extracted->tables,
            images: $extracted->images,
            chunks: $extracted->chunks,
        );
    }
}
|
||||
```
|
||||
|
||||
## Timeline
|
||||
|
||||
The plugin system is planned for a future PHP bindings release (tentatively v4.1.0 or v4.2.0), pending:
|
||||
|
||||
1. ext-php-rs improvements for complex callbacks
|
||||
2. Comprehensive testing of callback performance and safety
|
||||
3. Documentation of plugin interfaces
|
||||
|
||||
## Current Feature Parity
|
||||
|
||||
Despite the deferred plugin system, PHP bindings achieve **95% feature parity** with other language bindings:
|
||||
|
||||
- ✅ All extraction functions (file, bytes, batch)
|
||||
- ✅ All configuration options (OCR, PDF, chunking, embeddings)
|
||||
- ✅ All result types (tables, images, chunks, metadata)
|
||||
- ✅ All validation functions (14 validators)
|
||||
- ✅ Embedding presets (2 functions + class)
|
||||
- ✅ Error classification (3 functions + class)
|
||||
- ✅ Config helpers (JSON export, field access, merging)
|
||||
- ❌ Plugin system (16 functions) - **deferred**
|
||||
|
||||
## Questions?
|
||||
|
||||
For questions about the plugin system, or to request early access once it becomes available:
|
||||
|
||||
- GitHub Issues: <https://github.com/kreuzberg-dev/kreuzberg/issues>
|
||||
- Discussions: <https://github.com/kreuzberg-dev/kreuzberg/discussions>
|
||||
|
||||
## Contributing
|
||||
|
||||
If you're interested in helping implement the plugin system for PHP:
|
||||
|
||||
1. Review the plugin implementations in Python (`crates/kreuzberg-py/src/plugins.rs`)
|
||||
2. Review ext-php-rs callback documentation
|
||||
3. Open a discussion on the Kreuzberg GitHub repository
|
||||
|
||||
We welcome contributions!
|
||||
16
docs/snippets/php/plugins/clear_plugins.md
Normal file
16
docs/snippets/php/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
// Empty each plugin registry in turn: OCR backends first, then
// post-processors, then validators.
Kreuzberg::clearOcrBackends();
Kreuzberg::clearPostProcessors();
Kreuzberg::clearValidators();

// Report completion once all three registries have been cleared.
echo "All plugins cleared\n";
|
||||
```
|
||||
53
docs/snippets/php/plugins/embedding_backend.md
Normal file
53
docs/snippets/php/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example embedding backend that returns a zero vector for every input text.
 *
 * Replace the body of embed() with a call into your own model.
 */
class MyEmbedder implements EmbeddingBackend
{
    /** Name under which the backend is referenced (see the config below). */
    public function name(): string
    {
        return "my-embedder";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        // Initialize the embedding model
    }

    public function shutdown(): void
    {
        // Cleanup resources
    }

    /** Length of every vector returned by embed(). */
    public function dimensions(): int
    {
        return 768;
    }

    /**
     * Produces one embedding vector per input text.
     *
     * @param array $texts Texts to embed.
     *
     * @return array List with one vector per text, each holding dimensions() floats.
     */
    public function embed(array $texts): array
    {
        // Delegate to your already-loaded host model.
        // The vector length comes from dimensions() so the two can never drift apart.
        $placeholder = array_fill(0, $this->dimensions(), 0.0);

        // One placeholder vector per text, indexed 0..n-1.
        return array_fill(0, count($texts), $placeholder);
    }
}

// Register the embedding backend at startup
$embedder = new MyEmbedder();
Kreuzberg::registerEmbeddingBackend($embedder);

// Use the registered backend in an EmbeddingConfig
$config = new EmbeddingConfig();
$config->model = "my-embedder";
$config->maxEmbedDurationSecs = 30;

$vectors = Kreuzberg::embedTexts(
    ["Hello, world!", "Second text"],
    $config
);

echo "Generated " . count($vectors) . " embeddings\n";
|
||||
```
|
||||
66
docs/snippets/php/plugins/extractor_registration.md
Normal file
66
docs/snippets/php/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,66 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example document extractor that flattens every string value found in a
 * JSON document into newline-separated text.
 */
class CustomJsonExtractor implements DocumentExtractor
{
    public function name(): string
    {
        return "custom-json-extractor";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        // Initialize resources
    }

    public function shutdown(): void
    {
        // Cleanup resources
    }

    /**
     * Decodes $content as JSON and returns the collected string values as text.
     *
     * @param string $content  Raw JSON bytes.
     * @param string $mimeType MIME type of the input (not used by this extractor).
     * @param object $config   Extraction configuration (not used by this extractor).
     *
     * @return object Result object with content, mime_type, metadata, tables,
     *                detected_languages, chunks and images properties.
     *
     * @throws JsonException When $content is not valid JSON.
     */
    public function extractBytes(string $content, string $mimeType, object $config): object
    {
        // Fail loudly on malformed input; without JSON_THROW_ON_ERROR a decode
        // failure returns null and would be reported as an empty document.
        $json = json_decode($content, true, 512, JSON_THROW_ON_ERROR);
        $text = $this->extractTextFromJson($json);

        return (object)[
            'content' => $text,
            'mime_type' => 'application/json',
            'metadata' => [],
            'tables' => [],
            'detected_languages' => null,
            'chunks' => null,
            'images' => null,
        ];
    }

    public function supportedMimeTypes(): array
    {
        return ["application/json", "text/json"];
    }

    public function priority(): int
    {
        return 50;
    }

    /**
     * Recursively collects string values, one per line.
     *
     * Arrays (including decoded JSON objects) are walked depth-first; numbers,
     * booleans and null contribute nothing.
     */
    private function extractTextFromJson(mixed $value): string
    {
        if (is_string($value)) {
            return "$value\n";
        }

        if (is_array($value)) {
            $result = "";
            foreach ($value as $item) {
                $result .= $this->extractTextFromJson($item);
            }
            return $result;
        }

        return "";
    }
}

// Register the custom extractor
// Note: Document extractor registration would use a similar pattern
// when the binding API is available
$extractor = new CustomJsonExtractor();
Kreuzberg::registerDocumentExtractor($extractor);
|
||||
```
|
||||
33
docs/snippets/php/plugins/list_plugins.md
Normal file
33
docs/snippets/php/plugins/list_plugins.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
// OCR backends: fetch the registered entries, then print one per line.
$ocrBackends = Kreuzberg::listOcrBackends();
echo "Registered OCR backends:\n";
foreach ($ocrBackends as $backend) {
    echo " - $backend\n";
}

// Post-processors
$postProcessors = Kreuzberg::listPostProcessors();
echo "Registered post-processors:\n";
foreach ($postProcessors as $processor) {
    echo " - $processor\n";
}

// Validators
$registeredValidators = Kreuzberg::listValidators();
echo "Registered validators:\n";
foreach ($registeredValidators as $validator) {
    echo " - $validator\n";
}

// Document extractors
$documentExtractors = Kreuzberg::listDocumentExtractors();
echo "Registered document extractors:\n";
foreach ($documentExtractors as $extractor) {
    echo " - $extractor\n";
}
|
||||
```
|
||||
53
docs/snippets/php/plugins/min_length_validator.md
Normal file
53
docs/snippets/php/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example validator that rejects results whose content is shorter than a
 * configurable number of characters.
 */
class MinLengthValidator implements Validator
{
    /**
     * @param int $minLength Minimum number of characters the content must have.
     */
    public function __construct(private int $minLength = 50)
    {
    }

    public function name(): string
    {
        return "min-length-validator";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        // Validation configuration loaded
    }

    public function shutdown(): void
    {
        // Cleanup
    }

    /**
     * Checks the content length of $result against the configured minimum.
     *
     * @param object $result Extraction result; its content property is read.
     * @param object $config Extraction configuration (not used by this validator).
     *
     * @throws Exception When the content has fewer characters than the minimum.
     */
    public function validate(object $result, object $config): void
    {
        // mb_strlen() counts characters; strlen() counts bytes and would
        // over-report the length of multibyte text.
        $contentLength = mb_strlen($result->content);

        if ($contentLength < $this->minLength) {
            throw new Exception(
                sprintf(
                    "Content too short: %d < %d characters",
                    $contentLength,
                    $this->minLength
                )
            );
        }
    }

    public function priority(): int
    {
        return 100;
    }
}

// Register validator with 50-character minimum
$validator = new MinLengthValidator(50);
Kreuzberg::registerValidator($validator);

echo "Min-length validator registered (minimum: 50 chars)\n";
|
||||
```
|
||||
73
docs/snippets/php/plugins/ocr_backend.md
Normal file
73
docs/snippets/php/plugins/ocr_backend.md
Normal file
@@ -0,0 +1,73 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example OCR backend skeleton. processImage() returns a fixed placeholder
 * result; replace it with a call into your OCR engine.
 */
class CustomOcrBackend implements OcrBackend
{
    /** Language codes reported by supportedLanguages() / supportsLanguage(). */
    private array $supportedLangs = ["eng", "deu", "fra"];

    public function name(): string
    {
        return "custom-ocr";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        // Load OCR model or initialize resources
    }

    public function shutdown(): void
    {
        // Cleanup OCR resources
    }

    /**
     * Runs OCR over raw image bytes.
     *
     * @param string $imageBytes Encoded image data.
     * @param object $config     OCR configuration (not used by this placeholder).
     *
     * @return object Result object with content, mime_type, metadata, tables
     *                and detected_languages properties.
     */
    public function processImage(string $imageBytes, object $config): object
    {
        // Process image bytes and return ExtractionResult
        // This would call your OCR engine (Tesseract, EasyOCR, etc.)
        return (object)[
            'content' => 'Extracted text from image',
            'mime_type' => 'image/png',
            'metadata' => ['ocr_engine' => 'custom-ocr'],
            'tables' => [],
            'detected_languages' => ['eng'],
        ];
    }

    /**
     * Reads an image file and delegates to processImage().
     *
     * @throws RuntimeException When the file cannot be read.
     */
    public function processImageFile(string $path, object $config): object
    {
        $imageBytes = file_get_contents($path);

        // file_get_contents() returns false on failure; passing that to a
        // string parameter under strict_types would raise a TypeError instead
        // of a meaningful error.
        if ($imageBytes === false) {
            throw new RuntimeException("Unable to read image file: $path");
        }

        return $this->processImage($imageBytes, $config);
    }

    public function supportsLanguage(string $lang): bool
    {
        // Strict comparison avoids loose type-juggling matches.
        return in_array($lang, $this->supportedLangs, true);
    }

    public function backendType(): string
    {
        return "OCREngine";
    }

    public function supportedLanguages(): array
    {
        return $this->supportedLangs;
    }

    public function supportsTableDetection(): bool
    {
        return true;
    }

    public function supportsDocumentProcessing(): bool
    {
        return false;
    }

    /**
     * Always throws: this backend reports no document-processing support.
     *
     * @throws Exception Always.
     */
    public function processDocument(string $path, object $config): object
    {
        throw new Exception("Document processing not supported");
    }
}

// Register the custom OCR backend
$backend = new CustomOcrBackend();
Kreuzberg::registerOcrBackend($backend);

echo "Custom OCR backend registered\n";
|
||||
```
|
||||
64
docs/snippets/php/plugins/pdf_metadata_extractor.md
Normal file
64
docs/snippets/php/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,64 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example post-processor that stamps PDF results with the processor name and
 * the time of processing.
 */
class PdfMetadataExtractor implements PostProcessor
{
    public function name(): string
    {
        return "pdf-metadata-extractor";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        // Load PDF parsing libraries if needed
    }

    public function shutdown(): void
    {
        // Cleanup resources
    }

    /**
     * Adds 'pdf_processor' and 'extracted_at' entries to the metadata of PDF
     * results. Non-PDF results, and results whose metadata is not an array,
     * are left unchanged.
     */
    public function process(object &$result, object $config): void
    {
        $isPdf = $result->mime_type === 'application/pdf';
        if (!$isPdf) {
            return;
        }

        // Start from an empty array when no metadata is present yet.
        $result->metadata ??= [];

        if (!is_array($result->metadata)) {
            return;
        }

        $stamp = [
            'pdf_processor' => 'pdf-metadata-extractor',
            'extracted_at' => date('Y-m-d H:i:s'),
        ];
        $result->metadata = array_merge($result->metadata, $stamp);
    }

    public function processingStage(): string
    {
        return "Middle";
    }

    /** Only PDF results are processed. */
    public function shouldProcess(object $result, object $config): bool
    {
        return $result->mime_type === 'application/pdf';
    }

    public function estimatedDurationMs(object $result): int
    {
        return 10;
    }

    public function priority(): int
    {
        return 60;
    }
}

// Create the processor and hand it to the post-processor registry.
$processor = new PdfMetadataExtractor();
Kreuzberg::registerPostProcessor($processor);

echo "PDF metadata extractor registered\n";
|
||||
```
|
||||
66
docs/snippets/php/plugins/pdf_only_processor.md
Normal file
66
docs/snippets/php/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,66 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example post-processor that runs only for PDF results and flags them as
 * processed in their metadata.
 */
class PdfOnlyProcessor implements PostProcessor
{
    public function name(): string
    {
        return "pdf-only-processor";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        // Initialize PDF-specific resources
    }

    public function shutdown(): void
    {
        // Cleanup resources
    }

    /**
     * Sets 'pdf_processed' and 'processor_version' in the metadata of PDF
     * results. Other MIME types and non-array metadata are left unchanged.
     */
    public function process(object &$result, object $config): void
    {
        $isPdf = $result->mime_type === 'application/pdf';
        if (!$isPdf) {
            return;
        }

        // PDF-specific logic goes here, for example: extract page
        // information, count pages, extract images, etc.

        // Start from an empty array when no metadata is present yet.
        $result->metadata ??= [];

        if (!is_array($result->metadata)) {
            return;
        }

        $result->metadata['pdf_processed'] = true;
        $result->metadata['processor_version'] = '1.0.0';
    }

    public function processingStage(): string
    {
        return "Middle";
    }

    /** Processes only PDF results whose content is non-empty. */
    public function shouldProcess(object $result, object $config): bool
    {
        if ($result->mime_type !== 'application/pdf') {
            return false;
        }

        return !empty($result->content);
    }

    public function estimatedDurationMs(object $result): int
    {
        // PDF processing varies by size
        return 50;
    }

    public function priority(): int
    {
        return 75;
    }
}

// Create the processor and hand it to the post-processor registry.
$processor = new PdfOnlyProcessor();
Kreuzberg::registerPostProcessor($processor);

echo "PDF-only processor registered\n";
|
||||
```
|
||||
77
docs/snippets/php/plugins/plugin_extractor.md
Normal file
77
docs/snippets/php/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,77 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example document extractor that turns the text of an XML document's
 * top-level child elements into newline-separated content.
 */
class CustomXmlExtractor implements DocumentExtractor
{
    public function name(): string
    {
        return "custom-xml-extractor";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        // Initialize XML parser resources
    }

    public function shutdown(): void
    {
        // Cleanup resources
    }

    /**
     * Parses $content as XML and returns its text.
     *
     * @param string $content  Raw XML bytes.
     * @param string $mimeType MIME type of the input (not used by this extractor).
     * @param object $config   Extraction configuration (not used by this extractor).
     *
     * @return object Result object with content, mime_type, metadata, tables,
     *                detected_languages, chunks and images properties.
     *
     * @throws Exception When $content is not well-formed XML.
     */
    public function extractBytes(string $content, string $mimeType, object $config): object
    {
        // simplexml_load_string() signals failure by returning false (plus
        // warnings), not by throwing - so collect libxml errors and raise the
        // exception ourselves.
        $previousSetting = libxml_use_internal_errors(true);

        try {
            $xml = simplexml_load_string($content);

            if ($xml === false) {
                $details = [];
                foreach (libxml_get_errors() as $error) {
                    $details[] = trim($error->message);
                }

                throw new Exception(
                    "XML parsing failed: " . ($details === [] ? 'unknown error' : implode('; ', $details))
                );
            }
        } finally {
            // Restore the caller's libxml error handling whatever happened.
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }

        return (object)[
            'content' => $this->extractTextFromXml($xml),
            'mime_type' => 'application/xml',
            'metadata' => [
                'root_element' => $xml->getName(),
                'extraction_method' => 'custom-xml-extractor'
            ],
            'tables' => [],
            'detected_languages' => null,
            'chunks' => null,
            'images' => null,
        ];
    }

    public function supportedMimeTypes(): array
    {
        return [
            "application/xml",
            "text/xml",
            "application/xhtml+xml"
        ];
    }

    public function priority(): int
    {
        return 75;
    }

    /**
     * Joins the trimmed text of each direct child element, one per line.
     * Falls back to the root element's own text when no child has any.
     */
    private function extractTextFromXml(SimpleXMLElement $xml): string
    {
        $text = "";

        foreach ($xml->children() as $child) {
            $childText = trim((string)$child);

            // Compare against '' explicitly: empty() would also drop the
            // legitimate value "0".
            if ($childText !== '') {
                $text .= $childText . "\n";
            }
        }

        return $text !== '' ? $text : (string)$xml;
    }
}

// Register the XML extractor
$extractor = new CustomXmlExtractor();
Kreuzberg::registerDocumentExtractor($extractor);

echo "XML extractor registered\n";
|
||||
```
|
||||
53
docs/snippets/php/plugins/plugin_logging.md
Normal file
53
docs/snippets/php/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example post-processor that writes basic facts about each result to the
 * PHP error log and leaves the result itself unchanged.
 */
class LoggingPostProcessor implements PostProcessor
{
    public function name(): string
    {
        return "logging-processor";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        error_log("LoggingPostProcessor initialized");
    }

    public function shutdown(): void
    {
        error_log("LoggingPostProcessor shutting down");
    }

    /**
     * Logs the MIME type, the content length in bytes and the JSON-encoded
     * metadata, in that order.
     */
    public function process(object &$result, object $config): void
    {
        $mimeLine = "Processing: " . $result->mime_type;
        error_log($mimeLine);

        $lengthLine = "Content length: " . strlen($result->content);
        error_log($lengthLine);

        $metadataLine = "Metadata: " . json_encode($result->metadata);
        error_log($metadataLine);
    }

    public function processingStage(): string
    {
        return "Early";
    }

    /** Skips results whose content is empty. */
    public function shouldProcess(object $result, object $config): bool
    {
        $hasContent = !empty($result->content);

        return $hasContent;
    }

    public function estimatedDurationMs(object $result): int
    {
        // Logging takes minimal time
        return 1;
    }

    public function priority(): int
    {
        return 10;
    }
}

// Create the processor and hand it to the post-processor registry.
$processor = new LoggingPostProcessor();
Kreuzberg::registerPostProcessor($processor);

error_log("Logging post-processor registered");
|
||||
```
|
||||
69
docs/snippets/php/plugins/plugin_testing.md
Normal file
69
docs/snippets/php/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,69 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
use PHPUnit\Framework\TestCase;
|
||||
|
||||
/**
 * PHPUnit test case exercising a post-processor plugin (WordCountProcessor)
 * against a hand-built result object.
 */
class CustomPluginTest extends TestCase
{
    private object $plugin;
    private object $mockResult;
    private object $mockConfig;

    /** Builds a fresh fake result, an empty config and an initialized plugin. */
    protected function setUp(): void
    {
        // Create mock extraction result
        $this->mockResult = (object)[
            'content' => 'Test content with some words',
            'mime_type' => 'text/plain',
            'metadata' => [],
            'tables' => [],
            'detected_languages' => ['eng'],
            'chunks' => null,
            'images' => null,
        ];

        // Create mock extraction config
        $this->mockConfig = (object)[];

        // Initialize plugin
        $this->plugin = new WordCountProcessor();
        $this->plugin->initialize();
    }

    protected function tearDown(): void
    {
        $this->plugin->shutdown();
    }

    public function testPluginInitialization(): void
    {
        $this->assertNotNull($this->plugin);
        // PHPUnit has no assertEqual(); assertSame() takes (expected, actual)
        // and also checks the type.
        $this->assertSame("word-count", $this->plugin->name());
    }

    public function testPluginProcessing(): void
    {
        // Test that plugin processes results
        $this->plugin->process($this->mockResult, $this->mockConfig);

        $this->assertArrayHasKey('word_count', $this->mockResult->metadata);
        $this->assertGreaterThan(0, $this->mockResult->metadata['word_count']);
    }

    public function testShouldProcess(): void
    {
        // Test shouldProcess logic
        $this->assertTrue($this->plugin->shouldProcess($this->mockResult, $this->mockConfig));

        // Empty content should not process
        $emptyResult = (object)['content' => ''];
        $this->assertFalse($this->plugin->shouldProcess($emptyResult, $this->mockConfig));
    }

    public function testProcessingStage(): void
    {
        $stage = $this->plugin->processingStage();
        $this->assertSame("Early", $stage);
    }

    public function testPriority(): void
    {
        // The priority is expected to fall in the 0..255 range.
        $priority = $this->plugin->priority();
        $this->assertGreaterThanOrEqual(0, $priority);
        $this->assertLessThanOrEqual(255, $priority);
    }
}
|
||||
```
|
||||
52
docs/snippets/php/plugins/plugin_validator.md
Normal file
52
docs/snippets/php/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example validator that enforces lower and upper bounds on the number of
 * characters in the extracted content.
 */
class ContentQualityValidator implements Validator
{
    /** Smallest accepted content length, in characters. */
    private int $minLength = 10;

    /** Largest accepted content length, in characters. */
    private int $maxLength = 1000000;

    public function name(): string
    {
        return "content-quality-validator";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        // Load validation rules or patterns
    }

    public function shutdown(): void
    {
        // Cleanup resources
    }

    /**
     * Checks the content length of $result against both bounds.
     *
     * @param object $result Extraction result; its content property is read.
     * @param object $config Extraction configuration (not used by this validator).
     *
     * @throws Exception When the content is shorter than the minimum or longer
     *                   than the maximum.
     */
    public function validate(object $result, object $config): void
    {
        // mb_strlen() counts characters, matching the unit named in the error
        // messages; strlen() would count bytes.
        $contentLength = mb_strlen($result->content);

        if ($contentLength < $this->minLength) {
            throw new Exception(
                "Content too short: $contentLength < {$this->minLength} characters"
            );
        }

        if ($contentLength > $this->maxLength) {
            throw new Exception(
                "Content too long: $contentLength > {$this->maxLength} characters"
            );
        }
    }

    public function priority(): int
    {
        return 100;
    }
}

// Register the validator
$validator = new ContentQualityValidator();
Kreuzberg::registerValidator($validator);

echo "Content quality validator registered\n";
|
||||
```
|
||||
70
docs/snippets/php/plugins/quality_score_validator.md
Normal file
70
docs/snippets/php/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,70 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example validator that derives a heuristic quality score from a result and
 * rejects results scoring below a fixed threshold.
 */
class QualityScoreValidator implements Validator
{
    /** Scores below this value fail validation. */
    private float $minQualityScore = 0.7;

    public function name(): string
    {
        return "quality-score-validator";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        // Load quality scoring models or rules
    }

    public function shutdown(): void
    {
        // Cleanup resources
    }

    /**
     * Scores $result and throws when the score is under the threshold.
     *
     * @throws Exception When the computed score is below the minimum.
     */
    public function validate(object $result, object $config): void
    {
        $computed = $this->calculateQualityScore($result);

        if ($computed >= $this->minQualityScore) {
            return;
        }

        $message = sprintf(
            "Quality score too low: %.2f < %.2f",
            $computed,
            $this->minQualityScore
        );

        throw new Exception($message);
    }

    public function priority(): int
    {
        return 90;
    }

    /**
     * Starts at 1.0, applies penalties for short content and for more than
     * five processing warnings, applies a bonus when languages were detected,
     * and caps the result at 1.0.
     */
    private function calculateQualityScore(object $result): float
    {
        $quality = 1.0;

        // Short content (under 100 bytes) costs 20%.
        $isShort = strlen($result->content) < 100;
        if ($isShort) {
            $quality *= 0.8;
        }

        // More than five processing warnings costs 10%.
        $hasManyWarnings = isset($result->processing_warnings)
            && count($result->processing_warnings) > 5;
        if ($hasManyWarnings) {
            $quality *= 0.9;
        }

        // A non-empty detected-language list earns a 5% bonus.
        if (!empty($result->detected_languages)) {
            $quality *= 1.05;
        }

        return min(1.0, $quality);
    }
}

// Create the validator and hand it to the validator registry.
$validator = new QualityScoreValidator();
Kreuzberg::registerValidator($validator);

echo "Quality score validator registered\n";
|
||||
```
|
||||
79
docs/snippets/php/plugins/stateful_plugin.md
Normal file
79
docs/snippets/php/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,79 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example post-processor that keeps state between calls: a call counter and a
 * small cache holding the last MIME type and timestamp seen.
 */
class StatefulPlugin implements PostProcessor
{
    /** Number of process() calls since the last initialize(). */
    private int $callCount = 0;

    /** Holds 'last_mime' and 'last_timestamp' from the most recent call. */
    private array $cache = [];

    public function name(): string
    {
        return "stateful-plugin";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    /** Resets the counter and the cache, then logs the initialization. */
    public function initialize(): void
    {
        $this->cache = [];
        $this->callCount = 0;
        error_log("StatefulPlugin initialized");
    }

    public function shutdown(): void
    {
        error_log("StatefulPlugin called {$this->callCount} times total");
    }

    /**
     * Bumps the call counter, remembers the MIME type and current time, and
     * copies the counter and cached MIME type into array metadata.
     */
    public function process(object &$result, object $config): void
    {
        ++$this->callCount;

        // Remember what was seen on this call.
        $this->cache['last_mime'] = $result->mime_type;
        $this->cache['last_timestamp'] = time();

        // Start from an empty array when no metadata is present yet.
        $result->metadata ??= [];

        if (!is_array($result->metadata)) {
            return;
        }

        $result->metadata['plugin_call_count'] = $this->callCount;
        $result->metadata['cached_mime'] = $this->cache['last_mime'] ?? 'none';
    }

    public function processingStage(): string
    {
        return "Middle";
    }

    public function shouldProcess(object $result, object $config): bool
    {
        // Always process to track state
        return true;
    }

    public function estimatedDurationMs(object $result): int
    {
        // State tracking is minimal overhead
        return 2;
    }

    public function priority(): int
    {
        return 50;
    }

    /** Returns the number of process() calls counted so far. */
    public function getCallCount(): int
    {
        return $this->callCount;
    }

    /** Returns the cache array as it currently stands. */
    public function getCache(): array
    {
        return $this->cache;
    }
}

// Create the plugin and hand it to the post-processor registry.
$plugin = new StatefulPlugin();
Kreuzberg::registerPostProcessor($plugin);

echo "Stateful plugin registered\n";
// Can later retrieve state: $plugin->getCallCount(), $plugin->getCache()
|
||||
```
|
||||
16
docs/snippets/php/plugins/unregister_plugins.md
Normal file
16
docs/snippets/php/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
// Unregister everything by clearing each registry in turn: OCR backends,
// then post-processors, then validators.
Kreuzberg::clearOcrBackends();
Kreuzberg::clearPostProcessors();
Kreuzberg::clearValidators();

// Report completion once all three registries have been cleared.
echo "All plugins unregistered\n";
|
||||
```
|
||||
63
docs/snippets/php/plugins/word_count_processor.md
Normal file
63
docs/snippets/php/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```php title="PHP"
|
||||
<?php declare(strict_types=1);
|
||||
|
||||
use Kreuzberg\Kreuzberg;
|
||||
|
||||
/**
 * Example post-processor that counts whitespace-separated words in the
 * content and stores the count in the result metadata.
 */
class WordCountProcessor implements PostProcessor
{
    public function name(): string
    {
        return "word-count";
    }

    public function version(): string
    {
        return "1.0.0";
    }

    public function initialize(): void
    {
        // Initialize word counting resources
    }

    public function shutdown(): void
    {
        // Cleanup resources
    }

    /**
     * Writes the word count to $result->metadata['word_count'].
     *
     * Missing metadata becomes an empty array; non-array metadata is cast to
     * an array before the count is added.
     */
    public function process(object &$result, object $config): void
    {
        $words = preg_split('/\s+/', trim($result->content), -1, PREG_SPLIT_NO_EMPTY);
        $total = count($words);

        // Start from an empty array when no metadata is present yet.
        $result->metadata ??= [];

        // Normalise non-array metadata so the key can be assigned.
        if (!is_array($result->metadata)) {
            $result->metadata = (array)$result->metadata;
        }

        $result->metadata['word_count'] = $total;
    }

    public function processingStage(): string
    {
        return "Early";
    }

    /** Skips results whose content is empty. */
    public function shouldProcess(object $result, object $config): bool
    {
        $hasContent = !empty($result->content);

        return $hasContent;
    }

    public function estimatedDurationMs(object $result): int
    {
        // Word counting is very fast
        return 1;
    }

    public function priority(): int
    {
        return 50;
    }
}

// Create the processor and hand it to the post-processor registry.
$processor = new WordCountProcessor();
Kreuzberg::registerPostProcessor($processor);

echo "Word-count processor registered\n";
|
||||
```
|
||||
Reference in New Issue
Block a user