This commit is contained in:
64
packages/php/src/DocumentExtractor.php
Normal file
64
packages/php/src/DocumentExtractor.php
Normal file
@@ -0,0 +1,64 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Kreuzberg;
|
||||
|
||||
/**
 * Plugin interface for DocumentExtractor.
 *
 * Implement this interface and register an instance with the corresponding
 * registration function to provide custom behavior for extraction.
 *
 * Every method declared here is abstract: PHP interfaces cannot carry default
 * implementations, so implementers must define all of them, including the
 * ones described as "Optional" below.
 */
interface DocumentExtractor
{
    /**
     * Extract content from a byte array.
     *
     * @param mixed $content
     * @param string $mime_type
     * @param mixed $config
     * @return mixed Return value from the plugin method
     */
    public function extract_bytes(mixed $content, string $mime_type, mixed $config): mixed;

    /**
     * Extract content from a file.
     *
     * @param mixed $path
     * @param string $mime_type
     * @param mixed $config
     * @return mixed Return value from the plugin method
     */
    public function extract_file(mixed $path, string $mime_type, mixed $config): mixed;

    /**
     * Get the list of MIME types supported by this extractor.
     *
     * @return mixed Return value from the plugin method
     *               (presumably a list of MIME type strings — confirm against the native extension)
     */
    public function supported_mime_types(): mixed;

    /**
     * Get the priority of this extractor.
     *
     * @return mixed Return value from the plugin method
     */
    public function priority(): mixed;

    /**
     * Optional: Check if this extractor can handle a specific file.
     *
     * "Optional" describes the plugin contract only; the method must still be
     * implemented to satisfy this interface.
     *
     * @param mixed $_path
     * @param string $_mime_type
     * @return mixed Return value from the plugin method
     */
    public function can_handle(mixed $_path, string $_mime_type): mixed;
}
|
||||
33
packages/php/src/EmbeddingBackend.php
Normal file
33
packages/php/src/EmbeddingBackend.php
Normal file
@@ -0,0 +1,33 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Kreuzberg;
|
||||
|
||||
/**
 * Plugin interface for EmbeddingBackend.
 *
 * Implement this interface and register an instance with the corresponding
 * registration function to provide custom behavior for extraction.
 */
interface EmbeddingBackend
{
    /**
     * Embedding vector dimension. Must be `> 0` and must match the length of
     * the vectors this backend produces.
     *
     * NOTE(review): the sentence above was truncated in the generated source
     * after "the length of"; the completion is presumed to refer to the
     * vectors returned by `embed()` — confirm against the core documentation.
     *
     * @return mixed Return value from the plugin method
     */
    public function dimensions(): mixed;

    /**
     * Embed a batch of texts, returning one vector per input in order.
     *
     * @param mixed $texts
     * @return mixed Return value from the plugin method
     */
    public function embed(mixed $texts): mixed;
}
|
||||
587
packages/php/src/Kreuzberg.php
Normal file
587
packages/php/src/Kreuzberg.php
Normal file
@@ -0,0 +1,587 @@
|
||||
<?php
|
||||
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Kreuzberg;
|
||||
|
||||
/**
 * Static facade over the native extension class `\Kreuzberg\KreuzbergApi`.
 *
 * Every method forwards its arguments to the identically named static method
 * on `KreuzbergApi`. The only logic added here is substituting a freshly
 * constructed `ExtractionConfig` / `EmbeddingConfig` when the optional
 * `$config` argument is null.
 *
 * NOTE(review): the method descriptions appear to be carried over from the
 * Rust core API (hence the references to Tokio, WASM, `None` and snake_case
 * function names such as `extract_file`); read them as describing the core
 * behaviour behind the corresponding camelCase PHP method.
 */
final class Kreuzberg
{
    /**
     * Extract content from a byte array.
     *
     * This is the main entry point for in-memory extraction. It performs the following steps:
     * 1. Validate MIME type
     * 2. Handle legacy format conversion if needed
     * 3. Select appropriate extractor from registry
     * 4. Extract content
     * 5. Run post-processing pipeline
     *
     * @param string $content
     * @param string $mime_type
     * @param ?ExtractionConfig $config Null is replaced by `new ExtractionConfig()`.
     * @return ExtractionResult
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function extractBytes(
        string $content,
        string $mime_type,
        ?ExtractionConfig $config = null
    ): ExtractionResult {
        return \Kreuzberg\KreuzbergApi::extractBytes($content, $mime_type, $config ?? new ExtractionConfig());
    }

    /**
     * Extract content from a file.
     *
     * This is the main entry point for file-based extraction. It performs the following steps:
     * 1. Check cache for existing result (if caching enabled)
     * 2. Detect or validate MIME type
     * 3. Select appropriate extractor from registry
     * 4. Extract content
     * 5. Run post-processing pipeline
     * 6. Store result in cache (if caching enabled)
     *
     * @param string $path
     * @param ?string $mime_type Passed through unchanged, including null.
     * @param ?ExtractionConfig $config Null is replaced by `new ExtractionConfig()`.
     * @return ExtractionResult
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function extractFile(
        string $path,
        ?string $mime_type = null,
        ?ExtractionConfig $config = null
    ): ExtractionResult {
        return \Kreuzberg\KreuzbergApi::extractFile($path, $mime_type, $config ?? new ExtractionConfig());
    }

    /**
     * Synchronous wrapper for `extract_file`.
     *
     * This is a convenience function that blocks the current thread until extraction completes.
     * For async code, use `extract_file` directly.
     *
     * Uses the global Tokio runtime for 100x+ performance improvement over creating
     * a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
     *
     * This function is only available with the `tokio-runtime` feature. For WASM targets,
     * use a truly synchronous extraction approach instead.
     *
     * @param string $path
     * @param ?string $mime_type Passed through unchanged, including null.
     * @param ?ExtractionConfig $config Null is replaced by `new ExtractionConfig()`.
     * @return ExtractionResult
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function extractFileSync(
        string $path,
        ?string $mime_type = null,
        ?ExtractionConfig $config = null
    ): ExtractionResult {
        return \Kreuzberg\KreuzbergApi::extractFileSync($path, $mime_type, $config ?? new ExtractionConfig());
    }

    /**
     * Synchronous wrapper for `extract_bytes`.
     *
     * Uses the global Tokio runtime for 100x+ performance improvement over creating
     * a new runtime per call.
     *
     * With the `tokio-runtime` feature, this blocks the current thread using the global
     * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
     *
     * @param string $content
     * @param string $mime_type
     * @param ?ExtractionConfig $config Null is replaced by `new ExtractionConfig()`.
     * @return ExtractionResult
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function extractBytesSync(
        string $content,
        string $mime_type,
        ?ExtractionConfig $config = null
    ): ExtractionResult {
        return \Kreuzberg\KreuzbergApi::extractBytesSync($content, $mime_type, $config ?? new ExtractionConfig());
    }

    /**
     * Synchronous wrapper for `batch_extract_files`.
     *
     * Uses the global Tokio runtime for optimal performance.
     * Only available with `tokio-runtime` (WASM has no filesystem).
     *
     * @param array<BatchFileItem> $items
     * @param ?ExtractionConfig $config Null is replaced by `new ExtractionConfig()`.
     * @return array<ExtractionResult>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function batchExtractFilesSync(array $items, ?ExtractionConfig $config = null): array
    {
        return \Kreuzberg\KreuzbergApi::batchExtractFilesSync($items, $config ?? new ExtractionConfig());
    }

    /**
     * Synchronous wrapper for `batch_extract_bytes`.
     *
     * Uses the global Tokio runtime for optimal performance.
     * With the `tokio-runtime` feature, this blocks the current thread using the global
     * Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
     * that iterates through items and calls `extract_bytes_sync()`.
     *
     * @param array<BatchBytesItem> $items
     * @param ?ExtractionConfig $config Null is replaced by `new ExtractionConfig()`.
     * @return array<ExtractionResult>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function batchExtractBytesSync(array $items, ?ExtractionConfig $config = null): array
    {
        return \Kreuzberg\KreuzbergApi::batchExtractBytesSync($items, $config ?? new ExtractionConfig());
    }

    /**
     * Extract content from multiple files concurrently.
     *
     * This function processes multiple files in parallel, automatically managing
     * concurrency to prevent resource exhaustion. The concurrency limit can be
     * configured via `ExtractionConfig::max_concurrent_extractions` or defaults
     * to `(num_cpus * 1.5).ceil()`.
     *
     * Each file can optionally specify a [`FileExtractionConfig`] that overrides specific
     * fields from the batch-level `config`. Pass `None` for a file to use the batch defaults.
     * Batch-level settings like `max_concurrent_extractions` and `use_cache` are always
     * taken from the batch-level `config`.
     *
     * @param array<BatchFileItem> $items
     * @param ?ExtractionConfig $config Null is replaced by `new ExtractionConfig()`.
     * @return array<ExtractionResult>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function batchExtractFiles(array $items, ?ExtractionConfig $config = null): array
    {
        return \Kreuzberg\KreuzbergApi::batchExtractFiles($items, $config ?? new ExtractionConfig());
    }

    /**
     * Extract content from multiple byte arrays concurrently.
     *
     * This function processes multiple byte arrays in parallel, automatically managing
     * concurrency to prevent resource exhaustion. The concurrency limit can be
     * configured via `ExtractionConfig::max_concurrent_extractions` or defaults
     * to `(num_cpus * 1.5).ceil()`.
     *
     * Each item can optionally specify a [`FileExtractionConfig`] that overrides specific
     * fields from the batch-level `config`. Pass `None` as the config to use
     * the batch-level defaults for that item.
     *
     * @param array<BatchBytesItem> $items
     * @param ?ExtractionConfig $config Null is replaced by `new ExtractionConfig()`.
     * @return array<ExtractionResult>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function batchExtractBytes(array $items, ?ExtractionConfig $config = null): array
    {
        return \Kreuzberg\KreuzbergApi::batchExtractBytes($items, $config ?? new ExtractionConfig());
    }

    /**
     * Detect MIME type from raw file bytes.
     *
     * Uses magic byte signatures to detect file type from content.
     * Falls back to `infer` crate for comprehensive detection.
     *
     * For ZIP-based files, inspects contents to distinguish Office Open XML
     * formats (DOCX, XLSX, PPTX) from plain ZIP archives.
     *
     * @param string $content
     * @return string
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function detectMimeTypeFromBytes(string $content): string
    {
        return \Kreuzberg\KreuzbergApi::detectMimeTypeFromBytes($content);
    }

    /**
     * Get file extensions for a given MIME type.
     *
     * Returns all known file extensions that map to the specified MIME type.
     *
     * @param string $mime_type
     * @return array<string>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function getExtensionsForMime(string $mime_type): array
    {
        return \Kreuzberg\KreuzbergApi::getExtensionsForMime($mime_type);
    }

    /**
     * List the names of all registered embedding backends.
     *
     * Used by `kreuzberg-cli`, the api/mcp endpoints, and generated language
     * bindings.
     *
     * @return array<string>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function listEmbeddingBackends(): array
    {
        return \Kreuzberg\KreuzbergApi::listEmbeddingBackends();
    }

    /**
     * List names of all registered document extractors.
     *
     * @return array<string>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function listDocumentExtractors(): array
    {
        return \Kreuzberg\KreuzbergApi::listDocumentExtractors();
    }

    /**
     * List all registered OCR backends.
     *
     * Returns the names of all OCR backends currently registered in the global registry.
     *
     * @return array<string>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function listOcrBackends(): array
    {
        return \Kreuzberg\KreuzbergApi::listOcrBackends();
    }

    /**
     * List all registered post-processor names.
     *
     * Returns a vector of all post-processor names currently registered in the
     * global registry.
     *
     * @return array<string>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function listPostProcessors(): array
    {
        return \Kreuzberg\KreuzbergApi::listPostProcessors();
    }

    /**
     * List names of all registered renderers.
     *
     * @return array<string>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function listRenderers(): array
    {
        return \Kreuzberg\KreuzbergApi::listRenderers();
    }

    /**
     * List names of all registered validators.
     *
     * @return array<string>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function listValidators(): array
    {
        return \Kreuzberg\KreuzbergApi::listValidators();
    }

    /**
     * Compare two extraction results and return a structured diff.
     *
     * The comparison is purely structural — no I/O, no side effects. All fields
     * of [`ExtractionDiff`] are populated according to the provided [`DiffOptions`].
     *
     * Unlike the `$config` arguments elsewhere in this class, `$opts` is required.
     *
     * @param ExtractionResult $a
     * @param ExtractionResult $b
     * @param DiffOptions $opts
     * @return ExtractionDiff
     */
    public static function compare(ExtractionResult $a, ExtractionResult $b, DiffOptions $opts): ExtractionDiff
    {
        return \Kreuzberg\KreuzbergApi::compare($a, $b, $opts);
    }

    /**
     * Generate embeddings asynchronously for a list of text strings.
     *
     * This is the async counterpart to [`embed_texts`]. It offloads the blocking
     * ONNX inference work to a dedicated blocking thread pool via Tokio's
     * `spawn_blocking`, keeping the async executor free.
     *
     * Returns one embedding vector per input text in the same order.
     *
     * The PHP signature returns the finished array rather than a promise-like
     * handle, so the PHP caller receives the result only once the call returns.
     *
     * @param array<string> $texts
     * @param ?EmbeddingConfig $config Null is replaced by `new EmbeddingConfig()`.
     * @return array<array<float>>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function embedTextsAsync(array $texts, ?EmbeddingConfig $config = null): array
    {
        return \Kreuzberg\KreuzbergApi::embedTextsAsync($texts, $config ?? new EmbeddingConfig());
    }

    /**
     * Render a single PDF page to PNG bytes.
     *
     * Returns raw PNG-encoded bytes for the specified page at the given DPI.
     * Uses pdf_oxide with tiny-skia for pure-Rust rendering.
     *
     * @param string $pdf_bytes
     * @param int $page_index
     * @param ?int $dpi Passed through unchanged, including null.
     * @param ?string $password Passed through unchanged, including null.
     * @return string
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function renderPdfPageToPng(
        string $pdf_bytes,
        int $page_index,
        ?int $dpi = null,
        ?string $password = null
    ): string {
        return \Kreuzberg\KreuzbergApi::renderPdfPageToPng($pdf_bytes, $page_index, $dpi, $password);
    }

    /**
     * Detect the MIME type of a file at the given path.
     *
     * Uses the file extension and optionally the file content to determine the MIME type.
     * Set `check_exists` to `true` to verify the file exists before detection.
     *
     * @param string $path
     * @param bool $check_exists
     * @return string
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function detectMimeType(string $path, bool $check_exists): string
    {
        return \Kreuzberg\KreuzbergApi::detectMimeType($path, $check_exists);
    }

    /**
     * Embed a list of texts using the configured embedding model.
     *
     * Returns a 2D vector where each inner vector is the embedding for the corresponding text.
     *
     * @param array<string> $texts
     * @param ?EmbeddingConfig $config Null is replaced by `new EmbeddingConfig()`.
     * @return array<array<float>>
     * @throws \Kreuzberg\KreuzbergException
     */
    public static function embedTexts(array $texts, ?EmbeddingConfig $config = null): array
    {
        return \Kreuzberg\KreuzbergApi::embedTexts($texts, $config ?? new EmbeddingConfig());
    }

    /**
     * Get an embedding preset by name.
     *
     * Returns `None` (null in PHP, per the nullable return type) if no preset
     * with the given name exists. Returns an owned clone so the value is safe
     * to pass across FFI boundaries.
     *
     * @param string $name
     * @return ?EmbeddingPreset
     */
    public static function getEmbeddingPreset(string $name): ?EmbeddingPreset
    {
        return \Kreuzberg\KreuzbergApi::getEmbeddingPreset($name);
    }

    /**
     * List the names of all available embedding presets.
     *
     * Returns owned `String`s so the values are safe to pass across FFI boundaries.
     *
     * @return array<string>
     */
    public static function listEmbeddingPresets(): array
    {
        return \Kreuzberg\KreuzbergApi::listEmbeddingPresets();
    }

    /**
     * Register a custom OCR backend by forwarding it to the native extension.
     *
     * @param OcrBackend $backend
     * @return void
     */
    public static function registerOcrBackend(OcrBackend $backend): void
    {
        \Kreuzberg\KreuzbergApi::registerOcrBackend($backend);
    }

    /**
     * Forward an unregister request for the OCR backend named `$name` to the native extension.
     *
     * @param string $name
     * @return void
     */
    public static function unregisterOcrBackend(string $name): void
    {
        \Kreuzberg\KreuzbergApi::unregisterOcrBackend($name);
    }

    /**
     * Forward a request to clear the registered OCR backends to the native extension.
     *
     * @return void
     */
    public static function clearOcrBackends(): void
    {
        \Kreuzberg\KreuzbergApi::clearOcrBackends();
    }

    /**
     * Register a custom post-processor by forwarding it to the native extension.
     *
     * @param PostProcessor $backend
     * @return void
     */
    public static function registerPostProcessor(PostProcessor $backend): void
    {
        \Kreuzberg\KreuzbergApi::registerPostProcessor($backend);
    }

    /**
     * Forward an unregister request for the post-processor named `$name` to the native extension.
     *
     * @param string $name
     * @return void
     */
    public static function unregisterPostProcessor(string $name): void
    {
        \Kreuzberg\KreuzbergApi::unregisterPostProcessor($name);
    }

    /**
     * Forward a request to clear the registered post-processors to the native extension.
     *
     * @return void
     */
    public static function clearPostProcessors(): void
    {
        \Kreuzberg\KreuzbergApi::clearPostProcessors();
    }

    /**
     * Register a custom validator by forwarding it to the native extension.
     *
     * @param Validator $backend
     * @return void
     */
    public static function registerValidator(Validator $backend): void
    {
        \Kreuzberg\KreuzbergApi::registerValidator($backend);
    }

    /**
     * Forward an unregister request for the validator named `$name` to the native extension.
     *
     * @param string $name
     * @return void
     */
    public static function unregisterValidator(string $name): void
    {
        \Kreuzberg\KreuzbergApi::unregisterValidator($name);
    }

    /**
     * Forward a request to clear the registered validators to the native extension.
     *
     * @return void
     */
    public static function clearValidators(): void
    {
        \Kreuzberg\KreuzbergApi::clearValidators();
    }

    /**
     * Register a custom embedding backend by forwarding it to the native extension.
     *
     * @param EmbeddingBackend $backend
     * @return void
     */
    public static function registerEmbeddingBackend(EmbeddingBackend $backend): void
    {
        \Kreuzberg\KreuzbergApi::registerEmbeddingBackend($backend);
    }

    /**
     * Forward an unregister request for the embedding backend named `$name` to the native extension.
     *
     * @param string $name
     * @return void
     */
    public static function unregisterEmbeddingBackend(string $name): void
    {
        \Kreuzberg\KreuzbergApi::unregisterEmbeddingBackend($name);
    }

    /**
     * Forward a request to clear the registered embedding backends to the native extension.
     *
     * @return void
     */
    public static function clearEmbeddingBackends(): void
    {
        \Kreuzberg\KreuzbergApi::clearEmbeddingBackends();
    }

    /**
     * Register a custom document extractor by forwarding it to the native extension.
     *
     * @param DocumentExtractor $backend
     * @return void
     */
    public static function registerDocumentExtractor(DocumentExtractor $backend): void
    {
        \Kreuzberg\KreuzbergApi::registerDocumentExtractor($backend);
    }

    /**
     * Forward an unregister request for the document extractor named `$name` to the native extension.
     *
     * @param string $name
     * @return void
     */
    public static function unregisterDocumentExtractor(string $name): void
    {
        \Kreuzberg\KreuzbergApi::unregisterDocumentExtractor($name);
    }

    /**
     * Forward a request to clear the registered document extractors to the native extension.
     *
     * @return void
     */
    public static function clearDocumentExtractors(): void
    {
        \Kreuzberg\KreuzbergApi::clearDocumentExtractors();
    }

    /**
     * Register a custom renderer by forwarding it to the native extension.
     *
     * @param Renderer $backend
     * @return void
     */
    public static function registerRenderer(Renderer $backend): void
    {
        \Kreuzberg\KreuzbergApi::registerRenderer($backend);
    }

    /**
     * Forward an unregister request for the renderer named `$name` to the native extension.
     *
     * @param string $name
     * @return void
     */
    public static function unregisterRenderer(string $name): void
    {
        \Kreuzberg\KreuzbergApi::unregisterRenderer($name);
    }

    /**
     * Forward a request to clear the registered renderers to the native extension.
     *
     * @return void
     */
    public static function clearRenderers(): void
    {
        \Kreuzberg\KreuzbergApi::clearRenderers();
    }
}
|
||||
87
packages/php/src/OcrBackend.php
Normal file
87
packages/php/src/OcrBackend.php
Normal file
@@ -0,0 +1,87 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Kreuzberg;
|
||||
|
||||
/**
 * Plugin interface for OcrBackend.
 *
 * Implement this interface and register an instance with the corresponding
 * registration function to provide custom behavior for extraction.
 *
 * Every method declared here is abstract: PHP interfaces cannot carry default
 * implementations, so implementers must define all of them, including the
 * ones described as "Optional" below.
 */
interface OcrBackend
{
    /**
     * Process an image and extract text via OCR.
     *
     * @param mixed $image_bytes
     * @param mixed $config
     * @return mixed Return value from the plugin method
     */
    public function process_image(mixed $image_bytes, mixed $config): mixed;

    /**
     * Process a file and extract text via OCR.
     *
     * @param mixed $path
     * @param mixed $config
     * @return mixed Return value from the plugin method
     */
    public function process_image_file(mixed $path, mixed $config): mixed;

    /**
     * Check if this backend supports a given language code.
     *
     * @param string $lang
     * @return mixed Return value from the plugin method
     */
    public function supports_language(string $lang): mixed;

    /**
     * Get the backend type identifier.
     *
     * @return mixed Return value from the plugin method
     */
    public function backend_type(): mixed;

    /**
     * Optional: Get a list of all supported languages.
     *
     * "Optional" describes the plugin contract only; the method must still be
     * implemented to satisfy this interface.
     *
     * @return mixed Return value from the plugin method
     */
    public function supported_languages(): mixed;

    /**
     * Optional: Check if the backend supports table detection.
     *
     * "Optional" describes the plugin contract only; the method must still be
     * implemented to satisfy this interface.
     *
     * @return mixed Return value from the plugin method
     */
    public function supports_table_detection(): mixed;

    /**
     * Check if the backend supports direct document-level processing (e.g. for PDFs).
     *
     * @return mixed Return value from the plugin method
     */
    public function supports_document_processing(): mixed;

    /**
     * Process a document file directly via OCR.
     *
     * @param mixed $_path
     * @param mixed $_config
     * @return mixed Return value from the plugin method
     */
    public function process_document(mixed $_path, mixed $_config): mixed;
}
|
||||
61
packages/php/src/PostProcessor.php
Normal file
61
packages/php/src/PostProcessor.php
Normal file
@@ -0,0 +1,61 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Kreuzberg;
|
||||
|
||||
/**
 * Plugin interface for PostProcessor.
 *
 * Implement this interface and register an instance with the corresponding
 * registration function to provide custom behavior for extraction.
 *
 * Every method declared here is abstract: PHP interfaces cannot carry default
 * implementations, so implementers must define all of them, including the
 * ones described as "Optional" below.
 */
interface PostProcessor
{
    /**
     * Process an extraction result.
     *
     * @param mixed $result
     * @param mixed $config
     * @return mixed Return value from the plugin method
     */
    public function process(mixed $result, mixed $config): mixed;

    /**
     * Get the processing stage for this post-processor.
     *
     * @return mixed Return value from the plugin method
     */
    public function processing_stage(): mixed;

    /**
     * Optional: Check if this processor should run for a given result.
     *
     * "Optional" describes the plugin contract only; the method must still be
     * implemented to satisfy this interface.
     *
     * @param mixed $_result
     * @param mixed $_config
     * @return mixed Return value from the plugin method
     */
    public function should_process(mixed $_result, mixed $_config): mixed;

    /**
     * Optional: Estimate processing time in milliseconds.
     *
     * "Optional" describes the plugin contract only; the method must still be
     * implemented to satisfy this interface.
     *
     * @param mixed $_result
     * @return mixed Return value from the plugin method
     */
    public function estimated_duration_ms(mixed $_result): mixed;

    /**
     * Execution priority within the processing stage.
     *
     * @return mixed Return value from the plugin method
     */
    public function priority(): mixed;
}
|
||||
25
packages/php/src/Renderer.php
Normal file
25
packages/php/src/Renderer.php
Normal file
@@ -0,0 +1,25 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Kreuzberg;
|
||||
|
||||
/**
 * Plugin interface for Renderer.
 *
 * Implement this interface and register an instance with the corresponding
 * registration function to provide custom behavior for extraction.
 */
interface Renderer
{
    /**
     * Render an `InternalDocument` to the output format.
     *
     * @param mixed $doc The document to render; declared `mixed` here even though
     *                   the description names `InternalDocument`.
     * @return mixed Return value from the plugin method
     */
    public function render(mixed $doc): mixed;
}
|
||||
44
packages/php/src/Validator.php
Normal file
44
packages/php/src/Validator.php
Normal file
@@ -0,0 +1,44 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace Kreuzberg;
|
||||
|
||||
/**
 * Plugin interface for Validator.
 *
 * Implement this interface and register an instance with the corresponding
 * registration function to provide custom behavior for extraction.
 *
 * Every method declared here is abstract: PHP interfaces cannot carry default
 * implementations, so implementers must define all of them, including the
 * ones described as "Optional" below.
 */
interface Validator
{
    /**
     * Validate an extraction result.
     *
     * @param mixed $result
     * @param mixed $config
     * @return mixed Return value from the plugin method
     */
    public function validate(mixed $result, mixed $config): mixed;

    /**
     * Optional: Check if this validator should run for a given result.
     *
     * "Optional" describes the plugin contract only; the method must still be
     * implemented to satisfy this interface.
     *
     * @param mixed $_result
     * @param mixed $_config
     * @return mixed Return value from the plugin method
     */
    public function should_validate(mixed $_result, mixed $_config): mixed;

    /**
     * Optional: Get the validation priority.
     *
     * "Optional" describes the plugin contract only; the method must still be
     * implemented to satisfy this interface.
     *
     * @return mixed Return value from the plugin method
     */
    public function priority(): mixed;
}
|
||||
Reference in New Issue
Block a user