This commit is contained in:
15
docs/snippets/python/plugins/clear_plugins.md
Normal file
15
docs/snippets/python/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
clear_document_extractors,
|
||||
clear_post_processors,
|
||||
clear_ocr_backends,
|
||||
clear_validators,
|
||||
)
|
||||
|
||||
# Empty every plugin registry, in the same order as four separate calls would:
# post-processors, validators, OCR backends, then document extractors.
for clear_registry in (
    clear_post_processors,
    clear_validators,
    clear_ocr_backends,
    clear_document_extractors,
):
    clear_registry()

print("All plugins cleared")
|
||||
```
|
||||
44
docs/snippets/python/plugins/embedding_backend.md
Normal file
44
docs/snippets/python/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_embedding_backend, EmbeddingConfig, embed_texts
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
# Wrap an already-loaded embedder (e.g. sentence-transformers, llama-cpp-python,
|
||||
# or a tuned ONNX session) so kreuzberg can call back into it during chunking
|
||||
# and standalone embed requests.
|
||||
class MyEmbedder:
    """Embedding backend that delegates to a sentence-transformers model."""

    def __init__(self):
        # Load the model once; dimensions() and embed() both reuse it.
        self._model = SentenceTransformer("BAAI/bge-base-en-v1.5")

    # Plugin trait hooks
    def name(self) -> str:
        """Return the identifier of this backend."""
        return "my-embedder"

    def version(self) -> str:
        """Return the version string of this backend."""
        return "1.0.0"

    def initialize(self) -> None:
        """Optional warm-up; runs once at registration before dimensions() is cached."""

    def shutdown(self) -> None:
        """Nothing to clean up in this example."""

    # EmbeddingBackend hooks
    def dimensions(self) -> int:
        """Return the embedding width reported by the wrapped model."""
        # Captured once at registration; the dispatcher uses this for shape validation.
        return self._model.get_sentence_embedding_dimension()

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Encode ``texts`` with the wrapped model and return nested Python lists."""
        vectors = self._model.encode(texts)
        return vectors.tolist()
|
||||
|
||||
|
||||
# Register once at startup. Reference by name in config.
register_embedding_backend(MyEmbedder())

texts = ["Hello, world!", "Second text"]
config: EmbeddingConfig = {
    # "name" is the same string that MyEmbedder.name() returns.
    "model": {"type": "plugin", "name": "my-embedder"},
    # Optional: bound the wait on a hung backend (default: 60s; None disables)
    "max_embed_duration_secs": 30,
}
vectors = embed_texts(texts, config)
|
||||
```
|
||||
14
docs/snippets/python/plugins/extractor_registration.md
Normal file
14
docs/snippets/python/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_document_extractor
|
||||
|
||||
class CustomExtractor:
    """Minimal plugin object exposing only the identification hooks."""

    def name(self) -> str:
        """Return the identifier of this extractor."""
        return "custom"

    def version(self) -> str:
        """Return the version string of this extractor."""
        return "1.0.0"
|
||||
|
||||
# Build the plugin instance, then pass it to the document-extractor registry.
extractor: CustomExtractor = CustomExtractor()
register_document_extractor(extractor)

print("Extractor registered")
|
||||
```
|
||||
18
docs/snippets/python/plugins/list_plugins.md
Normal file
18
docs/snippets/python/plugins/list_plugins.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
list_document_extractors,
|
||||
list_post_processors,
|
||||
list_ocr_backends,
|
||||
list_validators,
|
||||
)
|
||||
|
||||
# Query each registry once.
extractors: list[str] = list_document_extractors()
processors: list[str] = list_post_processors()
ocr_backends: list[str] = list_ocr_backends()
validators: list[str] = list_validators()

# One line of output per registry.
print(
    f"Extractors: {extractors}",
    f"Processors: {processors}",
    f"OCR backends: {ocr_backends}",
    f"Validators: {validators}",
    sep="\n",
)
|
||||
```
|
||||
33
docs/snippets/python/plugins/min_length_validator.md
Normal file
33
docs/snippets/python/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_validator, ExtractionResult, ValidationError
|
||||
|
||||
class MinLengthValidator:
    """Validator that rejects results whose content is shorter than a minimum."""

    def __init__(self, min_length: int = 100):
        # Smallest len(result.content) that passes validate().
        self.min_length: int = min_length

    def name(self) -> str:
        return "min_length_validator"

    def version(self) -> str:
        return "1.0.0"

    def priority(self) -> int:
        return 100

    def should_validate(self, result: ExtractionResult) -> bool:
        """Opt in for every result."""
        return True

    def validate(self, result: ExtractionResult) -> None:
        """Raise ValidationError when the content is shorter than ``min_length``."""
        length: int = len(result.content)
        if length >= self.min_length:
            return
        raise ValidationError(f"Content too short: {length}")

    def initialize(self) -> None:
        """No setup needed."""

    def shutdown(self) -> None:
        """No teardown needed."""
|
||||
|
||||
# min_length is spelled out here even though 100 is also the constructor default.
validator = MinLengthValidator(min_length=100)
register_validator(validator)
|
||||
```
|
||||
42
docs/snippets/python/plugins/pdf_metadata_extractor.md
Normal file
42
docs/snippets/python/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_post_processor, ExtractionResult
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PdfMetadataExtractor:
    """Post-processor that stamps bookkeeping metadata onto PDF results."""

    def __init__(self):
        # Number of results handled by process(); also written out as "pdf_order".
        self.processed_count: int = 0

    def name(self) -> str:
        return "pdf_metadata_extractor"

    def version(self) -> str:
        return "1.0.0"

    def description(self) -> str:
        return "Extracts and enriches PDF metadata"

    def processing_stage(self) -> str:
        return "early"

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only for results whose MIME type is application/pdf."""
        return result.mime_type == "application/pdf"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Annotate ``result.metadata`` in place and return the same result object."""
        self.processed_count += 1
        result.metadata["pdf_processed"] = True
        result.metadata["pdf_order"] = self.processed_count
        result.metadata["content_length"] = len(result.content)
        # Reuse version() so the stamped value cannot drift from the plugin version.
        result.metadata["pdf_processor_version"] = self.version()
        return result

    def initialize(self) -> None:
        logger.info("PDF metadata extractor initialized")

    def shutdown(self) -> None:
        # Lazy %-style arguments: the message is only formatted if INFO is enabled.
        logger.info("Processed %d PDFs", self.processed_count)
|
||||
|
||||
# Create one instance and pass it to the post-processor registry.
processor = PdfMetadataExtractor()
register_post_processor(processor)
|
||||
```
|
||||
19
docs/snippets/python/plugins/pdf_only_processor.md
Normal file
19
docs/snippets/python/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionResult, register_post_processor
|
||||
|
||||
class PdfOnlyProcessor:
    """Pass-through post-processor that opts in only for PDF results."""

    def name(self) -> str:
        return "pdf-only-processor"

    def version(self) -> str:
        return "1.0.0"

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only when the result's MIME type is application/pdf."""
        is_pdf = result.mime_type == "application/pdf"
        return is_pdf

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Return the result unchanged."""
        return result
|
||||
|
||||
# Create one instance and pass it to the post-processor registry.
processor = PdfOnlyProcessor()
register_post_processor(processor)
|
||||
```
|
||||
42
docs/snippets/python/plugins/plugin_extractor.md
Normal file
42
docs/snippets/python/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_document_extractor, ExtractionResult
|
||||
import json
|
||||
|
||||
class CustomJsonExtractor:
    """Extractor that flattens every string value in a JSON document into text."""

    def name(self) -> str:
        return "custom-json-extractor"

    def version(self) -> str:
        return "1.0.0"

    def supported_mime_types(self) -> list[str]:
        return ["application/json"]

    def priority(self) -> int:
        return 50

    def extract_bytes(
        self, content: bytes, mime_type: str, config: dict
    ) -> ExtractionResult:
        """Parse ``content`` as JSON and return its string values as the content."""
        text: str = self._extract_text(json.loads(content))
        return {"content": text, "mime_type": "application/json"}

    def _extract_text(self, obj: object) -> str:
        """Recursively collect strings, one per line; other scalars contribute nothing."""
        if isinstance(obj, str):
            return f"{obj}\n"
        if isinstance(obj, list):
            children = obj
        elif isinstance(obj, dict):
            # Only the values are walked; keys are not part of the output.
            children = obj.values()
        else:
            return ""
        return "".join(self._extract_text(child) for child in children)

    def initialize(self) -> None:
        """No setup needed."""

    def shutdown(self) -> None:
        """No teardown needed."""
|
||||
|
||||
# Create one instance and pass it to the document-extractor registry.
extractor = CustomJsonExtractor()
register_document_extractor(extractor)
|
||||
```
|
||||
27
docs/snippets/python/plugins/plugin_logging.md
Normal file
27
docs/snippets/python/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```python title="Python"
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class MyPlugin:
    """Example plugin that logs its lifecycle hooks and each extraction call."""

    def name(self) -> str:
        return "my-plugin"

    def version(self) -> str:
        return "1.0.0"

    def initialize(self) -> None:
        # Lazy %-style arguments: the message is only formatted if INFO is enabled.
        logger.info("Initializing plugin: %s", self.name())

    def shutdown(self) -> None:
        logger.info("Shutting down plugin: %s", self.name())

    def extract_bytes(
        self, content: bytes, mime_type: str, config: dict
    ) -> dict:
        """Log the request and return a result dict echoing ``mime_type``."""
        logger.info("Extracting %s (%d bytes)", mime_type, len(content))
        # Placeholder result: content is left empty in this snippet, so the
        # warning below always fires.
        result: dict = {"content": "", "mime_type": mime_type}
        if not result["content"]:
            logger.warning("Extraction resulted in empty content")
        return result
|
||||
```
|
||||
14
docs/snippets/python/plugins/plugin_testing.md
Normal file
14
docs/snippets/python/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```python title="Python"
|
||||
import pytest
|
||||
from kreuzberg import ExtractionResult
|
||||
|
||||
def test_custom_extractor() -> None:
    """Check that CustomJsonExtractor pulls string values out of a JSON payload."""
    extractor = CustomJsonExtractor()
    json_data: bytes = b'{"message": "Hello, world!"}'
    config: dict = {}
    result: ExtractionResult = extractor.extract_bytes(
        json_data, "application/json", config
    )
    # extract_bytes() returns a plain dict, so fields are read by key;
    # attribute access (result.content) would raise AttributeError.
    assert "Hello, world!" in result["content"]
    assert result["mime_type"] == "application/json"
|
||||
```
|
||||
34
docs/snippets/python/plugins/plugin_validator.md
Normal file
34
docs/snippets/python/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
ExtractionResult,
|
||||
ValidationError,
|
||||
extract_file_sync,
|
||||
register_validator,
|
||||
)
|
||||
|
||||
class MinLengthValidator:
    """Validator that rejects results whose content is shorter than a minimum."""

    def __init__(self, min_length: int = 50):
        # Smallest len(result.content) that passes validate(); defaults to 50.
        self.min_length: int = min_length

    def name(self) -> str:
        return "min_length"

    def version(self) -> str:
        return "1.0.0"

    def validate(self, result: ExtractionResult) -> None:
        """Raise ValidationError when the content is shorter than ``min_length``."""
        content_len: int = len(result.content)
        if content_len < self.min_length:
            raise ValidationError(f"Content too short: {content_len}")

    def should_validate(self, result: ExtractionResult) -> bool:
        """Opt in for every result."""
        return True

    def initialize(self) -> None:
        pass

    def shutdown(self) -> None:
        pass
|
||||
|
||||
# Register the validator before running an extraction.
validator = MinLengthValidator()
register_validator(validator)

result = extract_file_sync("document.pdf")
content_length = len(result.content)
print(f"Content length: {content_length}")
|
||||
```
|
||||
20
docs/snippets/python/plugins/quality_score_validator.md
Normal file
20
docs/snippets/python/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionResult, ValidationError, register_validator
|
||||
|
||||
class QualityValidator:
    """Validator that rejects results whose metadata quality score is below 0.5."""

    def name(self) -> str:
        return "quality-validator"

    def version(self) -> str:
        return "1.0.0"

    def validate(self, result: ExtractionResult) -> None:
        """Raise ValidationError when ``quality_score`` is below 0.5."""
        # A missing "quality_score" key is read as 0.0 and therefore rejected.
        quality: float = result.metadata.get("quality_score", 0.0)
        if quality < 0.5:
            message = f"Quality score too low: {quality:.2f}"
            raise ValidationError(message)
|
||||
|
||||
# Create one instance and pass it to the validator registry.
validator = QualityValidator()
register_validator(validator)
|
||||
```
|
||||
28
docs/snippets/python/plugins/stateful_plugin.md
Normal file
28
docs/snippets/python/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```python title="Python"
|
||||
import threading
|
||||
from kreuzberg import ExtractionResult
|
||||
|
||||
class StatefulPlugin:
    """Plugin that keeps mutable state and updates it while holding a lock."""

    def __init__(self):
        # Held for every update of call_count and cache in process().
        self.lock: threading.Lock = threading.Lock()
        # Number of process() calls so far.
        self.call_count: int = 0
        # Scratch storage; process() records the most recent MIME type here.
        self.cache: dict = {}

    def name(self) -> str:
        return "stateful-plugin"

    def version(self) -> str:
        return "1.0.0"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Record the call under the lock and return the result unchanged."""
        with self.lock:
            self.call_count = self.call_count + 1
            self.cache["last_mime"] = result.mime_type
        return result

    def initialize(self) -> None:
        """No setup needed."""

    def shutdown(self) -> None:
        """No teardown needed."""
|
||||
```
|
||||
20
docs/snippets/python/plugins/unregister_plugins.md
Normal file
20
docs/snippets/python/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
unregister_document_extractor,
|
||||
unregister_post_processor,
|
||||
unregister_ocr_backend,
|
||||
unregister_validator,
|
||||
)
|
||||
|
||||
# One name per registry, in the order the unregister calls below use them.
names: list[str] = [
    "custom-json-extractor",
    "word_count",
    "cloud-ocr",
    "min_length_validator",
]
extractor_name, processor_name, ocr_name, validator_name = names

unregister_document_extractor(extractor_name)
unregister_post_processor(processor_name)
unregister_ocr_backend(ocr_name)
unregister_validator(validator_name)
|
||||
```
|
||||
30
docs/snippets/python/plugins/word_count_processor.md
Normal file
30
docs/snippets/python/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_post_processor, ExtractionResult
|
||||
|
||||
class WordCountProcessor:
    """Post-processor that stores a whitespace-delimited word count in metadata."""

    def name(self) -> str:
        return "word_count"

    def version(self) -> str:
        return "1.0.0"

    def processing_stage(self) -> str:
        return "early"

    def should_process(self, result: ExtractionResult) -> bool:
        """Skip results whose content is empty."""
        return bool(result.content)

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Write ``metadata["word_count"]`` in place and return the same result."""
        words = result.content.split()
        result.metadata["word_count"] = len(words)
        return result

    def initialize(self) -> None:
        """No setup needed."""

    def shutdown(self) -> None:
        """No teardown needed."""
|
||||
|
||||
# Create one instance and pass it to the post-processor registry.
processor = WordCountProcessor()
register_post_processor(processor)
|
||||
```
|
||||
Reference in New Issue
Block a user