Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
```python title="Python"
from kreuzberg import (
clear_document_extractors,
clear_post_processors,
clear_ocr_backends,
clear_validators,
)
# Invoke each imported clear_* function in turn, in the same order as before:
# post-processors, validators, OCR backends, then document extractors.
for clear_registry in (
    clear_post_processors,
    clear_validators,
    clear_ocr_backends,
    clear_document_extractors,
):
    clear_registry()

print("All plugins cleared")
```

View File

@@ -0,0 +1,44 @@
```python title="Python"
from kreuzberg import register_embedding_backend, EmbeddingConfig, embed_texts
from sentence_transformers import SentenceTransformer
# Wrap an already-loaded embedder (e.g. sentence-transformers, llama-cpp-python,
# or a tuned ONNX session) so kreuzberg can call back into it during chunking
# and standalone embed requests.
class MyEmbedder:
    """Embedding backend that delegates to a sentence-transformers model."""

    def __init__(self):
        # Load the model once; dimensions() and embed() both reuse it.
        self._model = SentenceTransformer("BAAI/bge-base-en-v1.5")

    # --- Plugin trait hooks -------------------------------------------------

    def name(self) -> str:
        """Return the backend name ("my-embedder")."""
        return "my-embedder"

    def version(self) -> str:
        """Return the backend version string."""
        return "1.0.0"

    def initialize(self) -> None:
        """Optional warm-up hook.

        Runs once at registration, before dimensions() is cached. Nothing to
        warm up here.
        """
        return None

    def shutdown(self) -> None:
        """Teardown hook; this backend has nothing to release."""
        return None

    # --- EmbeddingBackend hooks ---------------------------------------------

    def dimensions(self) -> int:
        """Return the embedding width reported by the wrapped model.

        Captured once at registration; the dispatcher uses this for shape
        validation.
        """
        model = self._model
        return model.get_sentence_embedding_dimension()

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Encode ``texts`` with the wrapped model and return nested lists."""
        encoded = self._model.encode(texts)
        return encoded.tolist()
# Register once at startup; configs then reference the backend by name.
backend = MyEmbedder()
register_embedding_backend(backend)

config: EmbeddingConfig = {
    "model": {"type": "plugin", "name": "my-embedder"},
    # Optional: bound the wait on a hung backend (default: 60s; None disables)
    "max_embed_duration_secs": 30,
}

# Standalone embed request routed through the config above.
texts = ["Hello, world!", "Second text"]
vectors = embed_texts(texts, config)
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
from kreuzberg import register_document_extractor
class CustomExtractor:
    """Minimal plugin object that exposes only a name and a version."""

    def name(self) -> str:
        """Return the identifier string for this extractor."""
        return "custom"

    def version(self) -> str:
        """Return the version string for this extractor."""
        return "1.0.0"
# Build a single extractor instance and pass it to kreuzberg's
# register_document_extractor, then report that the call returned.
extractor = CustomExtractor()
register_document_extractor(extractor)
print("Extractor registered")
```

View File

@@ -0,0 +1,18 @@
```python title="Python"
from kreuzberg import (
list_document_extractors,
list_post_processors,
list_ocr_backends,
list_validators,
)
# Ask each list_* function for its list of names.
extractors: list[str] = list_document_extractors()
processors: list[str] = list_post_processors()
ocr_backends: list[str] = list_ocr_backends()
validators: list[str] = list_validators()

# Print one labelled line per plugin kind, in the order queried above.
report = (
    ("Extractors", extractors),
    ("Processors", processors),
    ("OCR backends", ocr_backends),
    ("Validators", validators),
)
for label, registered in report:
    print(f"{label}: {registered}")
```

View File

@@ -0,0 +1,33 @@
```python title="Python"
from kreuzberg import register_validator, ExtractionResult, ValidationError
class MinLengthValidator:
    """Validator that rejects results whose content is below a minimum length."""

    def __init__(self, min_length: int = 100):
        # Smallest len(result.content) that validate() accepts.
        self.min_length: int = min_length

    # --- identity -----------------------------------------------------------

    def name(self) -> str:
        """Return the validator's identifier string."""
        return "min_length_validator"

    def version(self) -> str:
        """Return the validator's version string."""
        return "1.0.0"

    def priority(self) -> int:
        """Return this validator's priority value."""
        return 100

    # --- lifecycle ----------------------------------------------------------

    def initialize(self) -> None:
        """No setup is needed."""
        return None

    def shutdown(self) -> None:
        """No teardown is needed."""
        return None

    # --- validation ---------------------------------------------------------

    def should_validate(self, result: ExtractionResult) -> bool:
        """Opt in for every result, regardless of its contents."""
        return True

    def validate(self, result: ExtractionResult) -> None:
        """Raise ValidationError when the content is shorter than ``min_length``.

        Returns None without side effects when the content is long enough.
        """
        observed: int = len(result.content)
        if observed < self.min_length:
            raise ValidationError(f"Content too short: {observed}")
# Create the validator with an explicit 100-character minimum and pass it to
# kreuzberg's register_validator.
validator: MinLengthValidator = MinLengthValidator(min_length=100)
register_validator(validator)
```

View File

@@ -0,0 +1,42 @@
```python title="Python"
from kreuzberg import register_post_processor, ExtractionResult
import logging
logger = logging.getLogger(__name__)


class PdfMetadataExtractor:
    """Post-processor that stamps bookkeeping metadata onto PDF results."""

    def __init__(self):
        # Number of results process() has handled so far.
        self.processed_count: int = 0

    # --- identity -----------------------------------------------------------

    def name(self) -> str:
        """Return the processor's identifier string."""
        return "pdf_metadata_extractor"

    def version(self) -> str:
        """Return the processor's version string."""
        return "1.0.0"

    def description(self) -> str:
        """Return a one-line human-readable summary."""
        return "Extracts and enriches PDF metadata"

    def processing_stage(self) -> str:
        """Return the stage label this processor declares ("early")."""
        return "early"

    # --- lifecycle ----------------------------------------------------------

    def initialize(self) -> None:
        """Log that the processor is ready."""
        logger.info("PDF metadata extractor initialized")

    def shutdown(self) -> None:
        """Log how many results were processed."""
        logger.info(f"Processed {self.processed_count} PDFs")

    # --- processing ---------------------------------------------------------

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only for results whose MIME type is application/pdf."""
        return result.mime_type == "application/pdf"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Bump the counter, write four metadata keys, and return ``result``."""
        self.processed_count += 1
        enrichment = {
            "pdf_processed": True,
            "pdf_order": self.processed_count,
            "content_length": len(result.content),
            "pdf_processor_version": "1.0.0",
        }
        for key, value in enrichment.items():
            result.metadata[key] = value
        return result
# Instantiate the processor and pass it to kreuzberg's register_post_processor.
processor: PdfMetadataExtractor = PdfMetadataExtractor()
register_post_processor(processor)
```

View File

@@ -0,0 +1,19 @@
```python title="Python"
from kreuzberg import ExtractionResult, register_post_processor
class PdfOnlyProcessor:
    """Pass-through post-processor that opts in only for PDF results."""

    def name(self) -> str:
        """Return the processor's identifier string."""
        return "pdf-only-processor"

    def version(self) -> str:
        """Return the processor's version string."""
        return "1.0.0"

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only when the result's MIME type is application/pdf."""
        is_pdf = result.mime_type == "application/pdf"
        return is_pdf

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Return ``result`` unchanged."""
        return result
# Instantiate the processor and pass it to kreuzberg's register_post_processor.
processor: PdfOnlyProcessor = PdfOnlyProcessor()
register_post_processor(processor)
```

View File

@@ -0,0 +1,42 @@
```python title="Python"
from kreuzberg import register_document_extractor, ExtractionResult
import json
class CustomJsonExtractor:
    """Extractor that flattens every string value in a JSON document to text."""

    # --- identity -----------------------------------------------------------

    def name(self) -> str:
        """Return the extractor's identifier string."""
        return "custom-json-extractor"

    def version(self) -> str:
        """Return the extractor's version string."""
        return "1.0.0"

    def supported_mime_types(self) -> list[str]:
        """Return the MIME types this extractor declares support for."""
        return ["application/json"]

    def priority(self) -> int:
        """Return this extractor's priority value."""
        return 50

    # --- lifecycle ----------------------------------------------------------

    def initialize(self) -> None:
        """No setup is needed."""
        return None

    def shutdown(self) -> None:
        """No teardown is needed."""
        return None

    # --- extraction ---------------------------------------------------------

    def extract_bytes(
        self, content: bytes, mime_type: str, config: dict
    ) -> ExtractionResult:
        """Parse ``content`` as JSON and return its string values as text.

        ``mime_type`` and ``config`` are accepted but not consulted. The
        returned mapping always reports "application/json" as its MIME type.
        Invalid JSON propagates the exception raised by ``json.loads``.
        """
        parsed = json.loads(content)
        return {
            "content": self._extract_text(parsed),
            "mime_type": "application/json",
        }

    def _extract_text(self, obj: object) -> str:
        """Recursively collect strings from ``obj``, one per line.

        Strings are emitted with a trailing newline; lists and dict values are
        walked in order; every other type (numbers, booleans, null) yields "".
        """
        if isinstance(obj, str):
            return f"{obj}\n"
        if isinstance(obj, list):
            children = obj
        elif isinstance(obj, dict):
            children = obj.values()
        else:
            return ""
        return "".join(map(self._extract_text, children))
# Instantiate the extractor and pass it to kreuzberg's
# register_document_extractor.
extractor: CustomJsonExtractor = CustomJsonExtractor()
register_document_extractor(extractor)
```

View File

@@ -0,0 +1,27 @@
```python title="Python"
import logging
logger = logging.getLogger(__name__)


class MyPlugin:
    """Skeleton plugin showing where to log in each hook."""

    def name(self) -> str:
        """Return the plugin's identifier string."""
        return "my-plugin"

    def version(self) -> str:
        """Return the plugin's version string."""
        return "1.0.0"

    def initialize(self) -> None:
        """Log that the plugin is starting up."""
        logger.info(f"Initializing plugin: {self.name()}")

    def shutdown(self) -> None:
        """Log that the plugin is shutting down."""
        logger.info(f"Shutting down plugin: {self.name()}")

    def extract_bytes(
        self, content: bytes, mime_type: str, config: dict
    ) -> dict:
        """Log the request and return a result dict with empty content.

        No extraction is performed here, so the empty-content warning is
        always logged. ``config`` is accepted but not consulted.
        """
        logger.info(f"Extracting {mime_type} ({len(content)} bytes)")

        extracted = ""  # placeholder: this skeleton produces no text
        if not extracted:
            logger.warning("Extraction resulted in empty content")

        return {"content": extracted, "mime_type": mime_type}
```

View File

@@ -0,0 +1,14 @@
```python title="Python"
import pytest
from kreuzberg import ExtractionResult
def test_custom_extractor() -> None:
    """Check that CustomJsonExtractor surfaces string values from a JSON payload."""
    extractor = CustomJsonExtractor()
    json_data: bytes = b'{"message": "Hello, world!"}'
    config: dict = {}

    result: ExtractionResult = extractor.extract_bytes(
        json_data, "application/json", config
    )

    # extract_bytes is called directly, and CustomJsonExtractor builds a plain
    # dict, so the fields are read by key; attribute access would raise
    # AttributeError on a dict.
    assert "Hello, world!" in result["content"]
    assert result["mime_type"] == "application/json"
```

View File

@@ -0,0 +1,34 @@
```python title="Python"
from kreuzberg import (
ExtractionResult,
ValidationError,
extract_file_sync,
register_validator,
)
class MinLengthValidator:
    """Validator that rejects results whose content is below a minimum length.

    Args:
        min_length: Smallest acceptable ``len(result.content)``. Defaults to
            50, so ``MinLengthValidator()`` keeps the previously fixed
            threshold.
    """

    def __init__(self, min_length: int = 50):
        self.min_length: int = min_length

    def name(self) -> str:
        """Return the validator's identifier string."""
        return "min_length"

    def version(self) -> str:
        """Return the validator's version string."""
        return "1.0.0"

    def validate(self, result: ExtractionResult) -> None:
        """Check the content length of ``result``.

        Raises:
            ValidationError: If the content is shorter than ``min_length``.
        """
        # Measure once so the comparison and the message use the same value.
        content_len: int = len(result.content)
        if content_len < self.min_length:
            raise ValidationError(f"Content too short: {content_len}")

    def should_validate(self, result: ExtractionResult) -> bool:
        """Opt in for every result, regardless of its contents."""
        return True

    def initialize(self) -> None:
        """No setup is needed."""

    def shutdown(self) -> None:
        """No teardown is needed."""
# Register the validator before extracting.
# NOTE(review): presumably extract_file_sync runs registered validators and
# lets ValidationError propagate — confirm against the kreuzberg docs.
validator: MinLengthValidator = MinLengthValidator()
register_validator(validator)
# Extract a sample file and report how much text came back.
result = extract_file_sync("document.pdf")
print(f"Content length: {len(result.content)}")
```

View File

@@ -0,0 +1,20 @@
```python title="Python"
from kreuzberg import ExtractionResult, ValidationError, register_validator
class QualityValidator:
    """Validator that rejects results whose quality score is below 0.5."""

    def name(self) -> str:
        """Return the validator's identifier string."""
        return "quality-validator"

    def version(self) -> str:
        """Return the validator's version string."""
        return "1.0.0"

    def validate(self, result: ExtractionResult) -> None:
        """Raise ValidationError when ``quality_score`` in metadata is < 0.5.

        A result with no ``quality_score`` entry is scored as 0.0 and
        therefore rejected.
        """
        metadata = result.metadata
        score: float = metadata.get("quality_score", 0.0)
        if score < 0.5:
            message = f"Quality score too low: {score:.2f}"
            raise ValidationError(message)
# Instantiate the validator and pass it to kreuzberg's register_validator.
validator: QualityValidator = QualityValidator()
register_validator(validator)
```

View File

@@ -0,0 +1,28 @@
```python title="Python"
import threading
from kreuzberg import ExtractionResult
class StatefulPlugin:
    """Plugin that keeps mutable state and updates it under a lock."""

    def __init__(self):
        # The lock is held around every update of call_count and cache.
        self.lock: threading.Lock = threading.Lock()
        self.call_count: int = 0
        self.cache: dict = {}

    # --- identity -----------------------------------------------------------

    def name(self) -> str:
        """Return the plugin's identifier string."""
        return "stateful-plugin"

    def version(self) -> str:
        """Return the plugin's version string."""
        return "1.0.0"

    # --- lifecycle ----------------------------------------------------------

    def initialize(self) -> None:
        """No setup is needed."""
        return None

    def shutdown(self) -> None:
        """No teardown is needed."""
        return None

    # --- processing ---------------------------------------------------------

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Count the call, remember the result's MIME type, return ``result``."""
        with self.lock:
            self.call_count += 1
            self.cache["last_mime"] = result.mime_type
        return result
```

View File

@@ -0,0 +1,20 @@
```python title="Python"
from kreuzberg import (
unregister_document_extractor,
unregister_post_processor,
unregister_ocr_backend,
unregister_validator,
)
# One name per plugin kind: extractor, post-processor, OCR backend, validator.
names: list[str] = [
    "custom-json-extractor",
    "word_count",
    "cloud-ocr",
    "min_length_validator",
]

# Unpack by position so each unregister call reads by role, not by index.
extractor_name, processor_name, ocr_backend_name, validator_name = names

unregister_document_extractor(extractor_name)
unregister_post_processor(processor_name)
unregister_ocr_backend(ocr_backend_name)
unregister_validator(validator_name)
```

View File

@@ -0,0 +1,30 @@
```python title="Python"
from kreuzberg import register_post_processor, ExtractionResult
class WordCountProcessor:
    """Post-processor that records a whitespace-delimited word count."""

    # --- identity -----------------------------------------------------------

    def name(self) -> str:
        """Return the processor's identifier string."""
        return "word_count"

    def version(self) -> str:
        """Return the processor's version string."""
        return "1.0.0"

    def processing_stage(self) -> str:
        """Return the stage label this processor declares ("early")."""
        return "early"

    # --- lifecycle ----------------------------------------------------------

    def initialize(self) -> None:
        """No setup is needed."""
        return None

    def shutdown(self) -> None:
        """No teardown is needed."""
        return None

    # --- processing ---------------------------------------------------------

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only when the result has non-empty content."""
        return True if result.content else False

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Store the word count under ``metadata["word_count"]`` and return ``result``."""
        words = result.content.split()
        result.metadata["word_count"] = len(words)
        return result
# Instantiate the processor and pass it to kreuzberg's register_post_processor.
processor: WordCountProcessor = WordCountProcessor()
register_post_processor(processor)
```