This commit is contained in:
42
docs/snippets/python/plugins/plugin_extractor.md
Normal file
42
docs/snippets/python/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_document_extractor, ExtractionResult
|
||||
import json
|
||||
|
||||
class CustomJsonExtractor:
    """Example document-extractor plugin that collects string values from JSON.

    An instance of this class is what the snippet hands to
    ``register_document_extractor``. The text it produces is the
    concatenation of every string value found in the parsed document, one
    per line; object keys and non-string scalars are not included.
    """

    def name(self) -> str:
        """Return the identifier of this extractor."""
        return "custom-json-extractor"

    def version(self) -> str:
        """Return the version string of this extractor."""
        return "1.0.0"

    def supported_mime_types(self) -> list[str]:
        """Return the MIME types this extractor declares support for."""
        return ["application/json"]

    def priority(self) -> int:
        """Return this extractor's priority value."""
        return 50

    def extract_bytes(
        self, content: bytes, mime_type: str, config: dict
    ) -> ExtractionResult:
        """Parse ``content`` as JSON and return its string values as text.

        Args:
            content: Raw bytes of the JSON document.
            mime_type: MIME type passed in by the caller; not consulted here.
            config: Extraction configuration; not consulted here.

        Returns:
            A mapping with the collected text under ``"content"`` and
            ``"mime_type"`` set to ``"application/json"``.

        Raises:
            ValueError: If ``content`` cannot be decoded or is not valid JSON
                (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
                ``ValueError`` subclasses).
        """
        # The root of a JSON document may be any JSON value (array, string,
        # number, ...), not only an object, so it is typed as ``object``.
        data: object = json.loads(content)
        text: str = self._extract_text(data)
        return {"content": text, "mime_type": "application/json"}

    def _extract_text(self, obj: object) -> str:
        """Recursively gather the string values contained in ``obj``.

        Args:
            obj: A value produced by ``json.loads``.

        Returns:
            Each string found, followed by a newline, in traversal order.
            Lists are walked element by element and dicts by value; any
            other type (numbers, booleans, ``None``) yields ``""``.
        """
        if isinstance(obj, str):
            return f"{obj}\n"
        if isinstance(obj, list):
            return "".join(self._extract_text(item) for item in obj)
        if isinstance(obj, dict):
            # Only the values are visited; keys are not part of the output.
            return "".join(self._extract_text(v) for v in obj.values())
        return ""

    def initialize(self) -> None:
        """Do nothing; this extractor performs no setup work."""

    def shutdown(self) -> None:
        """Do nothing; this extractor performs no teardown work."""
|
||||
|
||||
# Create the single extractor instance that is registered below.
extractor: CustomJsonExtractor = CustomJsonExtractor()
|
||||
# Hand the instance to Kreuzberg's document-extractor registration function.
register_document_extractor(extractor)
|
||||
```
|
||||
Reference in New Issue
Block a user