# (Extraction artifact: a "1.2 KiB" file-size banner appeared here twice; it is not part of the program.)
from kreuzberg import register_document_extractor, ExtractionResult
import json
class CustomJsonExtractor:
    """Extractor that collects every string value found in a JSON document.

    Instances are handed to ``register_document_extractor`` elsewhere in this
    file. The precise plugin contract is defined by kreuzberg and is not
    visible here; the docstrings below describe only what each method does.
    """

    def name(self) -> str:
        """Return the identifier of this extractor."""
        return "custom-json-extractor"

    def version(self) -> str:
        """Return the version string of this extractor."""
        return "1.0.0"

    def supported_mime_types(self) -> list[str]:
        """Return a new list holding the single MIME type handled here."""
        return ["application/json"]

    def priority(self) -> int:
        """Return this extractor's priority value.

        NOTE(review): how kreuzberg orders priorities is not shown in this
        file -- confirm against the plugin documentation.
        """
        return 50

    def extract_bytes(
        self, content: bytes, mime_type: str, config: dict
    ) -> ExtractionResult:
        """Parse ``content`` as JSON and return its string values as text.

        Args:
            content: Raw JSON payload.
            mime_type: Accepted but not consulted by this implementation.
            config: Accepted but not consulted by this implementation.

        Returns:
            A mapping with ``"content"`` (the collected text) and
            ``"mime_type"`` (always ``"application/json"``).

        Raises:
            json.JSONDecodeError: If ``content`` is not valid JSON (raised by
                ``json.loads``).
        """
        # The top-level JSON value may be any JSON type, not only an object;
        # the recursive helper copes with all of them.
        parsed = json.loads(content)
        return {
            "content": self._extract_text(parsed),
            "mime_type": "application/json",
        }

    def _extract_text(self, obj: object) -> str:
        """Recursively gather the string leaves of ``obj``.

        Each string contributes itself followed by a newline. Lists are
        walked in order and dicts by their values (keys are not included).
        Any other value -- numbers, booleans, ``None`` -- contributes nothing.
        """
        if isinstance(obj, str):
            return f"{obj}\n"

        # Pick the children to descend into; non-containers yield no text.
        if isinstance(obj, dict):
            children = obj.values()
        elif isinstance(obj, list):
            children = obj
        else:
            return ""

        return "".join(map(self._extract_text, children))

    def initialize(self) -> None:
        """Lifecycle hook; this extractor has nothing to set up."""

    def shutdown(self) -> None:
        """Lifecycle hook; this extractor has nothing to release."""
# Build one extractor instance at module import and keep it under a
# module-level name.
extractor: CustomJsonExtractor = CustomJsonExtractor()
# Hand the instance to kreuzberg's registration function.
# NOTE(review): what registration does beyond receiving the object (lifecycle
# calls, priority handling) is defined by kreuzberg -- confirm in its docs.
register_document_extractor(extractor)