# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: format_specific."""

from pathlib import Path

import pytest  # noqa: F401
from kreuzberg import extract_bytes_sync, extract_file_sync


def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, or the empty string when ``value`` is ``None``."""
    if value is None:
        return ""
    return str(value)


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text renderings of ``item`` and a fixed set of its attributes.

    The returned tuple holds, in order: the text of ``item`` itself, the text
    of its ``kind``, ``name``, ``source``, ``alias``, ``text`` and
    ``signature`` attributes (a missing or ``None`` attribute yields ``""``),
    and finally the space-joined ``str`` of every entry in ``item.items``.
    That last element is ``""`` unless ``item.items`` is a ``list``.
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined_members = " ".join(str(member) for member in members)
    else:
        joined_members = ""

    attribute_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names),
        joined_members,
    )


def test_format_docx_standalone() -> None:
    """Extract ``docx/fake.docx`` from bytes; content length must be at least 20."""
    payload = Path("docx/fake.docx").read_bytes()
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    extraction = extract_bytes_sync(payload, docx_mime, None)

    assert len(extraction.content) >= 20  # noqa: S101


def test_format_hwpx_standalone() -> None:
    """Extract ``hwpx/simple.hwpx`` from bytes and check length and a known phrase."""
    payload = Path("hwpx/simple.hwpx").read_bytes()
    hwpx_mime = "application/haansofthwpx"
    extraction = extract_bytes_sync(payload, hwpx_mime, None)

    assert len(extraction.content) >= 20  # noqa: S101
    assert "Hello from HWPX" in extraction.content  # noqa: S101


def test_format_pdf_text() -> None:
    """Extract ``pdf/fake_memo.pdf`` from bytes and check length and expected words."""
    payload = Path("pdf/fake_memo.pdf").read_bytes()
    pdf_mime = "application/pdf"
    extraction = extract_bytes_sync(payload, pdf_mime, None)

    assert len(extraction.content) >= 50  # noqa: S101
    # At least one of the expected words has to appear in the extracted content.
    assert any(needle in extraction.content for needle in ["Mallori", "May"])  # noqa: S101


def test_format_pptx() -> None:
    """Smoke test: extracting ``pptx/simple.pptx`` by path must not raise."""
    # The result is intentionally not inspected; only successful completion matters.
    extract_file_sync(
        "pptx/simple.pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        None,
    )


def test_format_xlsx() -> None:
    """Smoke test: extracting ``xlsx/stanley_cups.xlsx`` by path must not raise."""
    # The result is intentionally not inspected; only successful completion matters.
    extract_file_sync(
        "xlsx/stanley_cups.xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        None,
    )