# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: smoke."""

from pathlib import Path

import pytest  # noqa: F401

from kreuzberg import extract_bytes, extract_file, ExtractionConfig


def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, or the empty string when *value* is ``None``."""
    if value is None:
        return ""
    return str(value)


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text renderings of *item* used for substring matching.

    The result holds, in order: ``str(item)``, the ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    or ``None`` attribute becomes ``""``), and finally the space-joined
    ``items`` attribute when that attribute is a ``list`` (``""`` otherwise).
    """
    nested = getattr(item, "items", None)
    joined_items = " ".join(map(str, nested)) if isinstance(nested, list) else ""
    own_text = _alef_e2e_text(item)
    attribute_texts = tuple(
        _alef_e2e_text(getattr(item, attribute, None))
        for attribute in ("kind", "name", "source", "alias", "text", "signature")
    )
    return (own_text, *attribute_texts, joined_items)


@pytest.mark.asyncio
async def test_ocr_image_png() -> None:
    """OCR: extract a PNG image passed as raw bytes with a default config.

    In WASM this exercises the Uint8Array bridge parameter and Promise await
    in the generated OcrBackend bridge.
    """
    image_bytes = Path("images/test_hello_world.png").read_bytes()
    result = await extract_bytes(image_bytes, "image/png", ExtractionConfig())
    assert result.mime_type.strip() == "image/png"  # noqa: S101
    text = result.content
    assert len(text) >= 1  # noqa: S101
    # At least one spelling of the greeting must be present in the output.
    expected_any = ["Hello", "World", "hello", "world"]
    assert any(word in text for word in expected_any)  # noqa: S101


@pytest.mark.asyncio
async def test_smoke_docx_basic() -> None:
    """Smoke test: a DOCX file with formatted text yields readable content."""
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    result = await extract_file("docx/fake.docx", docx_mime, ExtractionConfig())
    assert result.mime_type.strip() == docx_mime  # noqa: S101
    text = result.content
    assert len(text) >= 20  # noqa: S101
    expected_any = ["Lorem", "ipsum", "document", "text"]
    assert any(word in text for word in expected_any)  # noqa: S101


@pytest.mark.asyncio
async def test_smoke_html_basic() -> None:
    """Smoke test: an HTML page containing a table is extracted."""
    result = await extract_file("html/simple_table.html", "text/html", ExtractionConfig())
    assert result.mime_type.strip() == "text/html"  # noqa: S101
    text = result.content
    assert len(text) >= 10  # noqa: S101
    expected_any = ["Sample Data Table", "Laptop", "Electronics", "Product"]
    assert any(word in text for word in expected_any)  # noqa: S101


@pytest.mark.asyncio
async def test_smoke_image_png() -> None:
    """Smoke test: a PNG image with OCR disabled (only the MIME type is checked)."""
    # No MIME type is supplied here; the test asserts on the one reported back.
    result = await extract_file("images/sample.png", None, ExtractionConfig(disable_ocr=True))
    assert result.mime_type.strip() == "image/png"  # noqa: S101


@pytest.mark.asyncio
async def test_smoke_json_basic() -> None:
    """Smoke test: a JSON file is extracted to non-trivial content."""
    result = await extract_file("json/simple.json", "application/json", ExtractionConfig())
    assert result.mime_type.strip() == "application/json"  # noqa: S101
    assert len(result.content) >= 5  # noqa: S101


@pytest.mark.asyncio
async def test_smoke_pdf_basic() -> None:
    """Smoke test: a simple text PDF is extracted."""
    result = await extract_file("pdf/fake_memo.pdf", "application/pdf", ExtractionConfig())
    assert result.mime_type.strip() == "application/pdf"  # noqa: S101
    text = result.content
    assert len(text) >= 50  # noqa: S101
    expected_any = ["May 5, 2023", "To Whom it May Concern"]
    assert any(phrase in text for phrase in expected_any)  # noqa: S101


@pytest.mark.asyncio
async def test_smoke_txt_basic() -> None:
    """Smoke test: a plain text file is extracted."""
    result = await extract_file("text/report.txt", "text/plain", ExtractionConfig())
    assert result.mime_type.strip() == "text/plain"  # noqa: S101
    assert len(result.content) >= 5  # noqa: S101


@pytest.mark.asyncio
async def test_smoke_xlsx_basic() -> None:
    """Smoke test: an XLSX workbook with basic spreadsheet data including tables."""
    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    result = await extract_file("xlsx/stanley_cups.xlsx", xlsx_mime, ExtractionConfig())
    assert result.mime_type.strip() == xlsx_mime  # noqa: S101
    text = result.content
    assert len(text) >= 100  # noqa: S101
    # Every one of these header and cell values must appear in the output.
    required = (
        "Team",
        "Location",
        "Stanley Cups",
        "Blues",
        "Flyers",
        "Maple Leafs",
        "STL",
        "PHI",
        "TOR",
    )
    for needle in required:
        assert needle in text  # noqa: S101
    # skipped: field 'tables' not available on result type
    # skipped: field 'metadata.format.excel.sheet_count' not available on result type
    # skipped: field 'metadata.format.excel.sheet_names' not available on result type