Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/e2e/python/tests/test_smoke.py
+++ b/e2e/python/tests/test_smoke.py
@@ -0,0 +1,148 @@
+# This file is auto-generated by alef — DO NOT EDIT.
+# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
+# To regenerate: alef generate
+# To verify freshness: alef verify --exit-code
+# Issues & docs: https://github.com/kreuzberg-dev/alef
+
+"""E2e tests for category: smoke."""
+
+from pathlib import Path
+import pytest  # noqa: F401
+from kreuzberg import extract_bytes, extract_file, ExtractionConfig
+
+
+def _alef_e2e_text(value: object) -> str:
+    return str(value) if value is not None else ""  # None renders as empty text, not "None"
+
+
+def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
+    """Return the text forms of ``item``: the item itself, six named attributes, then ``items``.
+
+    Missing or ``None`` attributes become ``""``; ``items`` is space-joined only when it is a list.
+    """
+    children = getattr(item, "items", None)
+    joined = " ".join(map(str, children)) if isinstance(children, list) else ""
+    attrs = ("kind", "name", "source", "alias", "text", "signature")
+    return (
+        _alef_e2e_text(item),
+        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attrs),
+        joined,
+    )
+
+
+@pytest.mark.asyncio
+async def test_ocr_image_png() -> None:
+    """OCR: extract text from PNG bytes and expect one of the "hello"/"world" spellings in it."""
+    expected_mime = "image/png"
+    payload = Path("images/test_hello_world.png").read_bytes()
+
+    # Fixture note: in WASM this case exercises the Uint8Array/Promise path of the OcrBackend bridge.
+    outcome = await extract_bytes(payload, expected_mime, ExtractionConfig())
+    assert outcome.mime_type.strip() == expected_mime  # noqa: S101
+    extracted = outcome.content
+    assert len(extracted) >= 1  # noqa: S101
+    assert any(word in extracted for word in ("Hello", "World", "hello", "world"))  # noqa: S101
+
+
+@pytest.mark.asyncio
+async def test_smoke_docx_basic() -> None:
+    """Smoke test: a DOCX with formatted text keeps its MIME type and yields a known word."""
+    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    source = "docx/fake.docx"
+
+    outcome = await extract_file(source, docx_mime, ExtractionConfig())
+    assert outcome.mime_type.strip() == docx_mime  # noqa: S101
+    extracted = outcome.content
+    assert len(extracted) >= 20  # noqa: S101
+    # Any single marker word appearing in the content is enough.
+    assert any(word in extracted for word in ("Lorem", "ipsum", "document", "text"))  # noqa: S101
+
+
+@pytest.mark.asyncio
+async def test_smoke_html_basic() -> None:
+    """Smoke test: an HTML table page keeps its MIME type and yields a known table string."""
+    html_mime = "text/html"
+    source = "html/simple_table.html"
+
+    outcome = await extract_file(source, html_mime, ExtractionConfig())
+    assert outcome.mime_type.strip() == html_mime  # noqa: S101
+    extracted = outcome.content
+    assert len(extracted) >= 10  # noqa: S101
+    # One matching marker string is sufficient; they are not all required.
+    assert any(marker in extracted for marker in ("Sample Data Table", "Laptop", "Electronics", "Product"))  # noqa: S101
+
+
+@pytest.mark.asyncio
+async def test_smoke_image_png() -> None:
+    """Smoke test: a PNG extracted with OCR disabled still reports the image/png MIME type."""
+    no_ocr = ExtractionConfig(disable_ocr=True)
+
+    # The MIME type argument is None here, so the assertion checks what the result reports back.
+    outcome = await extract_file("images/sample.png", None, no_ocr)
+    reported_mime = outcome.mime_type.strip()
+    assert reported_mime == "image/png"  # noqa: S101
+
+
+@pytest.mark.asyncio
+async def test_smoke_json_basic() -> None:
+    """Smoke test: a JSON file keeps its MIME type and yields content of length 5 or more."""
+    json_mime = "application/json"
+    source = "json/simple.json"
+
+    outcome = await extract_file(source, json_mime, ExtractionConfig())
+    reported_mime = outcome.mime_type.strip()
+    assert reported_mime == json_mime  # noqa: S101
+    # Only a minimum content length is asserted for this fixture.
+    assert len(outcome.content) >= 5  # noqa: S101
+
+
+@pytest.mark.asyncio
+async def test_smoke_pdf_basic() -> None:
+    """Smoke test: a simple text PDF keeps its MIME type and yields one of two known phrases."""
+    pdf_mime = "application/pdf"
+    source = "pdf/fake_memo.pdf"
+
+    outcome = await extract_file(source, pdf_mime, ExtractionConfig())
+    assert outcome.mime_type.strip() == pdf_mime  # noqa: S101
+    extracted = outcome.content
+    assert len(extracted) >= 50  # noqa: S101
+    # Either phrase appearing in the content satisfies the check.
+    assert any(phrase in extracted for phrase in ("May 5, 2023", "To Whom it May Concern"))  # noqa: S101
+
+
+@pytest.mark.asyncio
+async def test_smoke_txt_basic() -> None:
+    """Smoke test: a plain text file keeps its MIME type and yields content of length 5 or more."""
+    text_mime = "text/plain"
+    source = "text/report.txt"
+
+    outcome = await extract_file(source, text_mime, ExtractionConfig())
+    reported_mime = outcome.mime_type.strip()
+    assert reported_mime == text_mime  # noqa: S101
+    # Only a minimum content length is asserted for this fixture.
+    assert len(outcome.content) >= 5  # noqa: S101
+
+
+@pytest.mark.asyncio
+async def test_smoke_xlsx_basic() -> None:
+    """Smoke test: an XLSX workbook keeps its MIME type and yields every expected text fragment."""
+    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    source = "xlsx/stanley_cups.xlsx"
+
+    outcome = await extract_file(source, xlsx_mime, ExtractionConfig())
+    assert outcome.mime_type.strip() == xlsx_mime  # noqa: S101
+    extracted = outcome.content
+    assert len(extracted) >= 100  # noqa: S101
+
+    # Each of these fragments must appear somewhere in the extracted content.
+    required_fragments = (
+        "Team", "Location", "Stanley Cups",
+        "Blues", "Flyers", "Maple Leafs",
+        "STL", "PHI", "TOR",
+    )
+    for fragment in required_fragments:
+        assert fragment in extracted  # noqa: S101
+    # The generator emitted no assertions for the fields listed below:
+    # skipped: field 'tables' not available on result type
+    # skipped: field 'metadata.format.excel.sheet_count' not available on result type
+    # skipped: field 'metadata.format.excel.sheet_names' not available on result type