This commit is contained in:
1
e2e/python/tests/__init__.py
generated
Normal file
1
e2e/python/tests/__init__.py
generated
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
64
e2e/python/tests/test_async.py
generated
Normal file
64
e2e/python/tests/test_async.py
generated
Normal file
@@ -0,0 +1,64 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: async."""
|
||||
|
||||
from pathlib import Path
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import extract_bytes, ExtractionConfig
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual forms of ``item`` as a tuple of strings.

    The tuple holds, in order: the text of ``item`` itself; the text of its
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes (a missing or ``None`` attribute yields ``""``); and the
    space-joined ``str`` of every element of its ``items`` attribute when that
    attribute is a ``list`` (``""`` otherwise).
    """
    attribute_names = ("kind", "name", "source", "alias", "text", "signature")

    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        joined = " ".join(map(str, nested))
    else:
        joined = ""

    texts = [_alef_e2e_text(item)]
    texts.extend(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names)
    texts.append(joined)
    return tuple(texts)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_async_extract_bytes() -> None:
    """Run the async ``extract_bytes`` API over the bytes of ``pdf/fake_memo.pdf``."""
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    extraction = await extract_bytes(pdf_bytes, "application/pdf", None)

    # The reported MIME type is compared after stripping surrounding whitespace.
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 50  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_async_extract_bytes_empty_mime() -> None:
    """Async ``extract_bytes`` must raise when the MIME type is an empty string."""
    # Arrange outside the ``raises`` block: a missing input file or a failing
    # config constructor must fail the test rather than satisfy the expectation.
    content = Path("text/plain.txt").read_bytes()
    mime_type = ""
    config = ExtractionConfig()

    # Only the call under test is allowed to raise.
    with pytest.raises(Exception):  # noqa: B017
        await extract_bytes(content, mime_type, config)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_async_extract_bytes_invalid_mime() -> None:
    """Async ``extract_bytes`` must raise for the MIME type ``application/x-nonexistent``."""
    # Arrange outside the ``raises`` block: a missing input file or a failing
    # config constructor must fail the test rather than satisfy the expectation.
    content = Path("text/plain.txt").read_bytes()
    mime_type = "application/x-nonexistent"
    config = ExtractionConfig()

    # Only the call under test is allowed to raise.
    with pytest.raises(Exception):  # noqa: B017
        await extract_bytes(content, mime_type, config)
|
||||
109
e2e/python/tests/test_batch.py
generated
Normal file
109
e2e/python/tests/test_batch.py
generated
Normal file
@@ -0,0 +1,109 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: batch."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import batch_extract_bytes_sync, batch_extract_bytes, batch_extract_files, batch_extract_files_sync, BatchBytesItem, BatchFileItem
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual forms of ``item`` as a tuple of strings.

    The tuple holds, in order: the text of ``item`` itself; the text of its
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes (a missing or ``None`` attribute yields ``""``); and the
    space-joined ``str`` of every element of its ``items`` attribute when that
    attribute is a ``list`` (``""`` otherwise).
    """
    attribute_names = ("kind", "name", "source", "alias", "text", "signature")

    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        joined = " ".join(map(str, nested))
    else:
        joined = ""

    texts = [_alef_e2e_text(item)]
    texts.extend(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names)
    texts.append(joined)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_batch_bytes_invalid_mime() -> None:
    """Call ``batch_extract_bytes_sync`` with one item typed ``application/x-nonexistent``.

    The return value is discarded; the test only requires the call to complete.
    """
    payload = bytes([72, 101, 108, 108, 111])  # ASCII "Hello"
    batch = [BatchBytesItem(content=payload, mime_type="application/x-nonexistent")]

    batch_extract_bytes_sync(batch, None)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_batch_extract_bytes_happy() -> None:
    """Await ``batch_extract_bytes`` on a plain-text item and an HTML item."""
    # ASCII "Hello, world!"
    plain_payload = bytes([72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33])
    # ASCII "<html><body>Test</body></html>"
    html_payload = bytes(
        [60, 104, 116, 109, 108, 62, 60, 98, 111, 100, 121, 62, 84, 101, 115, 116]
        + [60, 47, 98, 111, 100, 121, 62, 60, 47, 104, 116, 109, 108, 62]
    )
    batch = [
        BatchBytesItem(content=plain_payload, mime_type="text/plain"),
        BatchBytesItem(content=html_payload, mime_type="text/html"),
    ]

    outcomes = await batch_extract_bytes(batch, None)

    assert len(outcomes) >= 1  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_batch_extract_bytes_mixed_format() -> None:
    """Await ``batch_extract_bytes`` on one item typed ``application/x-unknown``.

    The return value is discarded; the test only requires the call to complete.
    """
    # ASCII "PDF placeholder"
    payload = bytes([80, 68, 70, 32, 112, 108, 97, 99, 101, 104, 111, 108, 100, 101, 114])
    batch = [BatchBytesItem(content=payload, mime_type="application/x-unknown")]

    await batch_extract_bytes(batch, None)
|
||||
|
||||
|
||||
def test_batch_extract_bytes_sync_empty_list() -> None:
    """``batch_extract_bytes_sync`` on an empty batch must return zero results."""
    outcomes = batch_extract_bytes_sync([], None)

    assert len(outcomes) == 0  # noqa: S101
|
||||
|
||||
|
||||
def test_batch_extract_bytes_sync_invalid_mime() -> None:
    """Call ``batch_extract_bytes_sync`` with one item typed ``application/x-unknown``.

    The return value is discarded; the test only requires the call to complete.
    """
    payload = bytes([100, 97, 116, 97])  # ASCII "data"
    batch = [BatchBytesItem(content=payload, mime_type="application/x-unknown")]

    batch_extract_bytes_sync(batch, None)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_batch_file_async_basic() -> None:
    """Await ``batch_extract_files`` on a PDF path and a text path.

    The return value is discarded; the test only requires the call to complete.
    """
    batch = [
        BatchFileItem(path="pdf/fake_memo.pdf"),
        BatchFileItem(path="text/fake_text.txt"),
    ]

    await batch_extract_files(batch, None)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_batch_file_async_not_found() -> None:
    """Await ``batch_extract_files`` on the path ``/nonexistent/a.pdf``.

    The return value is discarded; the test only requires the call to complete.
    """
    batch = [BatchFileItem(path="/nonexistent/a.pdf")]

    await batch_extract_files(batch, None)
|
||||
|
||||
|
||||
def test_batch_file_not_found() -> None:
    """Call ``batch_extract_files_sync`` on two paths under ``/nonexistent``.

    The return value is discarded; the test only requires the call to complete.
    """
    batch = [
        BatchFileItem(path="/nonexistent/a.pdf"),
        BatchFileItem(path="/nonexistent/b.txt"),
    ]

    batch_extract_files_sync(batch, None)
|
||||
|
||||
|
||||
def test_batch_file_partial() -> None:
    """Call ``batch_extract_files_sync`` on ``text/plain.txt`` plus a ``/nonexistent`` path.

    The return value is discarded; the test only requires the call to complete.
    """
    batch = [
        BatchFileItem(path="text/plain.txt"),
        BatchFileItem(path="/nonexistent/missing.pdf"),
    ]

    batch_extract_files_sync(batch, None)
|
||||
|
||||
|
||||
def test_batch_file_sync_basic() -> None:
    """Call ``batch_extract_files_sync`` on a PDF path and a text path.

    The return value is discarded; the test only requires the call to complete.
    """
    batch = [
        BatchFileItem(path="pdf/fake_memo.pdf"),
        BatchFileItem(path="text/fake_text.txt"),
    ]

    batch_extract_files_sync(batch, None)
|
||||
41
e2e/python/tests/test_code.py
generated
Normal file
41
e2e/python/tests/test_code.py
generated
Normal file
@@ -0,0 +1,41 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: code."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import extract_file_sync
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual forms of ``item`` as a tuple of strings.

    The tuple holds, in order: the text of ``item`` itself; the text of its
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes (a missing or ``None`` attribute yields ``""``); and the
    space-joined ``str`` of every element of its ``items`` attribute when that
    attribute is a ``list`` (``""`` otherwise).
    """
    attribute_names = ("kind", "name", "source", "alias", "text", "signature")

    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        joined = " ".join(map(str, nested))
    else:
        joined = ""

    texts = [_alef_e2e_text(item)]
    texts.extend(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names)
    texts.append(joined)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_code_shebang_detection() -> None:
    """Extract ``code/script.sh`` as ``text/x-source-code`` and check its content.

    NOTE(review): the generated description mentioned "bytes input", but the body
    calls the path-based ``extract_file_sync`` — confirm which is intended.
    """
    extraction = extract_file_sync("code/script.sh", "text/x-source-code", None)

    assert extraction.mime_type.strip() == "text/x-source-code"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # Both substrings must appear in the extracted content.
    assert "build" in extraction.content  # noqa: S101
    assert "clean" in extraction.content  # noqa: S101
|
||||
227
e2e/python/tests/test_contract.py
generated
Normal file
227
e2e/python/tests/test_contract.py
generated
Normal file
@@ -0,0 +1,227 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: contract."""
|
||||
|
||||
from pathlib import Path
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import extract_file, extract_file_sync, extract_bytes_sync, ExtractionConfig
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual forms of ``item`` as a tuple of strings.

    The tuple holds, in order: the text of ``item`` itself; the text of its
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes (a missing or ``None`` attribute yields ``""``); and the
    space-joined ``str`` of every element of its ``items`` attribute when that
    attribute is a ``list`` (``""`` otherwise).
    """
    attribute_names = ("kind", "name", "source", "alias", "text", "signature")

    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        joined = " ".join(map(str, nested))
    else:
        joined = ""

    texts = [_alef_e2e_text(item)]
    texts.extend(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names)
    texts.append(joined)
    return tuple(texts)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_api_batch_bytes_async() -> None:
    """Await ``extract_file`` on ``pdf/fake_memo.pdf`` and check MIME type and content.

    NOTE(review): the test name refers to batch bytes extraction, but the body
    calls ``extract_file`` on a single path — confirm this is intended.
    """
    extraction = await extract_file("pdf/fake_memo.pdf", None, None)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # At least one of these snippets must occur in the extracted content.
    expected_snippets = ("May 5, 2023", "Mallori")
    assert any(snippet in extraction.content for snippet in expected_snippets)  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_api_batch_bytes_with_configs_async() -> None:
    """Await ``extract_file`` on ``pdf/fake_memo.pdf`` with ``output_format="markdown"``.

    NOTE(review): the test name refers to batch bytes extraction with per-file
    configs, but the body calls ``extract_file`` on a single path — confirm.
    """
    markdown_config = ExtractionConfig(output_format="markdown")

    extraction = await extract_file("pdf/fake_memo.pdf", None, markdown_config)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'metadata.output_format' not available on result type
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_api_batch_file_async() -> None:
    """Await ``extract_file`` on ``pdf/fake_memo.pdf`` and check MIME type and content.

    NOTE(review): the test name refers to batch file extraction, but the body
    calls ``extract_file`` on a single path — confirm this is intended.
    """
    extraction = await extract_file("pdf/fake_memo.pdf", None, None)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # At least one of these snippets must occur in the extracted content.
    expected_snippets = ("May 5, 2023", "Mallori")
    assert any(snippet in extraction.content for snippet in expected_snippets)  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_api_batch_file_with_configs_async() -> None:
    """Await ``extract_file`` on ``pdf/fake_memo.pdf`` with ``output_format="markdown"``.

    NOTE(review): the test name refers to batch file extraction with per-file
    configs, but the body calls ``extract_file`` on a single path — confirm.
    """
    markdown_config = ExtractionConfig(output_format="markdown")

    extraction = await extract_file("pdf/fake_memo.pdf", None, markdown_config)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'metadata.output_format' not available on result type
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_api_extract_bytes_async() -> None:
    """Await ``extract_file`` on ``pdf/fake_memo.pdf`` and check MIME type and content.

    NOTE(review): the test name refers to bytes extraction, but the body calls
    the path-based ``extract_file`` — confirm this is intended.
    """
    extraction = await extract_file("pdf/fake_memo.pdf", None, None)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # At least one of these snippets must occur in the extracted content.
    expected_snippets = ("May 5, 2023", "Mallori")
    assert any(snippet in extraction.content for snippet in expected_snippets)  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_api_extract_file_async() -> None:
    """Await ``extract_file`` on ``pdf/fake_memo.pdf`` and check MIME type and content."""
    extraction = await extract_file("pdf/fake_memo.pdf", None, None)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # At least one of these snippets must occur in the extracted content.
    expected_snippets = ("May 5, 2023", "Mallori")
    assert any(snippet in extraction.content for snippet in expected_snippets)  # noqa: S101
|
||||
|
||||
|
||||
def test_config_chunking_prepend_heading_context() -> None:
    """Extract a markdown file with the markdown chunker and check heading context.

    Every chunk must have truthy content and metadata whose ``heading_context``
    is not ``None``; the first chunk must exist and carry a truthy
    ``heading_context``.
    """
    chunking_options = {
        "chunker_type": "markdown",
        "max_chars": 300,
        "max_overlap": 50,
        "prepend_heading_context": True,
    }
    config = ExtractionConfig(chunking=chunking_options)

    result = extract_file_sync("markdown/extraction_test.md", None, config)

    assert len(result.content) >= 10  # noqa: S101
    # skipped: field 'chunks' not available on result type
    # NOTE(review): the generated "skipped" remark above contradicts the
    # assertions below, which do read ``result.chunks`` — confirm which is right.
    assert all(chunk.content for chunk in (result.chunks or []))  # noqa: S101
    assert all(  # noqa: S101
        chunk.metadata and chunk.metadata.heading_context is not None
        for chunk in (result.chunks or [])
    )
    # Fails when there are no chunks at all, so the checks above cannot pass vacuously.
    assert bool(  # noqa: S101
        (result.chunks or [])
        and (result.chunks[0].metadata and result.chunks[0].metadata.heading_context)
    )
|
||||
|
||||
|
||||
def test_config_document_structure_with_headings() -> None:
    """Extract ``docx/fake.docx`` with ``include_document_structure=True``; check the MIME type."""
    structure_config = ExtractionConfig(include_document_structure=True)

    extraction = extract_file_sync("docx/fake.docx", None, structure_config)

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    assert extraction.mime_type.strip() == docx_mime  # noqa: S101
    # skipped: field 'document' not available on result type
    # skipped: field 'document.nodes' not available on result type
|
||||
|
||||
|
||||
def test_config_element_types() -> None:
    """Extract a DOCX file with ``result_format="element_based"``; check the MIME type."""
    element_config = ExtractionConfig(result_format="element_based")

    extraction = extract_file_sync("docx/unit_test_headers.docx", None, element_config)

    # Substring match: the reported MIME type must contain the DOCX MIME string.
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    assert docx_mime in extraction.mime_type  # noqa: S101
    # skipped: field 'elements' not available on result type
|
||||
|
||||
|
||||
def test_config_extraction_timeout() -> None:
    """Extract ``pdf/fake_memo.pdf`` with ``extraction_timeout_secs=300`` set on the config."""
    timeout_config = ExtractionConfig(extraction_timeout_secs=300)

    extraction = extract_file_sync("pdf/fake_memo.pdf", None, timeout_config)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
|
||||
|
||||
|
||||
def test_config_keywords() -> None:
    """Extract ``pdf/fake_memo.pdf`` with a ``yake`` keyword config capped at 10 keywords."""
    keyword_options = {"algorithm": "yake", "max_keywords": 10}
    keyword_config = ExtractionConfig(keywords=keyword_options)

    extraction = extract_file_sync("pdf/fake_memo.pdf", None, keyword_config)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'keywords' not available on Python ExtractionResult
    # skipped: field 'keywords' not available on Python ExtractionResult
|
||||
|
||||
|
||||
def test_config_pages() -> None:
    """Extract ``pdf/fake_memo.pdf`` with page extraction and page markers enabled."""
    page_options = {"extract_pages": True, "insert_page_markers": True}
    page_config = ExtractionConfig(pages=page_options)

    extraction = extract_file_sync("pdf/fake_memo.pdf", None, page_config)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # The substring "PAGE" must occur in the extracted content.
    assert "PAGE" in extraction.content  # noqa: S101
|
||||
|
||||
|
||||
def test_config_quality_enabled() -> None:
    """Extract ``pdf/fake_memo.pdf`` with ``enable_quality_processing=True``.

    Only the MIME type and content length are asserted; the quality-score checks
    are recorded as skipped below.
    """
    quality_config = ExtractionConfig(enable_quality_processing=True)

    extraction = extract_file_sync("pdf/fake_memo.pdf", None, quality_config)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
|
||||
|
||||
|
||||
def test_config_security_limits() -> None:
    """Extract ``archives/documents.zip`` with explicit ``security_limits`` on the config."""
    limits = {
        "max_archive_size": 104857600,
        "max_compression_ratio": 50,
        "max_files_in_archive": 100,
    }
    limits_config = ExtractionConfig(security_limits=limits)

    extraction = extract_file_sync("archives/documents.zip", None, limits_config)

    # Substring match against either accepted ZIP MIME string.
    zip_mimes = ("application/zip", "application/x-zip-compressed")
    assert any(mime in extraction.mime_type for mime in zip_mimes)  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
|
||||
|
||||
|
||||
def test_config_tree_sitter() -> None:
    """Extract ``code/hello.py`` with an explicit ``tree_sitter`` configuration."""
    tree_sitter_options = {
        "groups": ["web"],
        "languages": ["python", "rust"],
        "process": {
            "comments": False,
            "diagnostics": False,
            "docstrings": False,
            "exports": True,
            "imports": True,
            "structure": True,
            "symbols": False,
        },
    }
    tree_sitter_config = ExtractionConfig(tree_sitter=tree_sitter_options)

    extraction = extract_file_sync("code/hello.py", None, tree_sitter_config)

    assert extraction.mime_type.strip() == "text/x-source-code"  # noqa: S101
    assert len(extraction.content) >= 5  # noqa: S101
|
||||
|
||||
|
||||
def test_output_format_bytes_markdown() -> None:
    """Run ``extract_bytes_sync`` on the memo PDF bytes with ``output_format="markdown"``."""
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    markdown_config = ExtractionConfig(output_format="markdown")

    extraction = extract_bytes_sync(pdf_bytes, "application/pdf", markdown_config)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'metadata.output_format' not available on result type
|
||||
|
||||
|
||||
def test_output_format_markdown() -> None:
    """Run ``extract_file_sync`` on ``pdf/fake_memo.pdf`` with ``output_format="markdown"``."""
    markdown_config = ExtractionConfig(output_format="markdown")

    extraction = extract_file_sync("pdf/fake_memo.pdf", None, markdown_config)

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'metadata.output_format' not available on result type
|
||||
58
e2e/python/tests/test_detection.py
generated
Normal file
58
e2e/python/tests/test_detection.py
generated
Normal file
@@ -0,0 +1,58 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: detection."""
|
||||
|
||||
from pathlib import Path
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import detect_mime_type_from_bytes, get_extensions_for_mime
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual forms of ``item`` as a tuple of strings.

    The tuple holds, in order: the text of ``item`` itself; the text of its
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes (a missing or ``None`` attribute yields ``""``); and the
    space-joined ``str`` of every element of its ``items`` attribute when that
    attribute is a ``list`` (``""`` otherwise).
    """
    attribute_names = ("kind", "name", "source", "alias", "text", "signature")

    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        joined = " ".join(map(str, nested))
    else:
        joined = ""

    texts = [_alef_e2e_text(item)]
    texts.extend(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names)
    texts.append(joined)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_detect_mime_bytes_html() -> None:
    """Call ``detect_mime_type_from_bytes`` on the bytes of ``html/html.html``.

    The detected value is discarded; the test only requires the call to complete.
    """
    html_bytes = Path("html/html.html").read_bytes()

    detect_mime_type_from_bytes(html_bytes)
|
||||
|
||||
|
||||
def test_detect_mime_bytes_pdf() -> None:
    """Call ``detect_mime_type_from_bytes`` on the bytes of ``pdf/fake_memo.pdf``.

    The detected value is discarded; the test only requires the call to complete.
    """
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()

    detect_mime_type_from_bytes(pdf_bytes)
|
||||
|
||||
|
||||
def test_detect_mime_bytes_png() -> None:
    """Call ``detect_mime_type_from_bytes`` on the bytes of ``images/test_hello_world.png``.

    The detected value is discarded; the test only requires the call to complete.
    """
    png_bytes = Path("images/test_hello_world.png").read_bytes()

    detect_mime_type_from_bytes(png_bytes)
|
||||
|
||||
|
||||
def test_get_extensions_unknown_mime() -> None:
    """``get_extensions_for_mime`` must raise for ``application/x-totally-unknown``."""
    unknown_mime = "application/x-totally-unknown"

    with pytest.raises(Exception):  # noqa: B017
        get_extensions_for_mime(unknown_mime)
|
||||
39
e2e/python/tests/test_document_extractor_management.py
generated
Normal file
39
e2e/python/tests/test_document_extractor_management.py
generated
Normal file
@@ -0,0 +1,39 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: document_extractor_management."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import clear_document_extractors, list_document_extractors
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual forms of ``item`` as a tuple of strings.

    The tuple holds, in order: the text of ``item`` itself; the text of its
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes (a missing or ``None`` attribute yields ``""``); and the
    space-joined ``str`` of every element of its ``items`` attribute when that
    attribute is a ``list`` (``""`` otherwise).
    """
    attribute_names = ("kind", "name", "source", "alias", "text", "signature")

    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        joined = " ".join(map(str, nested))
    else:
        joined = ""

    texts = [_alef_e2e_text(item)]
    texts.extend(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names)
    texts.append(joined)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_document_extractors_clear() -> None:
    """Call ``clear_document_extractors`` and require that it completes.

    NOTE(review): the generated description also promised to verify that the
    extractor list is empty afterwards, but no such assertion exists — confirm.
    """
    clear_document_extractors()
|
||||
|
||||
|
||||
def test_extractors_list() -> None:
    """Call ``list_document_extractors`` and require that it completes.

    The returned value is discarded.
    """
    list_document_extractors()
|
||||
59
e2e/python/tests/test_embed_async_pending.py
generated
Normal file
59
e2e/python/tests/test_embed_async_pending.py
generated
Normal file
@@ -0,0 +1,59 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: embed_async_pending."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import embed_texts_async, ExtractionConfig
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual forms of ``item`` as a tuple of strings.

    The tuple holds, in order: the text of ``item`` itself; the text of its
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes (a missing or ``None`` attribute yields ``""``); and the
    space-joined ``str`` of every element of its ``items`` attribute when that
    attribute is a ``list`` (``""`` otherwise).
    """
    attribute_names = ("kind", "name", "source", "alias", "text", "signature")

    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        joined = " ".join(map(str, nested))
    else:
        joined = ""

    texts = [_alef_e2e_text(item)]
    texts.extend(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names)
    texts.append(joined)
    return tuple(texts)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_embed_texts_async_empty_input() -> None:
    """``embed_texts_async`` on an empty text list must return zero results."""
    embeddings = await embed_texts_async([], None)

    assert len(embeddings) == 0  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_embed_texts_async_happy() -> None:
    """``embed_texts_async`` on two texts must return at least two results."""
    inputs = ["First", "Second"]

    embeddings = await embed_texts_async(inputs, None)

    assert len(embeddings) >= 2  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_embed_texts_async_preset_switch() -> None:
    """Await ``embed_texts_async`` with a config selecting the ``balanced`` preset.

    The return value is discarded; the test only requires the call to complete.
    """
    preset_model = {"name": "balanced", "type": "preset"}
    preset_config = ExtractionConfig(model=preset_model)

    await embed_texts_async(["Text"], preset_config)
|
||||
37
e2e/python/tests/test_embed_extra.py
generated
Normal file
37
e2e/python/tests/test_embed_extra.py
generated
Normal file
@@ -0,0 +1,37 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: embed_extra."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import embed_texts, ExtractionConfig
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual forms of ``item`` as a tuple of strings.

    The tuple holds, in order: the text of ``item`` itself; the text of its
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes (a missing or ``None`` attribute yields ``""``); and the
    space-joined ``str`` of every element of its ``items`` attribute when that
    attribute is a ``list`` (``""`` otherwise).
    """
    attribute_names = ("kind", "name", "source", "alias", "text", "signature")

    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        joined = " ".join(map(str, nested))
    else:
        joined = ""

    texts = [_alef_e2e_text(item)]
    texts.extend(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names)
    texts.append(joined)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_embed_texts_batch() -> None:
    """Call ``embed_texts`` on two texts with a config selecting the ``balanced`` preset.

    The return value is discarded; the test only requires the call to complete.
    """
    inputs = ["Hello", "World"]
    preset_model = {"name": "balanced", "type": "preset"}
    preset_config = ExtractionConfig(model=preset_model)

    embed_texts(inputs, preset_config)
|
||||
39
e2e/python/tests/test_embedding_backend_management.py
generated
Normal file
39
e2e/python/tests/test_embedding_backend_management.py
generated
Normal file
@@ -0,0 +1,39 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: embedding_backend_management."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import clear_embedding_backends, list_embedding_backends
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual forms of ``item`` as a tuple of strings.

    The tuple holds, in order: the text of ``item`` itself; the text of its
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes (a missing or ``None`` attribute yields ``""``); and the
    space-joined ``str`` of every element of its ``items`` attribute when that
    attribute is a ``list`` (``""`` otherwise).
    """
    attribute_names = ("kind", "name", "source", "alias", "text", "signature")

    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        joined = " ".join(map(str, nested))
    else:
        joined = ""

    texts = [_alef_e2e_text(item)]
    texts.extend(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names)
    texts.append(joined)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_embedding_backends_clear() -> None:
    """Call ``clear_embedding_backends`` and require that it completes.

    NOTE(review): the generated description also promised to verify that the
    backend list is empty afterwards, but no such assertion exists — confirm.
    """
    clear_embedding_backends()
|
||||
|
||||
|
||||
def test_embedding_backends_list() -> None:
    """Call ``list_embedding_backends`` and require that it completes.

    The returned value is discarded.
    """
    list_embedding_backends()
|
||||
66
e2e/python/tests/test_embeddings.py
generated
Normal file
66
e2e/python/tests/test_embeddings.py
generated
Normal file
@@ -0,0 +1,66 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: embeddings."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import embed_texts, get_embedding_preset, list_embedding_presets, ExtractionConfig
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual forms of ``item`` as a tuple of strings.

    The tuple holds, in order: the text of ``item`` itself; the text of its
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes (a missing or ``None`` attribute yields ``""``); and the
    space-joined ``str`` of every element of its ``items`` attribute when that
    attribute is a ``list`` (``""`` otherwise).
    """
    attribute_names = ("kind", "name", "source", "alias", "text", "signature")

    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        joined = " ".join(map(str, nested))
    else:
        joined = ""

    texts = [_alef_e2e_text(item)]
    texts.extend(_alef_e2e_text(getattr(item, attr, None)) for attr in attribute_names)
    texts.append(joined)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_embed_texts_different_preset() -> None:
    """``embed_texts`` with the ``multilingual`` preset must return at least two results."""
    inputs = ["Hello world", "Test"]
    preset_model = {"name": "multilingual", "type": "preset"}
    preset_config = ExtractionConfig(model=preset_model)

    embeddings = embed_texts(inputs, preset_config)

    assert len(embeddings) >= 2  # noqa: S101
|
||||
|
||||
|
||||
def test_get_embedding_preset_known() -> None:
    """Call ``get_embedding_preset`` with the name ``balanced``.

    The returned value is discarded; the test only requires the call to complete.
    """
    get_embedding_preset("balanced")
|
||||
|
||||
|
||||
def test_get_embedding_preset_nominal() -> None:
    """Call ``get_embedding_preset`` with the name ``balanced`` (nominal case).

    The returned value is discarded; the test only requires the call to complete.
    """
    get_embedding_preset("balanced")
|
||||
|
||||
|
||||
def test_get_embedding_preset_unknown() -> None:
    """``get_embedding_preset`` must return a falsy value for ``nonexistent-xyz``."""
    preset = get_embedding_preset("nonexistent-xyz")

    assert not preset  # noqa: S101
|
||||
|
||||
|
||||
def test_list_embedding_presets_sanity() -> None:
    """list_embedding_presets returns a truthy (non-empty) result."""
    presets = list_embedding_presets()
    assert presets  # noqa: S101
|
||||
75
e2e/python/tests/test_error.py
generated
Normal file
75
e2e/python/tests/test_error.py
generated
Normal file
@@ -0,0 +1,75 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: error."""
|
||||
|
||||
from pathlib import Path
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import extract_bytes_sync, ExtractionConfig
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_error_empty_bytes() -> None:
    """Extracting the bytes of ``text/empty.txt`` as text/plain must not raise."""
    empty_payload = Path("text/empty.txt").read_bytes()
    # The result is discarded: finishing without an exception is the check.
    extract_bytes_sync(empty_payload, "text/plain", ExtractionConfig())
|
||||
|
||||
|
||||
def test_error_empty_mime() -> None:
    """Extraction with an empty MIME type string must raise.

    The fixture is read and the default config is built *before* entering
    ``pytest.raises`` so that a missing fixture file (``FileNotFoundError``)
    or a failing default config makes the test fail instead of being counted
    as the expected extraction error.
    """
    content = Path("text/plain.txt").read_bytes()
    mime_type = ""
    config = ExtractionConfig()

    # Only the call under test may satisfy the expectation.
    with pytest.raises(Exception):  # noqa: B017
        extract_bytes_sync(content, mime_type, config)
|
||||
|
||||
|
||||
def test_error_extract_bytes_conflicting_ocr() -> None:
    """Extraction with both ``disable_ocr`` and ``force_ocr`` set must raise.

    The fixture is read before entering ``pytest.raises`` so that a missing
    fixture file (``FileNotFoundError``) fails the test instead of being
    counted as the expected error.
    """
    content = Path("text/fake_text.txt").read_bytes()
    mime_type = "text/plain"

    with pytest.raises(Exception):  # noqa: B017
        # NOTE(review): the config stays inside the context because the
        # conflicting flags may be rejected at construction time rather than
        # at extraction time - confirm against the binding.
        config = ExtractionConfig(disable_ocr=True, force_ocr=True)
        extract_bytes_sync(content, mime_type, config)
|
||||
|
||||
|
||||
def test_error_invalid_mime_format() -> None:
    """Extraction with the malformed MIME type ``"not-a-mime"`` must raise.

    The fixture is read and the default config is built *before* entering
    ``pytest.raises`` so that a missing fixture file (``FileNotFoundError``)
    or a failing default config makes the test fail instead of being counted
    as the expected extraction error.
    """
    content = Path("text/plain.txt").read_bytes()
    mime_type = "not-a-mime"
    config = ExtractionConfig()

    # Only the call under test may satisfy the expectation.
    with pytest.raises(Exception):  # noqa: B017
        extract_bytes_sync(content, mime_type, config)
|
||||
|
||||
|
||||
def test_error_unsupported_mime() -> None:
    """Extraction with the MIME type ``application/x-nonexistent`` must raise.

    The fixture is read and the default config is built *before* entering
    ``pytest.raises`` so that a missing fixture file (``FileNotFoundError``)
    or a failing default config makes the test fail instead of being counted
    as the expected extraction error.
    """
    content = Path("text/plain.txt").read_bytes()
    mime_type = "application/x-nonexistent"
    config = ExtractionConfig()

    # Only the call under test may satisfy the expectation.
    with pytest.raises(Exception):  # noqa: B017
        extract_bytes_sync(content, mime_type, config)
|
||||
75
e2e/python/tests/test_format_specific.py
generated
Normal file
75
e2e/python/tests/test_format_specific.py
generated
Normal file
@@ -0,0 +1,75 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: format_specific."""
|
||||
|
||||
from pathlib import Path
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import extract_bytes_sync, extract_file_sync
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_format_docx_standalone() -> None:
    """DOCX bytes passed to extract_bytes_sync yield content of length >= 20."""
    docx_bytes = Path("docx/fake.docx").read_bytes()
    extraction = extract_bytes_sync(
        docx_bytes,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        None,  # no extraction config supplied
    )
    assert len(extraction.content) >= 20  # noqa: S101
|
||||
|
||||
|
||||
def test_format_hwpx_standalone() -> None:
    """HWPX bytes passed to extract_bytes_sync yield the expected greeting text."""
    hwpx_bytes = Path("hwpx/simple.hwpx").read_bytes()
    extraction = extract_bytes_sync(hwpx_bytes, "application/haansofthwpx", None)
    assert len(extraction.content) >= 20  # noqa: S101
    assert "Hello from HWPX" in extraction.content  # noqa: S101
|
||||
|
||||
|
||||
def test_format_pdf_text() -> None:
    """PDF bytes passed to extract_bytes_sync yield text with a known keyword."""
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    extraction = extract_bytes_sync(pdf_bytes, "application/pdf", None)
    assert len(extraction.content) >= 50  # noqa: S101
    # At least one of the keywords must appear in the extracted content.
    assert any(keyword in extraction.content for keyword in ("Mallori", "May"))  # noqa: S101
|
||||
|
||||
|
||||
def test_format_pptx() -> None:
    """extract_file_sync handles the PPTX fixture without raising."""
    pptx_mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    # The result is discarded: finishing without an exception is the check.
    extract_file_sync("pptx/simple.pptx", pptx_mime, None)
|
||||
|
||||
|
||||
def test_format_xlsx() -> None:
    """extract_file_sync handles the XLSX fixture without raising."""
    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    # The result is discarded: finishing without an exception is the check.
    extract_file_sync("xlsx/stanley_cups.xlsx", xlsx_mime, None)
|
||||
54
e2e/python/tests/test_mime_utilities.py
generated
Normal file
54
e2e/python/tests/test_mime_utilities.py
generated
Normal file
@@ -0,0 +1,54 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: mime_utilities."""
|
||||
|
||||
from pathlib import Path
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import detect_mime_type_from_bytes, get_extensions_for_mime
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_mime_detect_bytes() -> None:
    """The MIME type detected from the PDF fixture's bytes contains "pdf"."""
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    detected = detect_mime_type_from_bytes(pdf_bytes)
    assert "pdf" in detected  # noqa: S101
|
||||
|
||||
|
||||
def test_mime_detect_image() -> None:
    """The MIME type detected from the PNG fixture's bytes contains "png"."""
    png_bytes = Path("images/test_hello_world.png").read_bytes()
    detected = detect_mime_type_from_bytes(png_bytes)
    assert "png" in detected  # noqa: S101
|
||||
|
||||
|
||||
def test_mime_get_extensions() -> None:
    """The extensions reported for application/pdf include "pdf"."""
    extensions = get_extensions_for_mime("application/pdf")
    assert "pdf" in extensions  # noqa: S101
|
||||
46
e2e/python/tests/test_ocr_backend_management.py
generated
Normal file
46
e2e/python/tests/test_ocr_backend_management.py
generated
Normal file
@@ -0,0 +1,46 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: ocr_backend_management."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import clear_ocr_backends, list_ocr_backends, unregister_ocr_backend
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_ocr_backends_clear() -> None:
    """clear_ocr_backends can be called without raising.

    The return value is discarded and no post-condition on the backend list
    is asserted here.
    """
    clear_ocr_backends()
|
||||
|
||||
|
||||
def test_ocr_backends_list() -> None:
    """list_ocr_backends can be called without raising; the result is ignored."""
    list_ocr_backends()
|
||||
|
||||
|
||||
def test_ocr_backends_unregister() -> None:
    """Unregistering the OCR backend name "nonexistent-backend-xyz" must not raise."""
    # The return value is intentionally discarded.
    unregister_ocr_backend("nonexistent-backend-xyz")
|
||||
47
e2e/python/tests/test_pdf.py
generated
Normal file
47
e2e/python/tests/test_pdf.py
generated
Normal file
@@ -0,0 +1,47 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: pdf."""
|
||||
|
||||
from pathlib import Path
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import render_pdf_page_to_png
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_render_pdf_page_first() -> None:
    """Rendering page index 0 of the PDF fixture yields output of length >= 100."""
    document = Path("pdf/fake_memo.pdf").read_bytes()
    rendered = render_pdf_page_to_png(document, 0)
    assert len(rendered) >= 100  # noqa: S101
|
||||
|
||||
|
||||
def test_render_pdf_page_out_of_range() -> None:
    """Rendering page index 999 of the PDF fixture must raise.

    The fixture is read *before* entering ``pytest.raises`` so that a missing
    fixture file (``FileNotFoundError``) makes the test fail instead of being
    counted as the expected rendering error.
    """
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    page_index = 999

    # Only the call under test may satisfy the expectation.
    with pytest.raises(Exception):  # noqa: B017
        render_pdf_page_to_png(pdf_bytes, page_index)
|
||||
176
e2e/python/tests/test_plugin_api.py
generated
Normal file
176
e2e/python/tests/test_plugin_api.py
generated
Normal file
@@ -0,0 +1,176 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: plugin_api."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import register_document_extractor, register_embedding_backend, register_ocr_backend, register_post_processor, register_renderer, register_validator, unregister_document_extractor, unregister_embedding_backend, unregister_post_processor, unregister_renderer, unregister_validator, unregister_ocr_backend
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_register_document_extractor_trait_bridge() -> None:
    """Register a minimal stub document extractor, then unregister it by name."""

    # Stub whose methods return fixed placeholder values.
    class _TestStub_register_document_extractor_trait_bridge:
        def name(self):
            return "test-extractor"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        async def extract_bytes(self, _p0, _p1, _p2):
            return {}

        def supported_mime_types(self):
            return []

    stub = _TestStub_register_document_extractor_trait_bridge()
    register_document_extractor(stub)
    # Unregister under the same name the stub reports from name().
    unregister_document_extractor("test-extractor")
|
||||
|
||||
|
||||
def test_register_embedding_backend_trait_bridge() -> None:
    """Register a minimal stub embedding backend, then unregister it by name."""

    # Stub whose methods return fixed placeholder values.
    class _TestStub_register_embedding_backend_trait_bridge:
        def name(self):
            return "test-embedding-backend"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        def dimensions(self):
            return 1

        async def embed(self, _p0):
            return []

    stub = _TestStub_register_embedding_backend_trait_bridge()
    register_embedding_backend(stub)
    # Unregister under the same name the stub reports from name().
    unregister_embedding_backend("test-embedding-backend")
|
||||
|
||||
|
||||
def test_register_ocr_backend_trait_bridge() -> None:
    """Register a minimal stub OCR backend, then unregister it by name."""

    # Stub whose methods return fixed placeholder values.
    class _TestStub_register_ocr_backend_trait_bridge:
        def name(self):
            return "test-backend"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        async def process_image(self, _p0, _p1):
            return {}

        def supports_language(self, _p0):
            return False

        def backend_type(self):
            return {}

    stub = _TestStub_register_ocr_backend_trait_bridge()
    register_ocr_backend(stub)
    # Unregister under the same name the stub reports from name().
    unregister_ocr_backend("test-backend")
|
||||
|
||||
|
||||
def test_register_post_processor_trait_bridge() -> None:
    """Register a minimal stub post-processor, then unregister it by name."""

    # Stub whose methods return fixed placeholder values.
    class _TestStub_register_post_processor_trait_bridge:
        def name(self):
            return "test-processor"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        async def process(self, _p0, _p1):
            return None

        def processing_stage(self):
            return {}

    stub = _TestStub_register_post_processor_trait_bridge()
    register_post_processor(stub)
    # Unregister under the same name the stub reports from name().
    unregister_post_processor("test-processor")
|
||||
|
||||
|
||||
def test_register_renderer_trait_bridge() -> None:
    """Register a minimal stub renderer, then unregister it by name."""

    # Stub whose methods return fixed placeholder values.
    class _TestStub_register_renderer_trait_bridge:
        def name(self):
            return "test-renderer"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        def render(self, _p0):
            return ""

    stub = _TestStub_register_renderer_trait_bridge()
    register_renderer(stub)
    # Unregister under the same name the stub reports from name().
    unregister_renderer("test-renderer")
|
||||
|
||||
|
||||
def test_register_validator_trait_bridge() -> None:
    """Register a minimal stub validator, then unregister it by name."""

    # Stub whose methods return fixed placeholder values.
    class _TestStub_register_validator_trait_bridge:
        def name(self):
            return "test-validator"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        async def validate(self, _p0, _p1):
            return None

    stub = _TestStub_register_validator_trait_bridge()
    register_validator(stub)
    # Unregister under the same name the stub reports from name().
    unregister_validator("test-validator")
|
||||
|
||||
|
||||
def test_unregister_document_extractor_after_register() -> None:
    """unregister_document_extractor("test-extractor") completes without raising."""
    # The return value is intentionally discarded.
    unregister_document_extractor("test-extractor")
|
||||
|
||||
|
||||
def test_unregister_embedding_backend_after_register() -> None:
    """unregister_embedding_backend("test-embedding-backend") completes without raising."""
    # The return value is intentionally discarded.
    unregister_embedding_backend("test-embedding-backend")
|
||||
|
||||
|
||||
def test_unregister_post_processor_after_register() -> None:
    """unregister_post_processor("test-processor") completes without raising."""
    # The return value is intentionally discarded.
    unregister_post_processor("test-processor")
|
||||
|
||||
|
||||
def test_unregister_renderer_after_register() -> None:
    """unregister_renderer("test-renderer") completes without raising."""
    # The return value is intentionally discarded.
    unregister_renderer("test-renderer")
|
||||
|
||||
|
||||
def test_unregister_validator_after_register() -> None:
    """unregister_validator("test-validator") completes without raising."""
    # The return value is intentionally discarded.
    unregister_validator("test-validator")
|
||||
39
e2e/python/tests/test_post_processor_management.py
generated
Normal file
39
e2e/python/tests/test_post_processor_management.py
generated
Normal file
@@ -0,0 +1,39 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: post_processor_management."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import clear_post_processors, list_post_processors
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_post_processors_clear() -> None:
    """clear_post_processors can be called without raising.

    The return value is discarded and no post-condition on the processor
    list is asserted here.
    """
    clear_post_processors()
|
||||
|
||||
|
||||
def test_post_processors_list() -> None:
    """list_post_processors can be called without raising; the result is ignored."""
    list_post_processors()
|
||||
59
e2e/python/tests/test_registry.py
generated
Normal file
59
e2e/python/tests/test_registry.py
generated
Normal file
@@ -0,0 +1,59 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: registry."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import list_document_extractors, list_embedding_backends, list_ocr_backends, list_post_processors, list_renderers, list_validators
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_list_document_extractors() -> None:
    """list_document_extractors can be called without raising; the result is ignored."""
    list_document_extractors()
|
||||
|
||||
|
||||
def test_list_embedding_backends() -> None:
    """list_embedding_backends can be called without raising; the result is ignored."""
    list_embedding_backends()
|
||||
|
||||
|
||||
def test_list_ocr_backends() -> None:
    """list_ocr_backends can be called without raising; the result is ignored."""
    list_ocr_backends()
|
||||
|
||||
|
||||
def test_list_post_processors() -> None:
    """list_post_processors can be called without raising; the result is ignored."""
    list_post_processors()
|
||||
|
||||
|
||||
def test_list_renderers() -> None:
    """list_renderers can be called without raising; the result is ignored."""
    list_renderers()
|
||||
|
||||
|
||||
def test_list_validators() -> None:
    """list_validators can be called without raising; the result is ignored."""
    list_validators()
|
||||
50
e2e/python/tests/test_registry_operations.py
generated
Normal file
50
e2e/python/tests/test_registry_operations.py
generated
Normal file
@@ -0,0 +1,50 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: registry_operations."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import get_extensions_for_mime
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_extensions_docx() -> None:
    """get_extensions_for_mime accepts the DOCX MIME type without raising."""
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    # The returned extensions are intentionally not inspected.
    get_extensions_for_mime(docx_mime)
|
||||
|
||||
|
||||
def test_extensions_html() -> None:
    """get_extensions_for_mime accepts text/html without raising."""
    # The returned extensions are intentionally not inspected.
    get_extensions_for_mime("text/html")
|
||||
|
||||
|
||||
def test_extensions_pdf() -> None:
    """get_extensions_for_mime accepts application/pdf without raising."""
    # The returned extensions are intentionally not inspected.
    get_extensions_for_mime("application/pdf")
|
||||
39
e2e/python/tests/test_renderer_management.py
generated
Normal file
39
e2e/python/tests/test_renderer_management.py
generated
Normal file
@@ -0,0 +1,39 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: renderer_management."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import clear_renderers, list_renderers
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
def test_renderers_clear() -> None:
    """clear_renderers can be called without raising.

    The return value is discarded and no post-condition on the renderer list
    is asserted here.
    """
    clear_renderers()
|
||||
|
||||
|
||||
def test_renderers_list() -> None:
    """list_renderers can be called without raising; the result is ignored."""
    list_renderers()
|
||||
148
e2e/python/tests/test_smoke.py
generated
Normal file
148
e2e/python/tests/test_smoke.py
generated
Normal file
@@ -0,0 +1,148 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: smoke."""
|
||||
|
||||
from pathlib import Path
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import extract_bytes, extract_file, ExtractionConfig
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, except that ``None`` becomes the empty string."""
    if value is None:
        return ""
    return str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of *item* that assertions can match against.

    The result always has eight entries, in this order: ``str(item)``, the
    ``kind``, ``name``, ``source``, ``alias``, ``text`` and ``signature``
    attributes, and finally the space-joined ``items`` attribute. A missing
    or ``None`` attribute contributes ``""``; ``items`` is only joined when
    it is a ``list``.
    """
    children = getattr(item, "items", None)
    if isinstance(children, list):
        joined_children = " ".join(map(str, children))
    else:
        joined_children = ""

    texts = [_alef_e2e_text(item)]
    for attr in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, attr, None)))
    texts.append(joined_children)
    return tuple(texts)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_ocr_image_png() -> None:
    """OCR: extract a PNG image with a default config and check the recognised text.

    The fixture description notes that in WASM this exercises the Uint8Array
    bridge parameter and Promise await in the generated OcrBackend bridge.
    """
    png_bytes = Path("images/test_hello_world.png").read_bytes()
    extraction = await extract_bytes(png_bytes, "image/png", ExtractionConfig())

    assert extraction.mime_type.strip() == "image/png"  # noqa: S101
    assert len(extraction.content) >= 1  # noqa: S101
    # Any one capitalisation of either word is accepted.
    expected_words = ["Hello", "World", "hello", "world"]
    assert any(word in extraction.content for word in expected_words)  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_smoke_docx_basic() -> None:
    """Smoke test: extract_file on the DOCX fixture reports the DOCX MIME type and text."""
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    extraction = await extract_file("docx/fake.docx", docx_mime, ExtractionConfig())

    assert extraction.mime_type.strip() == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"  # noqa: S101
    assert len(extraction.content) >= 20  # noqa: S101
    # At least one of these words must appear in the extracted content.
    expected_words = ["Lorem", "ipsum", "document", "text"]
    assert any(word in extraction.content for word in expected_words)  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_smoke_html_basic() -> None:
    """Smoke test: extract_file on the HTML table fixture returns recognisable text."""
    extraction = await extract_file("html/simple_table.html", "text/html", ExtractionConfig())

    assert extraction.mime_type.strip() == "text/html"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # At least one of these phrases must appear in the extracted content.
    expected_phrases = ["Sample Data Table", "Laptop", "Electronics", "Product"]
    assert any(phrase in extraction.content for phrase in expected_phrases)  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_smoke_image_png() -> None:
    """Smoke test: a PNG extracted with OCR disabled still reports image/png.

    No MIME type is passed to ``extract_file`` (the argument is ``None``).
    """
    ocr_off_config = ExtractionConfig(disable_ocr=True)
    extraction = await extract_file("images/sample.png", None, ocr_off_config)
    assert extraction.mime_type.strip() == "image/png"  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_smoke_json_basic() -> None:
    """Smoke test: extract_file on the JSON fixture reports application/json and content."""
    extraction = await extract_file("json/simple.json", "application/json", ExtractionConfig())

    assert extraction.mime_type.strip() == "application/json"  # noqa: S101
    assert len(extraction.content) >= 5  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_smoke_pdf_basic() -> None:
    """Smoke test: extract_file on the PDF memo fixture returns recognisable text."""
    extraction = await extract_file("pdf/fake_memo.pdf", "application/pdf", ExtractionConfig())

    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 50  # noqa: S101
    # At least one of these phrases must appear in the extracted content.
    expected_phrases = ["May 5, 2023", "To Whom it May Concern"]
    assert any(phrase in extraction.content for phrase in expected_phrases)  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_smoke_txt_basic() -> None:
    """Smoke test: extract_file on the plain-text fixture reports text/plain and content."""
    extraction = await extract_file("text/report.txt", "text/plain", ExtractionConfig())

    assert extraction.mime_type.strip() == "text/plain"  # noqa: S101
    assert len(extraction.content) >= 5  # noqa: S101
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_smoke_xlsx_basic() -> None:
    """Smoke test: extract_file on the XLSX fixture returns the expected cell text."""
    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    extraction = await extract_file("xlsx/stanley_cups.xlsx", xlsx_mime, ExtractionConfig())

    assert extraction.mime_type.strip() == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"  # noqa: S101
    assert len(extraction.content) >= 100  # noqa: S101
    # Every one of these strings must appear, checked in this order.
    required_strings = (
        "Team",
        "Location",
        "Stanley Cups",
        "Blues",
        "Flyers",
        "Maple Leafs",
        "STL",
        "PHI",
        "TOR",
    )
    for required in required_strings:
        assert required in extraction.content  # noqa: S101
    # skipped: field 'tables' not available on result type
    # skipped: field 'metadata.format.excel.sheet_count' not available on result type
    # skipped: field 'metadata.format.excel.sheet_names' not available on result type
|
||||
39
e2e/python/tests/test_validator_management.py
generated
Normal file
39
e2e/python/tests/test_validator_management.py
generated
Normal file
@@ -0,0 +1,39 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
"""E2e tests for category: validator_management."""
|
||||
|
||||
import pytest # noqa: F401
|
||||
from kreuzberg import clear_validators, list_validators
|
||||
|
||||
|
||||
def _alef_e2e_text(value: object) -> str:
|
||||
return "" if value is None else str(value)
|
||||
|
||||
|
||||
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the textual representations of ``item`` into a tuple.

    The tuple holds, in order: ``str(item)``, then the ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (each an
    empty string when missing or ``None``), and finally the space-joined
    string form of ``item.items``.
    """
    raw_items = getattr(item, "items", None)
    # Only a real list is joined; anything else (a missing attribute, or a
    # bound method such as dict.items) yields an empty string.
    items_text = " ".join(str(value) for value in raw_items) if isinstance(raw_items, list) else ""
    return (
        _alef_e2e_text(item),
        _alef_e2e_text(getattr(item, "kind", None)),
        _alef_e2e_text(getattr(item, "name", None)),
        _alef_e2e_text(getattr(item, "source", None)),
        _alef_e2e_text(getattr(item, "alias", None)),
        _alef_e2e_text(getattr(item, "text", None)),
        _alef_e2e_text(getattr(item, "signature", None)),
        items_text,
    )
|
||||
|
||||
|
||||
def test_validators_clear() -> None:
    """Clear all validators and verify list is empty."""
    # NOTE(review): despite the docstring, nothing is asserted here; the
    # test only checks that clear_validators() returns without raising.
    # If an emptiness check is wanted, it should come from the generator
    # fixture rather than a hand edit of this generated file.
    _ = clear_validators()
|
||||
|
||||
|
||||
def test_validators_list() -> None:
    """List all registered validators.

    The return value is discarded; the test only checks that the call
    completes without raising.
    """
    _ = list_validators()
|
||||
Reference in New Issue
Block a user