Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

1
e2e/python/tests/__init__.py generated Normal file
View File

@@ -0,0 +1 @@

64
e2e/python/tests/test_async.py generated Normal file
View File

@@ -0,0 +1,64 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: async."""
from pathlib import Path
import pytest # noqa: F401
from kreuzberg import extract_bytes, ExtractionConfig
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` to the empty string."""
    if value is None:
        return ""
    return str(value)


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the string projections of *item* used by e2e assertions.

    The result is an 8-tuple: the item itself, its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (missing or
    ``None`` attributes become ``""``), and finally the space-joined ``items``
    attribute when that attribute is a ``list`` (otherwise ``""``).
    """
    children = getattr(item, "items", None)
    joined = " ".join(map(str, children)) if isinstance(children, list) else ""
    own_text = _alef_e2e_text(item)
    attr_texts = tuple(
        _alef_e2e_text(getattr(item, attr, None))
        for attr in ("kind", "name", "source", "alias", "text", "signature")
    )
    return (own_text, *attr_texts, joined)
@pytest.mark.asyncio
async def test_async_extract_bytes() -> None:
    """Extract the PDF memo fixture from raw bytes through the async API."""
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    extraction = await extract_bytes(pdf_bytes, "application/pdf", None)
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 50  # noqa: S101
@pytest.mark.asyncio
async def test_async_extract_bytes_empty_mime() -> None:
    """Async ``extract_bytes`` must raise when the MIME type is empty."""
    # Load the fixture and build the config outside ``pytest.raises`` so that a
    # missing fixture file fails the test instead of satisfying it.
    content = Path("text/plain.txt").read_bytes()
    config = ExtractionConfig()
    with pytest.raises(Exception):  # noqa: B017
        await extract_bytes(content, "", config)
@pytest.mark.asyncio
async def test_async_extract_bytes_invalid_mime() -> None:
    """Async ``extract_bytes`` must raise for an unsupported MIME type."""
    # Only the call under test belongs inside ``pytest.raises``; a missing
    # fixture file would otherwise be accepted as the expected failure.
    content = Path("text/plain.txt").read_bytes()
    config = ExtractionConfig()
    with pytest.raises(Exception):  # noqa: B017
        await extract_bytes(content, "application/x-nonexistent", config)

109
e2e/python/tests/test_batch.py generated Normal file
View File

@@ -0,0 +1,109 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: batch."""
import pytest # noqa: F401
from kreuzberg import batch_extract_bytes_sync, batch_extract_bytes, batch_extract_files, batch_extract_files_sync, BatchBytesItem, BatchFileItem
def _alef_e2e_text(value: object) -> str:
    """Stringify *value*; ``None`` becomes ``""``."""
    return str(value) if value is not None else ""


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Return the text views of *item* that e2e assertions search through.

    Order: ``str(item)``, then the ``kind``, ``name``, ``source``, ``alias``,
    ``text`` and ``signature`` attributes (absent/``None`` -> ``""``), then the
    space-joined ``items`` attribute if it is a ``list``, else ``""``.
    """
    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        nested_text = " ".join([str(entry) for entry in nested])
    else:
        nested_text = ""
    texts = [_alef_e2e_text(item)]
    for field in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, field, None)))
    texts.append(nested_text)
    return tuple(texts)
def test_batch_bytes_invalid_mime() -> None:
    """Smoke test: a batch item with an unsupported MIME type must not raise."""
    batch = [
        BatchBytesItem(content=b"Hello", mime_type="application/x-nonexistent"),
    ]
    _ = batch_extract_bytes_sync(batch, None)
@pytest.mark.asyncio
async def test_batch_extract_bytes_happy() -> None:
    """Async batch extraction of a plain-text and an HTML payload yields results."""
    plain_item = BatchBytesItem(content=b"Hello, world!", mime_type="text/plain")
    html_item = BatchBytesItem(
        content=b"<html><body>Test</body></html>",
        mime_type="text/html",
    )
    outcomes = await batch_extract_bytes([plain_item, html_item], None)
    assert len(outcomes) >= 1  # noqa: S101
@pytest.mark.asyncio
async def test_batch_extract_bytes_mixed_format() -> None:
    """Smoke test: async batch extraction tolerates an unsupported MIME type."""
    batch = [
        BatchBytesItem(content=b"PDF placeholder", mime_type="application/x-unknown"),
    ]
    _ = await batch_extract_bytes(batch, None)
def test_batch_extract_bytes_sync_empty_list() -> None:
    """An empty batch produces an empty result from ``batch_extract_bytes_sync``."""
    outcomes = batch_extract_bytes_sync([], None)
    assert len(outcomes) == 0  # noqa: S101
def test_batch_extract_bytes_sync_invalid_mime() -> None:
    """Smoke test: sync batch extraction tolerates an unsupported MIME type."""
    batch = [BatchBytesItem(content=b"data", mime_type="application/x-unknown")]
    _ = batch_extract_bytes_sync(batch, None)
@pytest.mark.asyncio
async def test_batch_file_async_basic() -> None:
    """Smoke test: async batch file extraction of a PDF and a text fixture."""
    fixture_paths = ("pdf/fake_memo.pdf", "text/fake_text.txt")
    batch = [BatchFileItem(path=fixture) for fixture in fixture_paths]
    _ = await batch_extract_files(batch, None)
@pytest.mark.asyncio
async def test_batch_file_async_not_found() -> None:
    """Smoke test: async batch file extraction with a nonexistent path must not raise."""
    batch = [BatchFileItem(path="/nonexistent/a.pdf")]
    _ = await batch_extract_files(batch, None)
def test_batch_file_not_found() -> None:
    """Smoke test: sync batch file extraction where every path is missing."""
    missing = ("/nonexistent/a.pdf", "/nonexistent/b.txt")
    batch = [BatchFileItem(path=entry) for entry in missing]
    _ = batch_extract_files_sync(batch, None)
def test_batch_file_partial() -> None:
    """Smoke test: sync batch with one fixture path and one missing path."""
    present = BatchFileItem(path="text/plain.txt")
    absent = BatchFileItem(path="/nonexistent/missing.pdf")
    _ = batch_extract_files_sync([present, absent], None)
def test_batch_file_sync_basic() -> None:
    """Smoke test: sync batch file extraction of a PDF and a text fixture."""
    fixture_paths = ("pdf/fake_memo.pdf", "text/fake_text.txt")
    batch = [BatchFileItem(path=fixture) for fixture in fixture_paths]
    _ = batch_extract_files_sync(batch, None)

41
e2e/python/tests/test_code.py generated Normal file
View File

@@ -0,0 +1,41 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: code."""
import pytest # noqa: F401
from kreuzberg import extract_file_sync
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` to the empty string."""
    if value is None:
        return ""
    return str(value)


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the string projections of *item* used by e2e assertions.

    The result is an 8-tuple: the item itself, its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (missing or
    ``None`` attributes become ``""``), and finally the space-joined ``items``
    attribute when that attribute is a ``list`` (otherwise ``""``).
    """
    children = getattr(item, "items", None)
    joined = " ".join(map(str, children)) if isinstance(children, list) else ""
    own_text = _alef_e2e_text(item)
    attr_texts = tuple(
        _alef_e2e_text(getattr(item, attr, None))
        for attr in ("kind", "name", "source", "alias", "text", "signature")
    )
    return (own_text, *attr_texts, joined)
def test_code_shebang_detection() -> None:
    """Extract a shell script as source code and check its expected tokens.

    NOTE(review): the fixture description mentions bytes input, but the
    generated body goes through ``extract_file_sync`` with a path.
    """
    extraction = extract_file_sync("code/script.sh", "text/x-source-code", None)
    assert extraction.mime_type.strip() == "text/x-source-code"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    assert "build" in extraction.content  # noqa: S101
    assert "clean" in extraction.content  # noqa: S101

227
e2e/python/tests/test_contract.py generated Normal file
View File

@@ -0,0 +1,227 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: contract."""
from pathlib import Path
import pytest # noqa: F401
from kreuzberg import extract_file, extract_file_sync, extract_bytes_sync, ExtractionConfig
def _alef_e2e_text(value: object) -> str:
    """Stringify *value*; ``None`` becomes ``""``."""
    return str(value) if value is not None else ""


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Return the text views of *item* that e2e assertions search through.

    Order: ``str(item)``, then the ``kind``, ``name``, ``source``, ``alias``,
    ``text`` and ``signature`` attributes (absent/``None`` -> ``""``), then the
    space-joined ``items`` attribute if it is a ``list``, else ``""``.
    """
    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        nested_text = " ".join([str(entry) for entry in nested])
    else:
        nested_text = ""
    texts = [_alef_e2e_text(item)]
    for field in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, field, None)))
    texts.append(nested_text)
    return tuple(texts)
@pytest.mark.asyncio
async def test_api_batch_bytes_async() -> None:
    """Contract check labelled for the async batch-bytes API.

    The generated body exercises ``extract_file`` on the PDF memo fixture.
    """
    extraction = await extract_file("pdf/fake_memo.pdf", None, None)
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    assert any(needle in extraction.content for needle in ("May 5, 2023", "Mallori"))  # noqa: S101
@pytest.mark.asyncio
async def test_api_batch_bytes_with_configs_async() -> None:
    """Contract check labelled for async batch-bytes extraction with configs.

    The generated body calls ``extract_file`` with a markdown output config.
    """
    markdown_config = ExtractionConfig(output_format="markdown")
    extraction = await extract_file("pdf/fake_memo.pdf", None, markdown_config)
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'metadata.output_format' not available on result type
@pytest.mark.asyncio
async def test_api_batch_file_async() -> None:
    """Contract check labelled for the async batch-file API.

    The generated body exercises ``extract_file`` on the PDF memo fixture.
    """
    extraction = await extract_file("pdf/fake_memo.pdf", None, None)
    body = extraction.content
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(body) >= 10  # noqa: S101
    assert "May 5, 2023" in body or "Mallori" in body  # noqa: S101
@pytest.mark.asyncio
async def test_api_batch_file_with_configs_async() -> None:
    """Contract check labelled for async batch-file extraction with configs.

    The generated body calls ``extract_file`` with a markdown output config.
    """
    markdown_config = ExtractionConfig(output_format="markdown")
    extraction = await extract_file("pdf/fake_memo.pdf", None, markdown_config)
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'metadata.output_format' not available on result type
@pytest.mark.asyncio
async def test_api_extract_bytes_async() -> None:
    """Contract check labelled for the async bytes API.

    The generated body exercises ``extract_file`` on the PDF memo fixture.
    """
    extraction = await extract_file("pdf/fake_memo.pdf", None, None)
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    assert any(needle in extraction.content for needle in ("May 5, 2023", "Mallori"))  # noqa: S101
@pytest.mark.asyncio
async def test_api_extract_file_async() -> None:
    """Async ``extract_file`` on the PDF memo returns the expected MIME type and text."""
    extraction = await extract_file("pdf/fake_memo.pdf", None, None)
    body = extraction.content
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(body) >= 10  # noqa: S101
    assert "May 5, 2023" in body or "Mallori" in body  # noqa: S101
def test_config_chunking_prepend_heading_context() -> None:
    """Markdown chunking with ``prepend_heading_context`` gives every chunk a heading context."""
    chunking_options = {"chunker_type": "markdown", "max_chars": 300, "max_overlap": 50, "prepend_heading_context": True}
    result = extract_file_sync(
        "markdown/extraction_test.md",
        None,
        ExtractionConfig(chunking=chunking_options),
    )
    assert len(result.content) >= 10  # noqa: S101
    # skipped: field 'chunks' not available on result type
    assert all(chunk.content for chunk in (result.chunks or []))  # noqa: S101
    assert all(chunk.metadata and chunk.metadata.heading_context is not None for chunk in (result.chunks or []))  # noqa: S101
    # The final check also requires at least one chunk to exist.
    assert bool((result.chunks or []) and (result.chunks[0].metadata and result.chunks[0].metadata.heading_context))  # noqa: S101
def test_config_document_structure_with_headings() -> None:
    """DOCX extraction with ``include_document_structure`` reports the DOCX MIME type."""
    structure_config = ExtractionConfig(include_document_structure=True)
    extraction = extract_file_sync("docx/fake.docx", None, structure_config)
    assert extraction.mime_type.strip() == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"  # noqa: S101
    # skipped: field 'document' not available on result type
    # skipped: field 'document.nodes' not available on result type
def test_config_element_types() -> None:
    """DOCX extraction in ``element_based`` result format reports a DOCX MIME type."""
    element_config = ExtractionConfig(result_format="element_based")
    extraction = extract_file_sync("docx/unit_test_headers.docx", None, element_config)
    assert "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in extraction.mime_type  # noqa: S101
    # skipped: field 'elements' not available on result type
def test_config_extraction_timeout() -> None:
    """A 300-second ``extraction_timeout_secs`` is accepted and the PDF still extracts."""
    timeout_config = ExtractionConfig(extraction_timeout_secs=300)
    extraction = extract_file_sync("pdf/fake_memo.pdf", None, timeout_config)
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
def test_config_keywords() -> None:
    """PDF extraction still succeeds with a YAKE keyword configuration."""
    keyword_options = {"algorithm": "yake", "max_keywords": 10}
    extraction = extract_file_sync(
        "pdf/fake_memo.pdf",
        None,
        ExtractionConfig(keywords=keyword_options),
    )
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'keywords' not available on Python ExtractionResult
    # skipped: field 'keywords' not available on Python ExtractionResult
def test_config_pages() -> None:
    """Page extraction with page markers leaves a ``PAGE`` marker in the content."""
    page_options = {"extract_pages": True, "insert_page_markers": True}
    extraction = extract_file_sync(
        "pdf/fake_memo.pdf",
        None,
        ExtractionConfig(pages=page_options),
    )
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    assert "PAGE" in extraction.content  # noqa: S101
def test_config_quality_enabled() -> None:
    """PDF extraction still succeeds with ``enable_quality_processing`` switched on."""
    quality_config = ExtractionConfig(enable_quality_processing=True)
    extraction = extract_file_sync("pdf/fake_memo.pdf", None, quality_config)
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
    # skipped: field 'quality_score' not available on result type
def test_config_security_limits() -> None:
    """ZIP archive extraction works under custom security limits."""
    limits = {"max_archive_size": 104857600, "max_compression_ratio": 50, "max_files_in_archive": 100}
    extraction = extract_file_sync(
        "archives/documents.zip",
        None,
        ExtractionConfig(security_limits=limits),
    )
    zip_mimes = ("application/zip", "application/x-zip-compressed")
    assert any(candidate in extraction.mime_type for candidate in zip_mimes)  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
def test_config_tree_sitter() -> None:
    """A Python source file extracts as source code under a tree-sitter configuration."""
    tree_sitter_options = {
        "groups": ["web"],
        "languages": ["python", "rust"],
        "process": {
            "comments": False,
            "diagnostics": False,
            "docstrings": False,
            "exports": True,
            "imports": True,
            "structure": True,
            "symbols": False,
        },
    }
    extraction = extract_file_sync(
        "code/hello.py",
        None,
        ExtractionConfig(tree_sitter=tree_sitter_options),
    )
    assert extraction.mime_type.strip() == "text/x-source-code"  # noqa: S101
    assert len(extraction.content) >= 5  # noqa: S101
def test_output_format_bytes_markdown() -> None:
    """Markdown output format requested through the sync bytes API."""
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    markdown_config = ExtractionConfig(output_format="markdown")
    extraction = extract_bytes_sync(pdf_bytes, "application/pdf", markdown_config)
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'metadata.output_format' not available on result type
def test_output_format_markdown() -> None:
    """Markdown output format requested through the sync file API."""
    markdown_config = ExtractionConfig(output_format="markdown")
    extraction = extract_file_sync("pdf/fake_memo.pdf", None, markdown_config)
    assert extraction.mime_type.strip() == "application/pdf"  # noqa: S101
    assert len(extraction.content) >= 10  # noqa: S101
    # skipped: field 'metadata.output_format' not available on result type

58
e2e/python/tests/test_detection.py generated Normal file
View File

@@ -0,0 +1,58 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: detection."""
from pathlib import Path
import pytest # noqa: F401
from kreuzberg import detect_mime_type_from_bytes, get_extensions_for_mime
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` to the empty string."""
    if value is None:
        return ""
    return str(value)


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the string projections of *item* used by e2e assertions.

    The result is an 8-tuple: the item itself, its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (missing or
    ``None`` attributes become ``""``), and finally the space-joined ``items``
    attribute when that attribute is a ``list`` (otherwise ``""``).
    """
    children = getattr(item, "items", None)
    joined = " ".join(map(str, children)) if isinstance(children, list) else ""
    own_text = _alef_e2e_text(item)
    attr_texts = tuple(
        _alef_e2e_text(getattr(item, attr, None))
        for attr in ("kind", "name", "source", "alias", "text", "signature")
    )
    return (own_text, *attr_texts, joined)
def test_detect_mime_bytes_html() -> None:
    """Smoke test: MIME detection on the HTML fixture's bytes does not raise."""
    html_bytes = Path("html/html.html").read_bytes()
    _ = detect_mime_type_from_bytes(html_bytes)
def test_detect_mime_bytes_pdf() -> None:
    """Smoke test: MIME detection on the PDF fixture's bytes does not raise."""
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    _ = detect_mime_type_from_bytes(pdf_bytes)
def test_detect_mime_bytes_png() -> None:
    """Smoke test: MIME detection on the PNG fixture's bytes does not raise."""
    png_bytes = Path("images/test_hello_world.png").read_bytes()
    _ = detect_mime_type_from_bytes(png_bytes)
def test_get_extensions_unknown_mime() -> None:
    """``get_extensions_for_mime`` must raise for an unknown MIME type."""
    unknown_mime = "application/x-totally-unknown"
    with pytest.raises(Exception):  # noqa: B017
        get_extensions_for_mime(unknown_mime)

View File

@@ -0,0 +1,39 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: document_extractor_management."""
import pytest # noqa: F401
from kreuzberg import clear_document_extractors, list_document_extractors
def _alef_e2e_text(value: object) -> str:
    """Stringify *value*; ``None`` becomes ``""``."""
    return str(value) if value is not None else ""


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Return the text views of *item* that e2e assertions search through.

    Order: ``str(item)``, then the ``kind``, ``name``, ``source``, ``alias``,
    ``text`` and ``signature`` attributes (absent/``None`` -> ``""``), then the
    space-joined ``items`` attribute if it is a ``list``, else ``""``.
    """
    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        nested_text = " ".join([str(entry) for entry in nested])
    else:
        nested_text = ""
    texts = [_alef_e2e_text(item)]
    for field in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, field, None)))
    texts.append(nested_text)
    return tuple(texts)
def test_document_extractors_clear() -> None:
    """Smoke test: ``clear_document_extractors`` runs without raising.

    NOTE(review): the fixture description mentions verifying that the list is
    empty afterwards, but no such assertion is generated here.
    """
    cleared = clear_document_extractors()
    _ = cleared
def test_extractors_list() -> None:
    """Smoke test: ``list_document_extractors`` runs without raising."""
    listed = list_document_extractors()
    _ = listed

View File

@@ -0,0 +1,59 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: embed_async_pending."""
import pytest # noqa: F401
from kreuzberg import embed_texts_async, ExtractionConfig
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` to the empty string."""
    if value is None:
        return ""
    return str(value)


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the string projections of *item* used by e2e assertions.

    The result is an 8-tuple: the item itself, its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (missing or
    ``None`` attributes become ``""``), and finally the space-joined ``items``
    attribute when that attribute is a ``list`` (otherwise ``""``).
    """
    children = getattr(item, "items", None)
    joined = " ".join(map(str, children)) if isinstance(children, list) else ""
    own_text = _alef_e2e_text(item)
    attr_texts = tuple(
        _alef_e2e_text(getattr(item, attr, None))
        for attr in ("kind", "name", "source", "alias", "text", "signature")
    )
    return (own_text, *attr_texts, joined)
@pytest.mark.asyncio
async def test_embed_texts_async_empty_input() -> None:
    """Embedding an empty list of texts asynchronously yields an empty result."""
    embeddings = await embed_texts_async([], None)
    assert len(embeddings) == 0  # noqa: S101
@pytest.mark.asyncio
async def test_embed_texts_async_happy() -> None:
    """Embedding two texts asynchronously yields at least two results."""
    embeddings = await embed_texts_async(["First", "Second"], None)
    assert len(embeddings) >= 2  # noqa: S101
@pytest.mark.asyncio
async def test_embed_texts_async_preset_switch() -> None:
    """Smoke test: async embedding with the ``balanced`` preset does not raise."""
    preset_config = ExtractionConfig(model={"name": "balanced", "type": "preset"})
    _ = await embed_texts_async(["Text"], preset_config)

37
e2e/python/tests/test_embed_extra.py generated Normal file
View File

@@ -0,0 +1,37 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: embed_extra."""
import pytest # noqa: F401
from kreuzberg import embed_texts, ExtractionConfig
def _alef_e2e_text(value: object) -> str:
    """Stringify *value*; ``None`` becomes ``""``."""
    return str(value) if value is not None else ""


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Return the text views of *item* that e2e assertions search through.

    Order: ``str(item)``, then the ``kind``, ``name``, ``source``, ``alias``,
    ``text`` and ``signature`` attributes (absent/``None`` -> ``""``), then the
    space-joined ``items`` attribute if it is a ``list``, else ``""``.
    """
    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        nested_text = " ".join([str(entry) for entry in nested])
    else:
        nested_text = ""
    texts = [_alef_e2e_text(item)]
    for field in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, field, None)))
    texts.append(nested_text)
    return tuple(texts)
def test_embed_texts_batch() -> None:
    """Smoke test: sync embedding of two texts with the ``balanced`` preset."""
    preset_config = ExtractionConfig(model={"name": "balanced", "type": "preset"})
    _ = embed_texts(["Hello", "World"], preset_config)

View File

@@ -0,0 +1,39 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: embedding_backend_management."""
import pytest # noqa: F401
from kreuzberg import clear_embedding_backends, list_embedding_backends
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` to the empty string."""
    if value is None:
        return ""
    return str(value)


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the string projections of *item* used by e2e assertions.

    The result is an 8-tuple: the item itself, its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (missing or
    ``None`` attributes become ``""``), and finally the space-joined ``items``
    attribute when that attribute is a ``list`` (otherwise ``""``).
    """
    children = getattr(item, "items", None)
    joined = " ".join(map(str, children)) if isinstance(children, list) else ""
    own_text = _alef_e2e_text(item)
    attr_texts = tuple(
        _alef_e2e_text(getattr(item, attr, None))
        for attr in ("kind", "name", "source", "alias", "text", "signature")
    )
    return (own_text, *attr_texts, joined)
def test_embedding_backends_clear() -> None:
    """Smoke test: ``clear_embedding_backends`` runs without raising.

    NOTE(review): the fixture description mentions verifying that the list is
    empty afterwards, but no such assertion is generated here.
    """
    cleared = clear_embedding_backends()
    _ = cleared
def test_embedding_backends_list() -> None:
    """Smoke test: ``list_embedding_backends`` runs without raising."""
    listed = list_embedding_backends()
    _ = listed

66
e2e/python/tests/test_embeddings.py generated Normal file
View File

@@ -0,0 +1,66 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: embeddings."""
import pytest # noqa: F401
from kreuzberg import embed_texts, get_embedding_preset, list_embedding_presets, ExtractionConfig
def _alef_e2e_text(value: object) -> str:
    """Stringify *value*; ``None`` becomes ``""``."""
    return str(value) if value is not None else ""


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Return the text views of *item* that e2e assertions search through.

    Order: ``str(item)``, then the ``kind``, ``name``, ``source``, ``alias``,
    ``text`` and ``signature`` attributes (absent/``None`` -> ``""``), then the
    space-joined ``items`` attribute if it is a ``list``, else ``""``.
    """
    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        nested_text = " ".join([str(entry) for entry in nested])
    else:
        nested_text = ""
    texts = [_alef_e2e_text(item)]
    for field in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, field, None)))
    texts.append(nested_text)
    return tuple(texts)
def test_embed_texts_different_preset() -> None:
    """Embedding two texts with the ``multilingual`` preset yields at least two results."""
    preset_config = ExtractionConfig(model={"name": "multilingual", "type": "preset"})
    embeddings = embed_texts(["Hello world", "Test"], preset_config)
    assert len(embeddings) >= 2  # noqa: S101
def test_get_embedding_preset_known() -> None:
    """Smoke test: looking up the ``balanced`` preset does not raise."""
    _ = get_embedding_preset("balanced")
def test_get_embedding_preset_nominal() -> None:
    """Smoke test: nominal ``get_embedding_preset`` lookup of ``balanced``."""
    preset_name = "balanced"
    _ = get_embedding_preset(preset_name)
def test_get_embedding_preset_unknown() -> None:
    """An unknown preset name yields a falsy result from ``get_embedding_preset``."""
    preset = get_embedding_preset("nonexistent-xyz")
    assert not preset  # noqa: S101
def test_list_embedding_presets_sanity() -> None:
    """``list_embedding_presets`` returns a truthy (non-empty) result."""
    presets = list_embedding_presets()
    assert presets  # noqa: S101

75
e2e/python/tests/test_error.py generated Normal file
View File

@@ -0,0 +1,75 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: error."""
from pathlib import Path
import pytest # noqa: F401
from kreuzberg import extract_bytes_sync, ExtractionConfig
def _alef_e2e_text(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` to the empty string."""
    if value is None:
        return ""
    return str(value)


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the string projections of *item* used by e2e assertions.

    The result is an 8-tuple: the item itself, its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (missing or
    ``None`` attributes become ``""``), and finally the space-joined ``items``
    attribute when that attribute is a ``list`` (otherwise ``""``).
    """
    children = getattr(item, "items", None)
    joined = " ".join(map(str, children)) if isinstance(children, list) else ""
    own_text = _alef_e2e_text(item)
    attr_texts = tuple(
        _alef_e2e_text(getattr(item, attr, None))
        for attr in ("kind", "name", "source", "alias", "text", "signature")
    )
    return (own_text, *attr_texts, joined)
def test_error_empty_bytes() -> None:
    """Smoke test: extracting the empty text fixture as ``text/plain`` must not raise."""
    empty_bytes = Path("text/empty.txt").read_bytes()
    default_config = ExtractionConfig()
    _ = extract_bytes_sync(empty_bytes, "text/plain", default_config)
def test_error_empty_mime() -> None:
    """``extract_bytes_sync`` must raise when the MIME type is an empty string."""
    # Fixture loading and config construction stay outside ``pytest.raises`` so
    # that a missing fixture file cannot make this test pass spuriously.
    content = Path("text/plain.txt").read_bytes()
    config = ExtractionConfig()
    with pytest.raises(Exception):  # noqa: B017
        extract_bytes_sync(content, "", config)
def test_error_extract_bytes_conflicting_ocr() -> None:
    """Requesting both ``disable_ocr`` and ``force_ocr`` must raise."""
    # Read the fixture first: a missing file must fail the test, not satisfy it.
    content = Path("text/fake_text.txt").read_bytes()
    with pytest.raises(Exception):  # noqa: B017
        # The config stays inside the block: it is not visible here whether the
        # conflict is rejected when the config is built or when extraction runs.
        config = ExtractionConfig(disable_ocr=True, force_ocr=True)
        extract_bytes_sync(content, "text/plain", config)
def test_error_invalid_mime_format() -> None:
    """``extract_bytes_sync`` must raise for a malformed MIME type string."""
    # Only the call under test belongs inside ``pytest.raises``; otherwise a
    # missing fixture file would be accepted as the expected failure.
    content = Path("text/plain.txt").read_bytes()
    config = ExtractionConfig()
    with pytest.raises(Exception):  # noqa: B017
        extract_bytes_sync(content, "not-a-mime", config)
def test_error_unsupported_mime() -> None:
    """``extract_bytes_sync`` must raise for an unsupported MIME type."""
    # Set up outside ``pytest.raises`` so that only the extraction call can
    # satisfy the expectation; a missing fixture file now fails the test.
    content = Path("text/plain.txt").read_bytes()
    config = ExtractionConfig()
    with pytest.raises(Exception):  # noqa: B017
        extract_bytes_sync(content, "application/x-nonexistent", config)

75
e2e/python/tests/test_format_specific.py generated Normal file
View File

@@ -0,0 +1,75 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: format_specific."""
from pathlib import Path
import pytest # noqa: F401
from kreuzberg import extract_bytes_sync, extract_file_sync
def _alef_e2e_text(value: object) -> str:
    """Stringify *value*; ``None`` becomes ``""``."""
    return str(value) if value is not None else ""


def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Return the text views of *item* that e2e assertions search through.

    Order: ``str(item)``, then the ``kind``, ``name``, ``source``, ``alias``,
    ``text`` and ``signature`` attributes (absent/``None`` -> ``""``), then the
    space-joined ``items`` attribute if it is a ``list``, else ``""``.
    """
    nested = getattr(item, "items", None)
    if isinstance(nested, list):
        nested_text = " ".join([str(entry) for entry in nested])
    else:
        nested_text = ""
    texts = [_alef_e2e_text(item)]
    for field in ("kind", "name", "source", "alias", "text", "signature"):
        texts.append(_alef_e2e_text(getattr(item, field, None)))
    texts.append(nested_text)
    return tuple(texts)
def test_format_docx_standalone() -> None:
    """DOCX bytes extracted via ``extract_bytes_sync`` yield at least 20 characters."""
    docx_bytes = Path("docx/fake.docx").read_bytes()
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    extraction = extract_bytes_sync(docx_bytes, docx_mime, None)
    assert len(extraction.content) >= 20  # noqa: S101
def test_format_hwpx_standalone() -> None:
    """HWPX bytes extracted via ``extract_bytes_sync`` contain the expected greeting."""
    hwpx_bytes = Path("hwpx/simple.hwpx").read_bytes()
    extraction = extract_bytes_sync(hwpx_bytes, "application/haansofthwpx", None)
    assert len(extraction.content) >= 20  # noqa: S101
    assert "Hello from HWPX" in extraction.content  # noqa: S101
def test_format_pdf_text() -> None:
    """PDF bytes extracted via ``extract_bytes_sync`` contain known memo text."""
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    extraction = extract_bytes_sync(pdf_bytes, "application/pdf", None)
    body = extraction.content
    assert len(body) >= 50  # noqa: S101
    assert "Mallori" in body or "May" in body  # noqa: S101
def test_format_pptx() -> None:
    """Smoke test: PPTX extraction through ``extract_file_sync`` does not raise."""
    pptx_mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    _ = extract_file_sync("pptx/simple.pptx", pptx_mime, None)
def test_format_xlsx() -> None:
    """Smoke test: XLSX extraction through ``extract_file_sync`` does not raise."""
    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    _ = extract_file_sync("xlsx/stanley_cups.xlsx", xlsx_mime, None)

54
e2e/python/tests/test_mime_utilities.py generated Normal file
View File

@@ -0,0 +1,54 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: mime_utilities."""
from pathlib import Path
import pytest # noqa: F401
from kreuzberg import detect_mime_type_from_bytes, get_extensions_for_mime
def _alef_e2e_text(value: object) -> str:
return "" if value is None else str(value)
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of ``item`` and a fixed set of its attributes.

    The tuple holds, in order: ``item`` itself, then its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    attribute is treated as ``None``), each passed through ``_alef_e2e_text``,
    and finally the space-joined ``str()`` of every entry of ``item.items``
    when that attribute is a ``list`` (an empty string otherwise).
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined = " ".join(map(str, members))
    else:
        joined = ""
    attr_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attr_names),
        joined,
    )
def test_mime_detect_bytes() -> None:
    """Check that the MIME type detected from pdf/fake_memo.pdf bytes contains 'pdf'."""
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    detected = detect_mime_type_from_bytes(pdf_bytes)
    assert "pdf" in detected  # noqa: S101
def test_mime_detect_image() -> None:
    """Check that the MIME type detected from images/test_hello_world.png bytes contains 'png'."""
    png_bytes = Path("images/test_hello_world.png").read_bytes()
    detected = detect_mime_type_from_bytes(png_bytes)
    assert "png" in detected  # noqa: S101
def test_mime_get_extensions() -> None:
    """Check that 'pdf' is found in the result of get_extensions_for_mime for application/pdf."""
    extensions = get_extensions_for_mime("application/pdf")
    assert "pdf" in extensions  # noqa: S101

View File

@@ -0,0 +1,46 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: ocr_backend_management."""
import pytest # noqa: F401
from kreuzberg import clear_ocr_backends, list_ocr_backends, unregister_ocr_backend
def _alef_e2e_text(value: object) -> str:
return "" if value is None else str(value)
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of ``item`` and a fixed set of its attributes.

    The tuple holds, in order: ``item`` itself, then its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    attribute is treated as ``None``), each passed through ``_alef_e2e_text``,
    and finally the space-joined ``str()`` of every entry of ``item.items``
    when that attribute is a ``list`` (an empty string otherwise).
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined = " ".join(map(str, members))
    else:
        joined = ""
    attr_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attr_names),
        joined,
    )
def test_ocr_backends_clear() -> None:
    """Call clear_ocr_backends and require only that it completes without raising."""
    # The return value is discarded; no follow-up listing is performed here.
    clear_ocr_backends()
def test_ocr_backends_list() -> None:
    """Call list_ocr_backends and require only that it completes without raising."""
    # The returned listing is intentionally not inspected.
    list_ocr_backends()
def test_ocr_backends_unregister() -> None:
    """Unregister an OCR backend name that is not expected to exist; the call must not raise."""
    unregister_ocr_backend("nonexistent-backend-xyz")

47
e2e/python/tests/test_pdf.py generated Normal file
View File

@@ -0,0 +1,47 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: pdf."""
from pathlib import Path
import pytest # noqa: F401
from kreuzberg import render_pdf_page_to_png
def _alef_e2e_text(value: object) -> str:
return "" if value is None else str(value)
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of ``item`` and a fixed set of its attributes.

    The tuple holds, in order: ``item`` itself, then its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    attribute is treated as ``None``), each passed through ``_alef_e2e_text``,
    and finally the space-joined ``str()`` of every entry of ``item.items``
    when that attribute is a ``list`` (an empty string otherwise).
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined = " ".join(map(str, members))
    else:
        joined = ""
    attr_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attr_names),
        joined,
    )
def test_render_pdf_page_first() -> None:
    """Render page index 0 of pdf/fake_memo.pdf and check the size of the output."""
    document = Path("pdf/fake_memo.pdf").read_bytes()
    rendered = render_pdf_page_to_png(document, 0)
    # len() of the render result must be at least 100.
    assert len(rendered) >= 100  # noqa: S101
def test_render_pdf_page_out_of_range() -> None:
    """render_pdf_page_to_png: page out of range.

    Reads pdf/fake_memo.pdf and requests page index 999, expecting the render
    call itself to raise.
    """
    # Load the fixture outside pytest.raises: a missing or unreadable file must
    # fail the test rather than be mistaken for the expected rendering error.
    pdf_bytes = Path("pdf/fake_memo.pdf").read_bytes()
    page_index = 999
    with pytest.raises(Exception):  # noqa: B017
        render_pdf_page_to_png(pdf_bytes, page_index)

176
e2e/python/tests/test_plugin_api.py generated Normal file
View File

@@ -0,0 +1,176 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: plugin_api."""
import pytest # noqa: F401
from kreuzberg import register_document_extractor, register_embedding_backend, register_ocr_backend, register_post_processor, register_renderer, register_validator, unregister_document_extractor, unregister_embedding_backend, unregister_post_processor, unregister_renderer, unregister_validator, unregister_ocr_backend
def _alef_e2e_text(value: object) -> str:
return "" if value is None else str(value)
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of ``item`` and a fixed set of its attributes.

    The tuple holds, in order: ``item`` itself, then its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    attribute is treated as ``None``), each passed through ``_alef_e2e_text``,
    and finally the space-joined ``str()`` of every entry of ``item.items``
    when that attribute is a ``list`` (an empty string otherwise).
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined = " ".join(map(str, members))
    else:
        joined = ""
    attr_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attr_names),
        joined,
    )
def test_register_document_extractor_trait_bridge() -> None:
    """Register a minimal stub document extractor, then unregister it by name."""

    class _TestStub_register_document_extractor_trait_bridge:
        # Minimal stub: each method returns a trivial placeholder value.
        def name(self):
            return "test-extractor"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        async def extract_bytes(self, _p0, _p1, _p2):
            return {}

        def supported_mime_types(self):
            return []

    stub = _TestStub_register_document_extractor_trait_bridge()
    register_document_extractor(stub)
    # Remove the stub again under the name it reports.
    unregister_document_extractor("test-extractor")
def test_register_embedding_backend_trait_bridge() -> None:
    """Register a minimal stub embedding backend, then unregister it by name."""

    class _TestStub_register_embedding_backend_trait_bridge:
        # Minimal stub: each method returns a trivial placeholder value.
        def name(self):
            return "test-embedding-backend"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        def dimensions(self):
            return 1

        async def embed(self, _p0):
            return []

    stub = _TestStub_register_embedding_backend_trait_bridge()
    register_embedding_backend(stub)
    # Remove the stub again under the name it reports.
    unregister_embedding_backend("test-embedding-backend")
def test_register_ocr_backend_trait_bridge() -> None:
    """Register a minimal stub OCR backend, then unregister it by name."""

    class _TestStub_register_ocr_backend_trait_bridge:
        # Minimal stub: each method returns a trivial placeholder value.
        def name(self):
            return "test-backend"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        async def process_image(self, _p0, _p1):
            return {}

        def supports_language(self, _p0):
            return False

        def backend_type(self):
            return {}

    stub = _TestStub_register_ocr_backend_trait_bridge()
    register_ocr_backend(stub)
    # Remove the stub again under the name it reports.
    unregister_ocr_backend("test-backend")
def test_register_post_processor_trait_bridge() -> None:
    """Register a minimal stub post-processor, then unregister it by name."""

    class _TestStub_register_post_processor_trait_bridge:
        # Minimal stub: each method returns a trivial placeholder value.
        def name(self):
            return "test-processor"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        async def process(self, _p0, _p1):
            return None

        def processing_stage(self):
            return {}

    stub = _TestStub_register_post_processor_trait_bridge()
    register_post_processor(stub)
    # Remove the stub again under the name it reports.
    unregister_post_processor("test-processor")
def test_register_renderer_trait_bridge() -> None:
    """Register a minimal stub renderer, then unregister it by name."""

    class _TestStub_register_renderer_trait_bridge:
        # Minimal stub: each method returns a trivial placeholder value.
        def name(self):
            return "test-renderer"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        def render(self, _p0):
            return ""

    stub = _TestStub_register_renderer_trait_bridge()
    register_renderer(stub)
    # Remove the stub again under the name it reports.
    unregister_renderer("test-renderer")
def test_register_validator_trait_bridge() -> None:
    """Register a minimal stub validator, then unregister it by name."""

    class _TestStub_register_validator_trait_bridge:
        # Minimal stub: each method returns a trivial placeholder value.
        def name(self):
            return "test-validator"

        def initialize(self):
            return None

        def shutdown(self):
            return None

        async def validate(self, _p0, _p1):
            return None

    stub = _TestStub_register_validator_trait_bridge()
    register_validator(stub)
    # Remove the stub again under the name it reports.
    unregister_validator("test-validator")
def test_unregister_document_extractor_after_register() -> None:
    """Call unregister_document_extractor for 'test-extractor'; nothing is registered in this test."""
    unregister_document_extractor("test-extractor")
def test_unregister_embedding_backend_after_register() -> None:
    """Call unregister_embedding_backend for 'test-embedding-backend'; nothing is registered in this test."""
    unregister_embedding_backend("test-embedding-backend")
def test_unregister_post_processor_after_register() -> None:
    """Call unregister_post_processor for 'test-processor'; nothing is registered in this test."""
    unregister_post_processor("test-processor")
def test_unregister_renderer_after_register() -> None:
    """Call unregister_renderer for 'test-renderer'; nothing is registered in this test."""
    unregister_renderer("test-renderer")
def test_unregister_validator_after_register() -> None:
    """Call unregister_validator for 'test-validator'; nothing is registered in this test."""
    unregister_validator("test-validator")

View File

@@ -0,0 +1,39 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: post_processor_management."""
import pytest # noqa: F401
from kreuzberg import clear_post_processors, list_post_processors
def _alef_e2e_text(value: object) -> str:
return "" if value is None else str(value)
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of ``item`` and a fixed set of its attributes.

    The tuple holds, in order: ``item`` itself, then its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    attribute is treated as ``None``), each passed through ``_alef_e2e_text``,
    and finally the space-joined ``str()`` of every entry of ``item.items``
    when that attribute is a ``list`` (an empty string otherwise).
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined = " ".join(map(str, members))
    else:
        joined = ""
    attr_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attr_names),
        joined,
    )
def test_post_processors_clear() -> None:
    """Call clear_post_processors and require only that it completes without raising."""
    # The return value is discarded; no follow-up listing is performed here.
    clear_post_processors()
def test_post_processors_list() -> None:
    """Call list_post_processors and require only that it completes without raising."""
    # The returned listing is intentionally not inspected.
    list_post_processors()

59
e2e/python/tests/test_registry.py generated Normal file
View File

@@ -0,0 +1,59 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: registry."""
import pytest # noqa: F401
from kreuzberg import list_document_extractors, list_embedding_backends, list_ocr_backends, list_post_processors, list_renderers, list_validators
def _alef_e2e_text(value: object) -> str:
return "" if value is None else str(value)
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of ``item`` and a fixed set of its attributes.

    The tuple holds, in order: ``item`` itself, then its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    attribute is treated as ``None``), each passed through ``_alef_e2e_text``,
    and finally the space-joined ``str()`` of every entry of ``item.items``
    when that attribute is a ``list`` (an empty string otherwise).
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined = " ".join(map(str, members))
    else:
        joined = ""
    attr_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attr_names),
        joined,
    )
def test_list_document_extractors() -> None:
    """Call list_document_extractors and require only that it does not raise."""
    # The returned listing is intentionally not inspected.
    list_document_extractors()
def test_list_embedding_backends() -> None:
    """Call list_embedding_backends and require only that it does not raise."""
    # The returned listing is intentionally not inspected.
    list_embedding_backends()
def test_list_ocr_backends() -> None:
    """Call list_ocr_backends and require only that it does not raise."""
    # The returned listing is intentionally not inspected.
    list_ocr_backends()
def test_list_post_processors() -> None:
    """Call list_post_processors and require only that it does not raise."""
    # The returned listing is intentionally not inspected.
    list_post_processors()
def test_list_renderers() -> None:
    """Call list_renderers and require only that it does not raise."""
    # The returned listing is intentionally not inspected.
    list_renderers()
def test_list_validators() -> None:
    """Call list_validators and require only that it does not raise."""
    # The returned listing is intentionally not inspected.
    list_validators()

View File

@@ -0,0 +1,50 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: registry_operations."""
import pytest # noqa: F401
from kreuzberg import get_extensions_for_mime
def _alef_e2e_text(value: object) -> str:
return "" if value is None else str(value)
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of ``item`` and a fixed set of its attributes.

    The tuple holds, in order: ``item`` itself, then its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    attribute is treated as ``None``), each passed through ``_alef_e2e_text``,
    and finally the space-joined ``str()`` of every entry of ``item.items``
    when that attribute is a ``list`` (an empty string otherwise).
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined = " ".join(map(str, members))
    else:
        joined = ""
    attr_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attr_names),
        joined,
    )
def test_extensions_docx() -> None:
    """Call get_extensions_for_mime with the DOCX MIME type; the result is not inspected."""
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    get_extensions_for_mime(docx_mime)
def test_extensions_html() -> None:
    """Call get_extensions_for_mime with text/html; the result is not inspected."""
    get_extensions_for_mime("text/html")
def test_extensions_pdf() -> None:
    """Call get_extensions_for_mime with application/pdf; the result is not inspected."""
    get_extensions_for_mime("application/pdf")

View File

@@ -0,0 +1,39 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: renderer_management."""
import pytest # noqa: F401
from kreuzberg import clear_renderers, list_renderers
def _alef_e2e_text(value: object) -> str:
return "" if value is None else str(value)
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of ``item`` and a fixed set of its attributes.

    The tuple holds, in order: ``item`` itself, then its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    attribute is treated as ``None``), each passed through ``_alef_e2e_text``,
    and finally the space-joined ``str()`` of every entry of ``item.items``
    when that attribute is a ``list`` (an empty string otherwise).
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined = " ".join(map(str, members))
    else:
        joined = ""
    attr_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attr_names),
        joined,
    )
def test_renderers_clear() -> None:
    """Call clear_renderers and require only that it completes without raising."""
    # The return value is discarded; no follow-up listing is performed here.
    clear_renderers()
def test_renderers_list() -> None:
    """Call list_renderers and require only that it completes without raising."""
    # The returned listing is intentionally not inspected.
    list_renderers()

148
e2e/python/tests/test_smoke.py generated Normal file
View File

@@ -0,0 +1,148 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: smoke."""
from pathlib import Path
import pytest # noqa: F401
from kreuzberg import extract_bytes, extract_file, ExtractionConfig
def _alef_e2e_text(value: object) -> str:
return "" if value is None else str(value)
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of ``item`` and a fixed set of its attributes.

    The tuple holds, in order: ``item`` itself, then its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    attribute is treated as ``None``), each passed through ``_alef_e2e_text``,
    and finally the space-joined ``str()`` of every entry of ``item.items``
    when that attribute is a ``list`` (an empty string otherwise).
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined = " ".join(map(str, members))
    else:
        joined = ""
    attr_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attr_names),
        joined,
    )
@pytest.mark.asyncio
async def test_ocr_image_png() -> None:
    """Extract images/test_hello_world.png from bytes with a default ExtractionConfig.

    Checks the reported MIME type, that the content is non-empty, and that an
    expected word appears in it. Fixture description: OCR on a PNG image; in
    WASM this exercises the Uint8Array bridge parameter and Promise await in
    the generated OcrBackend bridge.
    """
    image_bytes = Path("images/test_hello_world.png").read_bytes()
    extracted = await extract_bytes(image_bytes, "image/png", ExtractionConfig())
    assert extracted.mime_type.strip() == "image/png"  # noqa: S101
    text = extracted.content
    assert len(text) >= 1  # noqa: S101
    # Either capitalisation of either word is accepted.
    assert any(word in text for word in ("Hello", "World", "hello", "world"))  # noqa: S101
@pytest.mark.asyncio
async def test_smoke_docx_basic() -> None:
    """Extract docx/fake.docx with extract_file and check MIME type, length and expected words."""
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    extracted = await extract_file("docx/fake.docx", docx_mime, ExtractionConfig())
    assert extracted.mime_type.strip() == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"  # noqa: S101
    text = extracted.content
    assert len(text) >= 20  # noqa: S101
    # Finding any single marker is sufficient.
    assert any(marker in text for marker in ("Lorem", "ipsum", "document", "text"))  # noqa: S101
@pytest.mark.asyncio
async def test_smoke_html_basic() -> None:
    """Extract html/simple_table.html with extract_file and check MIME type, length and expected words."""
    extracted = await extract_file("html/simple_table.html", "text/html", ExtractionConfig())
    assert extracted.mime_type.strip() == "text/html"  # noqa: S101
    text = extracted.content
    assert len(text) >= 10  # noqa: S101
    # Finding any single marker is sufficient.
    markers = ("Sample Data Table", "Laptop", "Electronics", "Product")
    assert any(marker in text for marker in markers)  # noqa: S101
@pytest.mark.asyncio
async def test_smoke_image_png() -> None:
    """Extract images/sample.png with disable_ocr=True and no MIME hint; check the reported MIME type."""
    settings = ExtractionConfig(disable_ocr=True)
    # None is passed as the MIME type argument; only the reported type is asserted.
    extracted = await extract_file("images/sample.png", None, settings)
    assert extracted.mime_type.strip() == "image/png"  # noqa: S101
@pytest.mark.asyncio
async def test_smoke_json_basic() -> None:
    """Extract json/simple.json with extract_file and check MIME type and content length."""
    extracted = await extract_file("json/simple.json", "application/json", ExtractionConfig())
    assert extracted.mime_type.strip() == "application/json"  # noqa: S101
    assert len(extracted.content) >= 5  # noqa: S101
@pytest.mark.asyncio
async def test_smoke_pdf_basic() -> None:
    """Extract pdf/fake_memo.pdf with extract_file and check MIME type, length and expected phrases."""
    extracted = await extract_file("pdf/fake_memo.pdf", "application/pdf", ExtractionConfig())
    assert extracted.mime_type.strip() == "application/pdf"  # noqa: S101
    text = extracted.content
    assert len(text) >= 50  # noqa: S101
    # Finding either phrase is sufficient.
    assert any(phrase in text for phrase in ("May 5, 2023", "To Whom it May Concern"))  # noqa: S101
@pytest.mark.asyncio
async def test_smoke_txt_basic() -> None:
    """Extract text/report.txt with extract_file and check MIME type and content length."""
    extracted = await extract_file("text/report.txt", "text/plain", ExtractionConfig())
    assert extracted.mime_type.strip() == "text/plain"  # noqa: S101
    assert len(extracted.content) >= 5  # noqa: S101
@pytest.mark.asyncio
async def test_smoke_xlsx_basic() -> None:
    """Extract xlsx/stanley_cups.xlsx with extract_file and check MIME type, length and cell text."""
    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    extracted = await extract_file("xlsx/stanley_cups.xlsx", xlsx_mime, ExtractionConfig())
    assert extracted.mime_type.strip() == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"  # noqa: S101
    assert len(extracted.content) >= 100  # noqa: S101
    # Every listed string must appear in the content; they are checked in this order.
    required = (
        "Team",
        "Location",
        "Stanley Cups",
        "Blues",
        "Flyers",
        "Maple Leafs",
        "STL",
        "PHI",
        "TOR",
    )
    for expected in required:
        assert expected in extracted.content  # noqa: S101
    # skipped: field 'tables' not available on result type
    # skipped: field 'metadata.format.excel.sheet_count' not available on result type
    # skipped: field 'metadata.format.excel.sheet_names' not available on result type

View File

@@ -0,0 +1,39 @@
# This file is auto-generated by alef — DO NOT EDIT.
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
# To regenerate: alef generate
# To verify freshness: alef verify --exit-code
# Issues & docs: https://github.com/kreuzberg-dev/alef
"""E2e tests for category: validator_management."""
import pytest # noqa: F401
from kreuzberg import clear_validators, list_validators
def _alef_e2e_text(value: object) -> str:
return "" if value is None else str(value)
def _alef_e2e_item_texts(item: object) -> tuple[str, ...]:
    """Collect the text forms of ``item`` and a fixed set of its attributes.

    The tuple holds, in order: ``item`` itself, then its ``kind``, ``name``,
    ``source``, ``alias``, ``text`` and ``signature`` attributes (a missing
    attribute is treated as ``None``), each passed through ``_alef_e2e_text``,
    and finally the space-joined ``str()`` of every entry of ``item.items``
    when that attribute is a ``list`` (an empty string otherwise).
    """
    members = getattr(item, "items", None)
    if isinstance(members, list):
        joined = " ".join(map(str, members))
    else:
        joined = ""
    attr_names = ("kind", "name", "source", "alias", "text", "signature")
    return (
        _alef_e2e_text(item),
        *(_alef_e2e_text(getattr(item, attr, None)) for attr in attr_names),
        joined,
    )
def test_validators_clear() -> None:
    """Call clear_validators and require only that it completes without raising."""
    # The return value is discarded; no follow-up listing is performed here.
    clear_validators()
def test_validators_list() -> None:
    """Call list_validators and require only that it completes without raising."""
    # The returned listing is intentionally not inspected.
    list_validators()