Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
{
"id": "async_extract_bytes",
"category": "async",
"description": "Async extract_bytes call on PDF document",
"tags": ["async", "api", "extract_bytes"],
"call": "extract_bytes",
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
},
"input": {
"data": "pdf/fake_memo.pdf",
"mime_type": "application/pdf"
},
"assertions": [
{
"type": "not_error"
},
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 50
}
]
}

View File

@@ -0,0 +1,9 @@
{
"id": "async_extract_bytes_empty_mime",
"category": "async",
"description": "extract_bytes empty MIME async",
"tags": ["async", "error"],
"call": "extract_bytes",
"input": { "data": "text/plain.txt", "mime_type": "", "config": {} },
"assertions": [{ "type": "error" }]
}

View File

@@ -0,0 +1,9 @@
{
"id": "async_extract_bytes_invalid_mime",
"category": "async",
"description": "extract_bytes unsupported MIME async",
"tags": ["async", "error"],
"call": "extract_bytes",
"input": { "data": "text/plain.txt", "mime_type": "application/x-nonexistent", "config": {} },
"assertions": [{ "type": "error" }]
}

View File

@@ -0,0 +1,11 @@
{
"id": "batch_bytes_invalid_mime",
"category": "batch",
"description": "batch_extract_bytes_sync invalid MIME",
"tags": ["batch", "error"],
"call": "batch_extract_bytes_sync",
"input": {
"items": [{ "content": [72, 101, 108, 108, 111], "mime_type": "application/x-nonexistent" }]
},
"assertions": [{ "type": "not_error" }]
}

View File

@@ -0,0 +1,30 @@
{
"id": "batch_extract_bytes_happy",
"category": "batch",
"description": "batch_extract_bytes: happy path with mixed inputs",
"call": "batch_extract_bytes",
"input": {
"items": [
{
"content": [72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33],
"mime_type": "text/plain"
},
{
"content": [
60, 104, 116, 109, 108, 62, 60, 98, 111, 100, 121, 62, 84, 101, 115, 116, 60, 47, 98, 111,
100, 121, 62, 60, 47, 104, 116, 109, 108, 62
],
"mime_type": "text/html"
}
]
},
"assertions": [
{
"type": "not_error"
},
{
"type": "count_min",
"value": 1
}
]
}

View File

@@ -0,0 +1,19 @@
{
"id": "batch_extract_bytes_mixed_format",
"category": "batch",
"description": "batch_extract_bytes: handles unsupported MIME gracefully",
"call": "batch_extract_bytes",
"input": {
"items": [
{
"content": [80, 68, 70, 32, 112, 108, 97, 99, 101, 104, 111, 108, 100, 101, 114],
"mime_type": "application/x-unknown"
}
]
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,51 @@
{
"id": "batch_extract_bytes_size_cap",
"category": "batch",
"description": "batch_extract_bytes: archive size cap triggers error",
"call": "batch_extract_bytes",
"skip": {
"languages": [
"rust",
"node",
"python",
"php",
"wasm",
"go",
"r",
"ruby",
"csharp",
"elixir",
"kotlin",
"kotlin_android",
"swift",
"zig",
"java",
"dart"
],
"reason": "SecurityLimits.max_content_size is only enforced by archive/Excel extractors; test requires actual archive format to trigger error, which is not easily testable via byte fixtures"
},
"input": {
"items": [
{
"content": [
97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97,
97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97,
97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97,
97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97,
97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 97
],
"mime_type": "text/plain"
}
],
"config": {
"security_limits": {
"max_content_size": 1
}
}
},
"assertions": [
{
"type": "error"
}
]
}

View File

@@ -0,0 +1,18 @@
{
"id": "batch_extract_bytes_sync_empty_list",
"category": "batch",
"description": "batch_extract_bytes_sync: empty batch",
"call": "batch_extract_bytes_sync",
"input": {
"items": []
},
"assertions": [
{
"type": "not_error"
},
{
"type": "count_equals",
"value": 0
}
]
}

View File

@@ -0,0 +1,19 @@
{
"id": "batch_extract_bytes_sync_invalid_mime",
"category": "batch",
"description": "batch_extract_bytes_sync: unsupported MIME",
"call": "batch_extract_bytes_sync",
"input": {
"items": [
{
"content": [100, 97, 116, 97],
"mime_type": "application/x-unknown"
}
]
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,33 @@
{
"id": "batch_file_async_basic",
"category": "batch",
"description": "Extract text from multiple files asynchronously",
"tags": [
"batch",
"async",
"concurrent",
"multiple_files"
],
"call": "batch_extract_files",
"input": {
"paths": [
{
"path": "pdf/fake_memo.pdf"
},
{
"path": "text/fake_text.txt"
}
]
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,28 @@
{
"id": "batch_file_async_not_found",
"category": "batch",
"description": "batch_extract_files async: nonexistent path",
"tags": [
"batch",
"async"
],
"call": "batch_extract_files",
"input": {
"paths": [
{
"path": "/nonexistent/a.pdf"
}
]
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,31 @@
{
"id": "batch_file_not_found",
"category": "batch",
"description": "batch_extract_files_sync: nonexistent paths",
"tags": [
"batch",
"error"
],
"call": "batch_extract_files_sync",
"input": {
"paths": [
{
"path": "/nonexistent/a.pdf"
},
{
"path": "/nonexistent/b.txt"
}
]
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,30 @@
{
"id": "batch_file_partial",
"category": "batch",
"description": "batch_extract_files_sync: mix of existing and missing paths",
"tags": [
"batch"
],
"call": "batch_extract_files_sync",
"input": {
"paths": [
{
"path": "text/plain.txt"
},
{
"path": "/nonexistent/missing.pdf"
}
]
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,32 @@
{
"id": "batch_file_sync_basic",
"category": "batch",
"description": "Extract text from multiple files synchronously",
"tags": [
"batch",
"sync",
"multiple_files"
],
"call": "batch_extract_files_sync",
"input": {
"paths": [
{
"path": "pdf/fake_memo.pdf"
},
{
"path": "text/fake_text.txt"
}
]
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,41 @@
{
"id": "code_shebang_detection",
"category": "code",
"description": "Test language detection from shebang line via file input",
"tags": [
"code",
"shebang",
"tree-sitter"
],
"call": "extract_file_sync",
"input": {
"path": "code/script.sh",
"mime_type": "text/x-source-code"
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "text/x-source-code"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_all",
"field": "content",
"values": [
"build",
"clean"
]
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,30 @@
{
"id": "api_batch_bytes_async",
"description": "Tests async batch bytes extraction API (batch_extract_bytes)",
"tags": ["contract", "api", "batch"],
"call": "extract_file",
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
},
"input": {
"path": "pdf/fake_memo.pdf"
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": ["May 5, 2023", "Mallori"]
}
]
}

View File

@@ -0,0 +1,33 @@
{
"id": "api_batch_bytes_with_configs_async",
"description": "Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)",
"tags": ["contract", "api", "batch", "file_config"],
"call": "extract_file",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"output_format": "markdown"
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "equals",
"field": "metadata.output_format",
"value": "markdown"
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
}
}

View File

@@ -0,0 +1,30 @@
{
"id": "api_batch_file_async",
"description": "Tests async batch file extraction API (batch_extract_file)",
"tags": ["contract", "api", "batch"],
"call": "extract_file",
"input": {
"path": "pdf/fake_memo.pdf"
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": ["May 5, 2023", "Mallori"]
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
}
}

View File

@@ -0,0 +1,33 @@
{
"id": "api_batch_file_with_configs_async",
"description": "Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)",
"tags": ["contract", "api", "batch", "file_config"],
"call": "extract_file",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"output_format": "markdown"
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "equals",
"field": "metadata.output_format",
"value": "markdown"
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
}
}

View File

@@ -0,0 +1,30 @@
{
"id": "api_extract_bytes_async",
"description": "Tests async bytes extraction API (extract_bytes)",
"tags": ["contract", "api"],
"call": "extract_file",
"input": {
"path": "pdf/fake_memo.pdf"
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": ["May 5, 2023", "Mallori"]
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
}
}

View File

@@ -0,0 +1,29 @@
{
"id": "api_extract_file_async",
"description": "Tests async file extraction API (extract_file)",
"tags": ["contract", "api"],
"input": {
"path": "pdf/fake_memo.pdf"
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": ["May 5, 2023", "Mallori"]
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
}
}

View File

@@ -0,0 +1,52 @@
{
"id": "config_chunking_prepend_heading_context",
"description": "Tests markdown chunker prepends heading hierarchy to chunk content",
"tags": [
"contract",
"config",
"chunking",
"heading-context"
],
"call": "extract_file_sync",
"input": {
"path": "markdown/extraction_test.md",
"config": {
"chunking": {
"chunker_type": "markdown",
"max_chars": 300,
"max_overlap": 50,
"prepend_heading_context": true
}
}
},
"assertions": [
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "count_min",
"field": "chunks",
"value": 2
},
{
"type": "is_true",
"field": "chunks_have_content"
},
{
"type": "is_true",
"field": "chunks_have_heading_context"
},
{
"type": "is_true",
"field": "first_chunk_starts_with_heading"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,37 @@
{
"id": "config_document_structure_with_headings",
"description": "Tests document structure with DOCX heading-driven nesting",
"tags": [
"contract",
"document_structure"
],
"call": "extract_file_sync",
"input": {
"path": "docx/fake.docx",
"config": {
"include_document_structure": true
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
},
{
"type": "not_empty",
"field": "document"
},
{
"type": "count_min",
"field": "document.nodes",
"value": 1
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,36 @@
{
"id": "config_element_types",
"description": "Tests element-based result format with element type assertions on DOCX",
"tags": [
"contract",
"config",
"result_format"
],
"call": "extract_file_sync",
"input": {
"path": "docx/unit_test_headers.docx",
"config": {
"result_format": "element_based"
}
},
"assertions": [
{
"type": "contains_any",
"field": "mime_type",
"values": [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
]
},
{
"type": "count_min",
"field": "elements",
"value": 1
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,32 @@
{
"id": "config_embedding_plugin",
"description": "Tests EmbeddingModelType::Plugin variant deserialization in ChunkingConfig — config accepts the plugin variant shape; actual dispatch requires a host-language backend registered via register_embedding_backend at runtime",
"tags": ["contract", "config", "embeddings", "plugin"],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"chunking": {
"max_chars": 500,
"max_overlap": 50,
"embedding": {
"model": {
"type": "plugin",
"name": "test-plugin-backend"
},
"normalize": true,
"max_embed_duration_secs": 30
}
}
}
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": ["python", "rust", "node", "go", "ruby", "elixir", "wasm", "java", "csharp", "php", "r", "dart", "kotlin_android", "swift", "zig"],
"reason": "EmbeddingModelType::Plugin requires a host-language backend registered via register_embedding_backend before dispatch; the e2e harness cannot register one. This fixture validates config round-trip (the {\"type\":\"plugin\",\"name\":...} shape is accepted by every binding's EmbeddingConfig)."
}
}

View File

@@ -0,0 +1,34 @@
{
"id": "config_extraction_timeout",
"description": "Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions",
"tags": [
"contract",
"config",
"timeout"
],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"extraction_timeout_secs": 300
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,46 @@
{
"id": "config_keywords",
"description": "Tests keyword extraction via YAKE algorithm",
"tags": [
"contract",
"config",
"keywords"
],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"keywords": {
"algorithm": "yake",
"max_keywords": 10
}
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "not_empty",
"field": "keywords"
},
{
"type": "count_min",
"field": "keywords",
"value": 1
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,52 @@
{
"id": "config_llm_structured_extraction",
"description": "Tests structured extraction via liter-llm with JSON schema",
"tags": ["contract", "config", "liter-llm", "structured-extraction"],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"structured_extraction": {
"schema": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"date": {
"type": "string"
},
"summary": {
"type": "string"
}
},
"required": ["title"]
},
"schema_name": "memo_data",
"llm": {
"model": "openai/gpt-4o"
}
}
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "not_empty",
"field": "structured_output"
}
],
"skip": {
"languages": ["python", "rust", "node", "go", "ruby", "elixir", "wasm", "java", "csharp", "php", "r", "dart", "kotlin_android", "swift", "zig"],
"reason": "Requires liter-llm feature and KREUZBERG_LLM_API_KEY env var; runtime-only skip"
}
}

View File

@@ -0,0 +1,43 @@
{
"id": "config_pages",
"description": "Tests page extraction and page marker configuration",
"tags": [
"contract",
"config"
],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"pages": {
"extract_pages": true,
"insert_page_markers": true
}
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": [
"PAGE"
]
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,48 @@
{
"id": "config_quality_enabled",
"description": "Tests quality scoring produces a score value in [0.0, 1.0]",
"tags": [
"contract",
"config",
"quality"
],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"enable_quality_processing": true
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "not_empty",
"field": "quality_score"
},
{
"type": "greater_than_or_equal",
"field": "quality_score",
"value": 0.0
},
{
"type": "less_than_or_equal",
"field": "quality_score",
"value": 1.0
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,41 @@
{
"id": "config_security_limits",
"description": "Tests archive extraction with custom security limits",
"tags": [
"contract",
"config",
"security"
],
"call": "extract_file_sync",
"input": {
"path": "archives/documents.zip",
"config": {
"security_limits": {
"max_archive_size": 104857600,
"max_compression_ratio": 50,
"max_files_in_archive": 100
}
}
},
"assertions": [
{
"type": "contains_any",
"field": "mime_type",
"values": [
"application/zip",
"application/x-zip-compressed"
]
},
{
"type": "min_length",
"field": "content",
"value": 10
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,51 @@
{
"id": "config_tree_sitter",
"description": "Tests tree-sitter configuration round-trip",
"tags": [
"contract",
"config",
"tree-sitter"
],
"call": "extract_file_sync",
"input": {
"path": "code/hello.py",
"config": {
"tree_sitter": {
"languages": [
"python",
"rust"
],
"groups": [
"web"
],
"process": {
"structure": true,
"imports": true,
"exports": true,
"comments": false,
"docstrings": false,
"symbols": false,
"diagnostics": false
}
}
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "text/x-source-code"
},
{
"type": "min_length",
"field": "content",
"value": 5
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,30 @@
{
"id": "output_format_bytes_markdown",
"description": "Tests markdown output format via bytes extraction API",
"tags": ["contract", "output_format", "bytes"],
"call": "extract_bytes_sync",
"input": {
"data": "pdf/fake_memo.pdf",
"mime_type": "application/pdf",
"config": {
"output_format": "markdown"
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "equals",
"field": "metadata.output_format",
"value": "markdown"
}
]
}

View File

@@ -0,0 +1,38 @@
{
"id": "output_format_markdown",
"description": "Tests Markdown output format",
"tags": [
"contract",
"output_format"
],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"output_format": "markdown"
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "equals",
"field": "metadata.output_format",
"value": "markdown"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,11 @@
{
"id": "detect_mime_bytes_html",
"category": "detection",
"description": "Detect HTML MIME from bytes",
"tags": ["mime_detection", "bytes"],
"call": "detect_mime_type_from_bytes",
"input": {
"data": "html/html.html"
},
"assertions": [{ "type": "not_error" }]
}

View File

@@ -0,0 +1,15 @@
{
"id": "detect_mime_bytes_pdf",
"category": "detection",
"description": "Detect PDF MIME type from bytes",
"tags": ["mime_detection", "bytes", "pdf"],
"call": "detect_mime_type_from_bytes",
"input": {
"data": "pdf/fake_memo.pdf"
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,15 @@
{
"id": "detect_mime_bytes_png",
"category": "detection",
"description": "Detect PNG MIME type from bytes",
"tags": ["mime_detection", "bytes", "png"],
"call": "detect_mime_type_from_bytes",
"input": {
"data": "images/test_hello_world.png"
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,11 @@
{
"id": "get_extensions_unknown_mime",
"category": "detection",
"description": "get_extensions unknown MIME",
"tags": ["mime_detection", "error"],
"call": "get_extensions_for_mime",
"input": {
"mime_type": "application/x-totally-unknown"
},
"assertions": [{ "type": "error" }]
}

View File

@@ -0,0 +1,19 @@
{
"id": "embed_texts_async_empty_input",
"category": "embed_async_pending",
"description": "embed_texts_async: empty text list",
"call": "embed_texts_async",
"input": {
"texts": []
},
"assertions": [
{
"type": "not_error"
},
{
"type": "count_equals",
"field": "embeddings",
"value": 0
}
]
}

View File

@@ -0,0 +1,22 @@
{
"id": "embed_texts_async_happy",
"category": "embed_async_pending",
"description": "embed_texts_async: basic async embedding",
"call": "embed_texts_async",
"input": {
"texts": [
"First",
"Second"
]
},
"assertions": [
{
"type": "not_error"
},
{
"type": "count_min",
"field": "embeddings",
"value": 2
}
]
}

View File

@@ -0,0 +1,22 @@
{
"id": "embed_texts_async_preset_switch",
"category": "embed_async_pending",
"description": "embed_texts_async: preset override",
"call": "embed_texts_async",
"input": {
"texts": [
"Text"
],
"config": {
"model": {
"type": "preset",
"name": "balanced"
}
}
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,21 @@
{
"id": "embed_texts_batch",
"category": "embed_extra",
"description": "Batch embed texts",
"tags": ["embedding", "batch"],
"call": "embed_texts",
"input": {
"texts": ["Hello", "World"],
"config": {
"model": {
"type": "preset",
"name": "balanced"
}
}
},
"assertions": [{ "type": "not_error" }],
"skip": {
"languages": ["wasm"],
"reason": "embeddings feature depends on ONNX Runtime which is not available on the WASM target"
}
}

View File

@@ -0,0 +1,28 @@
{
"id": "embed_texts_different_preset",
"category": "embeddings",
"description": "embed_texts: multilingual preset",
"call": "embed_texts",
"input": {
"texts": ["Hello world", "Test"],
"config": {
"model": {
"type": "preset",
"name": "multilingual"
}
}
},
"assertions": [
{
"type": "not_error"
},
{
"type": "count_min",
"field": "embeddings",
"value": 2
}
],
"skip": {
"languages": ["wasm"]
}
}

View File

@@ -0,0 +1,17 @@
{
"id": "get_embedding_preset_known",
"category": "embeddings",
"description": "get_embedding_preset: known preset",
"call": "get_embedding_preset",
"input": {
"preset_name": "balanced"
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": ["wasm"]
}
}

View File

@@ -0,0 +1,17 @@
{
"id": "get_embedding_preset_nominal",
"category": "embeddings",
"description": "get_embedding_preset: nominal case",
"call": "get_embedding_preset",
"input": {
"preset_name": "balanced"
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": ["wasm"]
}
}

View File

@@ -0,0 +1,17 @@
{
"id": "get_embedding_preset_unknown",
"category": "embeddings",
"description": "get_embedding_preset: unknown preset yields an empty result",
"call": "get_embedding_preset",
"input": {
"preset_name": "nonexistent-xyz"
},
"assertions": [
{
"type": "is_empty"
}
],
"skip": {
"languages": ["wasm"]
}
}

View File

@@ -0,0 +1,15 @@
{
"id": "list_embedding_presets_sanity",
"category": "embeddings",
"description": "list_embedding_presets: returns at least one",
"call": "list_embedding_presets",
"input": {},
"assertions": [
{
"type": "not_empty"
}
],
"skip": {
"languages": ["wasm"]
}
}

View File

@@ -0,0 +1,17 @@
{
"id": "error_empty_bytes",
"category": "error",
"description": "Graceful handling of empty bytes (should not error)",
"tags": ["error", "input", "edge-case"],
"call": "extract_bytes_sync",
"input": {
"data": "text/empty.txt",
"mime_type": "text/plain",
"config": {}
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "error_empty_mime",
"category": "error",
"description": "Error when extracting with empty MIME type",
"tags": ["error", "input", "mime"],
"call": "extract_bytes_sync",
"input": {
"data": "text/plain.txt",
"mime_type": "",
"config": {}
},
"assertions": [
{
"type": "error"
}
]
}

View File

@@ -0,0 +1,13 @@
{
"id": "error_extract_bytes_conflicting_ocr",
"category": "error",
"description": "extract_bytes force+disable OCR",
"tags": ["error", "validation"],
"call": "extract_bytes_sync",
"input": {
"data": "text/fake_text.txt",
"mime_type": "text/plain",
"config": { "force_ocr": true, "disable_ocr": true }
},
"assertions": [{ "type": "error" }]
}

View File

@@ -0,0 +1,17 @@
{
"id": "error_invalid_mime_format",
"category": "error",
"description": "Error when extracting with invalid MIME type format",
"tags": ["error", "input", "mime"],
"call": "extract_bytes_sync",
"input": {
"data": "text/plain.txt",
"mime_type": "not-a-mime",
"config": {}
},
"assertions": [
{
"type": "error"
}
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "error_unsupported_mime",
"category": "error",
"description": "Error when extracting with unsupported MIME type",
"tags": ["error", "input", "mime"],
"call": "extract_bytes_sync",
"input": {
"data": "text/plain.txt",
"mime_type": "application/x-nonexistent",
"config": {}
},
"assertions": [
{
"type": "error"
}
]
}

View File

@@ -0,0 +1,21 @@
{
"id": "format_docx_standalone",
"category": "format_specific",
"description": "Standalone DOCX extraction using extract_bytes_sync",
"tags": ["format_specific", "docx", "text_extraction"],
"call": "extract_bytes_sync",
"input": {
"data": "docx/fake.docx",
"mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
},
"assertions": [
{
"type": "not_error"
},
{
"type": "min_length",
"field": "content",
"value": 20
}
]
}

View File

@@ -0,0 +1,26 @@
{
"id": "format_hwpx_standalone",
"category": "format_specific",
"description": "Standalone HWPX extraction using extract_bytes_sync",
"tags": ["format_specific", "hwpx", "text_extraction"],
"call": "extract_bytes_sync",
"input": {
"data": "hwpx/simple.hwpx",
"mime_type": "application/haansofthwpx"
},
"assertions": [
{
"type": "not_error"
},
{
"type": "min_length",
"field": "content",
"value": 20
},
{
"type": "contains",
"field": "content",
"value": "Hello from HWPX"
}
]
}

View File

@@ -0,0 +1,26 @@
{
"id": "format_pdf_text",
"category": "format_specific",
"description": "Standalone PDF text extraction using extract_bytes_sync",
"tags": ["format_specific", "pdf", "text_extraction"],
"call": "extract_bytes_sync",
"input": {
"data": "pdf/fake_memo.pdf",
"mime_type": "application/pdf"
},
"assertions": [
{
"type": "not_error"
},
{
"type": "min_length",
"field": "content",
"value": 50
},
{
"type": "contains_any",
"field": "content",
"values": ["Mallori", "May"]
}
]
}

View File

@@ -0,0 +1,26 @@
{
"id": "format_pptx",
"category": "format_specific",
"description": "PPTX presentation extraction using extract_file_sync",
"tags": [
"format_specific",
"pptx",
"text_extraction"
],
"call": "extract_file_sync",
"input": {
"path": "pptx/simple.pptx",
"mime_type": "application/vnd.openxmlformats-officedocument.presentationml.presentation"
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,26 @@
{
"id": "format_xlsx",
"category": "format_specific",
"description": "XLSX spreadsheet extraction using extract_file_sync",
"tags": [
"format_specific",
"xlsx",
"text_extraction"
],
"call": "extract_file_sync",
"input": {
"path": "xlsx/stanley_cups.xlsx",
"mime_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 911 B

View File

@@ -0,0 +1,21 @@
{
"id": "render_pdf_page_first",
"category": "pdf",
"description": "render_pdf_page_to_png: first page",
"call": "render_pdf_page_to_png",
"input": {
"pdf_bytes": "pdf/fake_memo.pdf",
"page_index": 0,
"dpi": null,
"password": null
},
"assertions": [
{
"type": "not_error"
},
{
"type": "min_length",
"value": 100
}
]
}

View File

@@ -0,0 +1,36 @@
{
"id": "render_pdf_page_missing_file",
"category": "pdf",
"description": "render_pdf_page_to_png: missing file",
"call": "render_pdf_page_to_png",
"skip": {
"languages": [
"python",
"node",
"ruby",
"php",
"ffi",
"go",
"java",
"csharp",
"elixir",
"wasm",
"r",
"dart",
"kotlin_android",
"swift",
"zig",
"rust"
],
"reason": "render_pdf_page_to_png takes pre-loaded pdf_bytes; the harness materializes file contents at generation time, so a runtime missing-file error path is not expressible via this fixture shape"
},
"input": {
"pdf_path": "nonexistent/file.pdf",
"page_index": 0
},
"assertions": [
{
"type": "error"
}
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "render_pdf_page_out_of_range",
"category": "pdf",
"description": "render_pdf_page_to_png: page out of range",
"call": "render_pdf_page_to_png",
"input": {
"pdf_bytes": "pdf/fake_memo.pdf",
"page_index": 999,
"dpi": null,
"password": null
},
"assertions": [
{
"type": "error"
}
]
}

View File

@@ -0,0 +1,279 @@
# Plugin API Test Fixtures
This directory contains fixtures for generating E2E tests for plugin/config/utility APIs across all language bindings.
## Purpose
Unlike the document extraction fixtures (in the parent `fixtures/` directory), these fixtures test:
- Plugin management APIs (validators, post-processors, OCR backends, document extractors)
- Configuration loading APIs (`from_file`, `discover`)
- MIME utility APIs (`detect_mime_type`, `get_extensions_for_mime`, etc.)
## Schema
See `schema.json` for the complete JSON schema definition.
## Fixture Structure
Each fixture is a JSON file defining:
- **id**: Unique identifier (e.g., `validators_list`)
- **api_category**: Category of API (`validator_management`, `configuration`, `mime_utilities`, etc.)
- **api_function**: Function name being tested (snake_case format)
- **test_spec**: Test specification including:
- **pattern**: Test pattern type (see patterns below)
- **setup**: Optional setup steps (temp files, directories, etc.)
- **function_call**: Function to call with arguments
- **assertions**: Expected behavior and values
- **teardown**: Optional cleanup steps
## Test Patterns
### 1. `simple_list`
Lists items from a registry. No setup required.
**Example**: `validators_list.json`
```json
{
"pattern": "simple_list",
"function_call": { "name": "list_validators", "args": [] },
"assertions": { "return_type": "list", "list_item_type": "string" }
}
```
### 2. `clear_registry`
Clears a registry and verifies it's empty.
**Example**: `validators_clear.json`
```json
{
"pattern": "clear_registry",
"function_call": { "name": "clear_validators", "args": [] },
"assertions": { "return_type": "void", "verify_cleanup": true }
}
```
### 3. `graceful_unregister`
Unregisters a nonexistent item and verifies that the call does not raise an error.
**Example**: `ocr_backends_unregister.json`
```json
{
"pattern": "graceful_unregister",
"function_call": { "name": "unregister_ocr_backend", "args": ["nonexistent-backend-xyz"] },
"assertions": { "does_not_throw": true }
}
```
### 4. `config_from_file`
Creates a temporary TOML file, loads the config from it, and verifies the resulting properties.
**Example**: `config_from_file.json`
```json
{
"pattern": "config_from_file",
"setup": {
"create_temp_file": true,
"temp_file_name": "test_config.toml",
"temp_file_content": "[chunking]\nmax_chars = 100\n"
},
"function_call": {
"name": "from_file",
"is_method": true,
"class_name": "ExtractionConfig",
"args": ["${temp_file_path}"]
},
"assertions": {
"object_properties": [{ "path": "chunking.max_chars", "value": 100 }]
}
}
```
### 5. `config_discover`
Creates a config file in a parent directory, changes into a subdirectory, and discovers the config from there.
**Example**: `config_discover.json`
- Creates `kreuzberg.toml` in temp dir
- Creates subdirectory and changes to it
- Calls `ExtractionConfig.discover()`
- Verifies config was found from parent
### 6. `mime_from_bytes`
Detects MIME type from byte content.
**Example**: `mime_detect_bytes.json`
```json
{
"pattern": "mime_from_bytes",
"setup": { "test_data": "%PDF-1.4\n" },
"function_call": { "name": "detect_mime_type", "args": ["${test_data_bytes}"] },
"assertions": { "string_contains": "pdf" }
}
```
### 7. `mime_from_path`
Creates temp file, detects MIME from path.
**Example**: `mime_detect_path.json`
### 8. `mime_extension_lookup`
Queries extensions for a MIME type.
**Example**: `mime_get_extensions.json`
## Variable Substitution
Fixtures can use variables in `args`:
- `${temp_file_path}` - Path to created temp file
- `${temp_dir_path}` - Path to created temp directory
- `${test_data_bytes}` - Byte data from `setup.test_data`
## Language-Specific Handling
The generator translates fixtures to language-specific code:
### Function Names
- Fixture: `list_validators` (snake_case)
- Python: `list_validators()`
- TypeScript: `listValidators()`
- Ruby: `list_validators`
- Java: `listValidators()`
- Go: `ListValidators()`
### Class Methods
- Fixture: `ExtractionConfig.from_file`
- Python: `ExtractionConfig.from_file()`
- TypeScript: `ExtractionConfig.fromFile()`
- Ruby: `Config::Extraction.from_file`
- Java: `ExtractionConfig.fromFile()`
- Go: `ConfigFromFile()`
### Temp File Handling
- Python: `tmp_path` fixture (pytest)
- TypeScript: `fs.mkdtempSync()` + `fs.rmSync()`
- Ruby: `Dir.mktmpdir { }` block
- Java: `@TempDir` annotation
- Go: `t.TempDir()`
### Assertions
- Python: `assert` statements
- TypeScript: `expect().toBe()` (Vitest)
- Ruby: `expect().to` (RSpec)
- Java: `assertEquals()` (JUnit)
- Go: `if err != nil` checks
## Special Cases
### Go Lazy Initialization
Document extractors in Go are lazily initialized. The fixture `extractors_list.json` includes:
```json
{
"setup": {
"lazy_init_required": {
"languages": ["go"],
"init_action": "extract_file_sync",
"init_data": {
"create_temp_file": true,
"temp_file_name": "test.pdf",
"temp_file_content": "%PDF-1.4\n%EOF\n"
}
}
}
}
```
The generator will produce Go-specific setup code to extract a PDF before listing extractors.
## Fixture Inventory
### Validator Management (2 fixtures)
- `validators_list.json` - List all validators
- `validators_clear.json` - Clear validators
### Post-Processor Management (2 fixtures)
- `post_processors_list.json` - List all post-processors
- `post_processors_clear.json` - Clear post-processors
### OCR Backend Management (3 fixtures)
- `ocr_backends_list.json` - List all OCR backends
- `ocr_backends_unregister.json` - Unregister nonexistent backend
- `ocr_backends_clear.json` - Clear OCR backends
### Document Extractor Management (3 fixtures)
- `extractors_list.json` - List all extractors (with Go lazy init)
- `extractors_unregister.json` - Unregister nonexistent extractor
- `extractors_clear.json` - Clear extractors
### Configuration APIs (2 fixtures)
- `config_from_file.json` - Load config from TOML file
- `config_discover.json` - Discover config from directory tree
### MIME Utilities (3 fixtures)
- `mime_detect_bytes.json` - Detect MIME from bytes
- `mime_detect_path.json` - Detect MIME from file path
- `mime_get_extensions.json` - Get extensions for MIME type
**Total**: 15 fixtures → 75 generated tests (15 per language × 5 languages)
## Regenerating Tests
After modifying fixtures, regenerate tests:
```bash
# Regenerate for all languages
cargo run -p kreuzberg-e2e-generator -- generate --lang python
cargo run -p kreuzberg-e2e-generator -- generate --lang typescript
cargo run -p kreuzberg-e2e-generator -- generate --lang ruby
cargo run -p kreuzberg-e2e-generator -- generate --lang java
cargo run -p kreuzberg-e2e-generator -- generate --lang go
```
Or use the task runner:
```bash
task e2e:generate
```
## Adding New Fixtures
1. Create JSON file following `schema.json`
2. Choose appropriate test pattern
3. Define setup/teardown if needed
4. Specify assertions
5. Regenerate tests
6. Verify tests compile and pass
## Notes
- **DO NOT** write E2E tests by hand
- **ALL** E2E tests must be generated from fixtures
- This is non-negotiable architecture
- Hand-written tests will be rejected by CI

View File

@@ -0,0 +1,17 @@
{
"id": "document_extractors_clear",
"category": "document_extractor_management",
"description": "Clear all document extractors and verify list is empty",
"tags": [
"document_extractor",
"plugin_management",
"clear",
"trait-bridge"
],
"call": "clear_document_extractors",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "embedding_backends_clear",
"category": "embedding_backend_management",
"description": "Clear all embedding backends and verify list is empty",
"tags": [
"embedding",
"plugin_management",
"clear",
"trait-bridge"
],
"call": "clear_embedding_backends",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,12 @@
{
"id": "embedding_backends_list",
"category": "embedding_backend_management",
"description": "List all registered embedding backends",
"tags": ["embedding", "plugin_management", "list"],
"call": "list_embedding_backends",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,25 @@
{
"id": "extractors_list",
"category": "document_extractor_management",
"description": "List all registered document extractors",
"tags": ["extractors", "plugin_management", "list"],
"call": "list_document_extractors",
"input": {
"setup": {
"lazy_init_required": {
"languages": ["go"],
"init_action": "extract_file_sync",
"init_data": {
"create_temp_file": true,
"temp_file_name": "test.pdf",
"temp_file_content": "%PDF-1.4\n%EOF\n"
}
}
}
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "mime_detect_bytes",
"category": "mime_utilities",
"description": "Detect MIME type from file bytes",
"tags": ["mime", "detection", "bytes"],
"call": "detect_mime_type_from_bytes",
"input": {
"data": "pdf/fake_memo.pdf"
},
"assertions": [
{
"type": "contains",
"field": "result",
"value": "pdf"
}
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "mime_detect_image",
"category": "mime_utilities",
"description": "Detect MIME type from PNG image bytes",
"tags": ["mime", "detection", "image", "bytes"],
"call": "detect_mime_type_from_bytes",
"input": {
"data": "images/test_hello_world.png"
},
"assertions": [
{
"type": "contains",
"field": "result",
"value": "png"
}
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "mime_get_extensions",
"category": "mime_utilities",
"description": "Get file extensions for a MIME type",
"tags": ["mime", "extensions", "lookup"],
"call": "get_extensions_for_mime",
"input": {
"mime_type": "application/pdf"
},
"assertions": [
{
"type": "contains",
"field": "result",
"value": "pdf"
}
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "ocr_backends_clear",
"category": "ocr_backend_management",
"description": "Clear all OCR backends and verify list is empty",
"tags": [
"ocr",
"plugin_management",
"clear",
"trait-bridge"
],
"call": "clear_ocr_backends",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,12 @@
{
"id": "ocr_backends_list",
"category": "ocr_backend_management",
"description": "List all registered OCR backends",
"tags": ["ocr", "plugin_management", "list"],
"call": "list_ocr_backends",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,20 @@
{
"id": "ocr_backends_unregister",
"category": "ocr_backend_management",
"description": "Unregister nonexistent OCR backend gracefully",
"tags": [
"ocr",
"plugin_management",
"unregister",
"trait-bridge"
],
"call": "unregister_ocr_backend",
"input": {
"name": "nonexistent-backend-xyz"
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "post_processors_clear",
"category": "post_processor_management",
"description": "Clear all post-processors and verify list is empty",
"tags": [
"post_processors",
"plugin_management",
"clear",
"trait-bridge"
],
"call": "clear_post_processors",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,12 @@
{
"id": "post_processors_list",
"category": "post_processor_management",
"description": "List all registered post-processors",
"tags": ["post_processors", "plugin_management", "list"],
"call": "list_post_processors",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,28 @@
{
"id": "register_document_extractor_trait_bridge",
"category": "plugin_api",
"description": "register_document_extractor: trait bridge",
"tags": [
"trait-bridge"
],
"call": "register_document_extractor",
"input": {
"extractor": {
"type": "test",
"name": "test-extractor"
}
},
"args": [
{
"name": "extractor",
"field": "extractor",
"arg_type": "test_backend",
"trait": "DocumentExtractor"
}
],
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,29 @@
{
"id": "register_embedding_backend_trait_bridge",
"category": "plugin_api",
"description": "register_embedding_backend: trait bridge",
"tags": [
"trait-bridge"
],
"call": "register_embedding_backend",
"input": {
"backend": {
"type": "test",
"name": "test-embedding-backend",
"dimensions": 768
}
},
"args": [
{
"name": "backend",
"field": "backend",
"arg_type": "test_backend",
"trait": "EmbeddingBackend"
}
],
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,28 @@
{
"id": "register_ocr_backend_trait_bridge",
"category": "plugin_api",
"description": "register_ocr_backend: trait bridge",
"tags": [
"trait-bridge"
],
"call": "register_ocr_backend",
"input": {
"backend": {
"type": "test",
"name": "test-backend"
}
},
"args": [
{
"name": "backend",
"field": "backend",
"arg_type": "test_backend",
"trait": "OcrBackend"
}
],
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,28 @@
{
"id": "register_post_processor_trait_bridge",
"category": "plugin_api",
"description": "register_post_processor: trait bridge",
"tags": [
"trait-bridge"
],
"call": "register_post_processor",
"input": {
"processor": {
"type": "test",
"name": "test-processor"
}
},
"args": [
{
"name": "processor",
"field": "processor",
"arg_type": "test_backend",
"trait": "PostProcessor"
}
],
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,28 @@
{
"id": "register_renderer_trait_bridge",
"category": "plugin_api",
"description": "register_renderer: trait bridge",
"tags": [
"trait-bridge"
],
"call": "register_renderer",
"input": {
"renderer": {
"type": "test",
"name": "test-renderer"
}
},
"args": [
{
"name": "renderer",
"field": "renderer",
"arg_type": "test_backend",
"trait": "Renderer"
}
],
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,28 @@
{
"id": "register_validator_trait_bridge",
"category": "plugin_api",
"description": "register_validator: trait bridge",
"tags": [
"trait-bridge"
],
"call": "register_validator",
"input": {
"validator": {
"type": "test",
"name": "test-validator"
}
},
"args": [
{
"name": "validator",
"field": "validator",
"arg_type": "test_backend",
"trait": "Validator"
}
],
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "renderers_clear",
"category": "renderer_management",
"description": "Clear all renderers and verify list is empty",
"tags": [
"renderer",
"plugin_management",
"clear",
"trait-bridge"
],
"call": "clear_renderers",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,12 @@
{
"id": "renderers_list",
"category": "renderer_management",
"description": "List all registered renderers",
"tags": ["renderer", "plugin_management", "list"],
"call": "list_renderers",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,18 @@
{
"id": "unregister_document_extractor_after_register",
"category": "plugin_api",
"description": "unregister_document_extractor",
"call": "unregister_document_extractor",
"input": {
"name": "test-extractor"
},
"assertions": [
{
"type": "not_error"
}
],
"tags": [
"plugin-lifecycle",
"trait-bridge"
]
}

View File

@@ -0,0 +1,18 @@
{
"id": "unregister_embedding_backend_after_register",
"category": "plugin_api",
"description": "unregister_embedding_backend",
"call": "unregister_embedding_backend",
"input": {
"name": "test-embedding-backend"
},
"assertions": [
{
"type": "not_error"
}
],
"tags": [
"plugin-lifecycle",
"trait-bridge"
]
}

View File

@@ -0,0 +1,18 @@
{
"id": "unregister_post_processor_after_register",
"category": "plugin_api",
"description": "unregister_post_processor",
"call": "unregister_post_processor",
"input": {
"name": "test-processor"
},
"assertions": [
{
"type": "not_error"
}
],
"tags": [
"plugin-lifecycle",
"trait-bridge"
]
}

View File

@@ -0,0 +1,18 @@
{
"id": "unregister_renderer_after_register",
"category": "plugin_api",
"description": "unregister_renderer",
"call": "unregister_renderer",
"input": {
"name": "test-renderer"
},
"assertions": [
{
"type": "not_error"
}
],
"tags": [
"plugin-lifecycle",
"trait-bridge"
]
}

View File

@@ -0,0 +1,18 @@
{
"id": "unregister_validator_after_register",
"category": "plugin_api",
"description": "unregister_validator",
"call": "unregister_validator",
"input": {
"name": "test-validator"
},
"assertions": [
{
"type": "not_error"
}
],
"tags": [
"plugin-lifecycle",
"trait-bridge"
]
}

View File

@@ -0,0 +1,17 @@
{
"id": "validators_clear",
"category": "validator_management",
"description": "Clear all validators and verify list is empty",
"tags": [
"validators",
"plugin_management",
"clear",
"trait-bridge"
],
"call": "clear_validators",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,12 @@
{
"id": "validators_list",
"category": "validator_management",
"description": "List all registered validators",
"tags": ["validators", "plugin_management", "list"],
"call": "list_validators",
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,15 @@
{
"id": "extensions_docx",
"category": "registry_operations",
"description": "Get file extensions for DOCX MIME type",
"tags": ["registry", "extensions", "docx"],
"call": "get_extensions_for_mime",
"input": {
"mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,15 @@
{
"id": "extensions_html",
"category": "registry_operations",
"description": "Get file extensions for HTML MIME type",
"tags": ["registry", "extensions", "html"],
"call": "get_extensions_for_mime",
"input": {
"mime_type": "text/html"
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,15 @@
{
"id": "extensions_pdf",
"category": "registry_operations",
"description": "Get file extensions for PDF MIME type",
"tags": ["registry", "extensions", "pdf"],
"call": "get_extensions_for_mime",
"input": {
"mime_type": "application/pdf"
},
"assertions": [
{
"type": "not_error"
}
]
}

View File

@@ -0,0 +1,9 @@
{
"id": "list_document_extractors",
"category": "registry",
"description": "List document extractors",
"tags": ["registry"],
"call": "list_document_extractors",
"input": {},
"assertions": [{ "type": "not_error" }]
}

View File

@@ -0,0 +1,9 @@
{
"id": "list_embedding_backends",
"category": "registry",
"description": "List embedding backends",
"tags": ["registry"],
"call": "list_embedding_backends",
"input": {},
"assertions": [{ "type": "not_error" }]
}

View File

@@ -0,0 +1,9 @@
{
"id": "list_ocr_backends",
"category": "registry",
"description": "List OCR backends",
"tags": ["registry"],
"call": "list_ocr_backends",
"input": {},
"assertions": [{ "type": "not_error" }]
}

View File

@@ -0,0 +1,9 @@
{
"id": "list_post_processors",
"category": "registry",
"description": "List post-processors",
"tags": ["registry"],
"call": "list_post_processors",
"input": {},
"assertions": [{ "type": "not_error" }]
}

View File

@@ -0,0 +1,9 @@
{
"id": "list_renderers",
"category": "registry",
"description": "List renderers",
"tags": ["registry"],
"call": "list_renderers",
"input": {},
"assertions": [{ "type": "not_error" }]
}

View File

@@ -0,0 +1,9 @@
{
"id": "list_validators",
"category": "registry",
"description": "List validators",
"tags": ["registry"],
"call": "list_validators",
"input": {},
"assertions": [{ "type": "not_error" }]
}

View File

@@ -0,0 +1,32 @@
{
"id": "smoke_docx_basic",
"category": "smoke",
"description": "Smoke test: DOCX with formatted text",
"tags": ["smoke", "office", "docx"],
"input": {
"path": "docx/fake.docx",
"mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"config": {}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
},
{
"type": "min_length",
"field": "content",
"value": 20
},
{
"type": "contains_any",
"field": "content",
"values": ["Lorem", "ipsum", "document", "text"]
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,32 @@
{
"id": "smoke_html_basic",
"category": "smoke",
"description": "Smoke test: HTML table extraction",
"tags": ["smoke", "html"],
"input": {
"path": "html/simple_table.html",
"mime_type": "text/html",
"config": {}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "text/html"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": ["Sample Data Table", "Laptop", "Electronics", "Product"]
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

Some files were not shown because too many files have changed in this diff Show More