Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
{
"id": "api_batch_bytes_async",
"description": "Tests async batch bytes extraction API (batch_extract_bytes)",
"tags": ["contract", "api", "batch"],
"call": "extract_file",
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
},
"input": {
"path": "pdf/fake_memo.pdf"
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": ["May 5, 2023", "Mallori"]
}
]
}

View File

@@ -0,0 +1,33 @@
{
"id": "api_batch_bytes_with_configs_async",
"description": "Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)",
"tags": ["contract", "api", "batch", "file_config"],
"call": "extract_file",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"output_format": "markdown"
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "equals",
"field": "metadata.output_format",
"value": "markdown"
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
}
}

View File

@@ -0,0 +1,30 @@
{
"id": "api_batch_file_async",
"description": "Tests async batch file extraction API (batch_extract_file)",
"tags": ["contract", "api", "batch"],
"call": "extract_file",
"input": {
"path": "pdf/fake_memo.pdf"
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": ["May 5, 2023", "Mallori"]
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
}
}

View File

@@ -0,0 +1,33 @@
{
"id": "api_batch_file_with_configs_async",
"description": "Tests async batch file extraction with per-file configs (batch_extract_file with file_configs parameter)",
"tags": ["contract", "api", "batch", "file_config"],
"call": "extract_file",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"output_format": "markdown"
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "equals",
"field": "metadata.output_format",
"value": "markdown"
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
}
}

View File

@@ -0,0 +1,30 @@
{
"id": "api_extract_bytes_async",
"description": "Tests async bytes extraction API (extract_bytes)",
"tags": ["contract", "api"],
"call": "extract_file",
"input": {
"path": "pdf/fake_memo.pdf"
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": ["May 5, 2023", "Mallori"]
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
}
}

View File

@@ -0,0 +1,29 @@
{
"id": "api_extract_file_async",
"description": "Tests async file extraction API (extract_file)",
"tags": ["contract", "api"],
"input": {
"path": "pdf/fake_memo.pdf"
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": ["May 5, 2023", "Mallori"]
}
],
"skip": {
"languages": ["wasm"],
"reason": "WASM uses synchronous-only API; async extraction is not available on the wasm target"
}
}

View File

@@ -0,0 +1,52 @@
{
"id": "config_chunking_prepend_heading_context",
"description": "Tests markdown chunker prepends heading hierarchy to chunk content",
"tags": [
"contract",
"config",
"chunking",
"heading-context"
],
"call": "extract_file_sync",
"input": {
"path": "markdown/extraction_test.md",
"config": {
"chunking": {
"chunker_type": "markdown",
"max_chars": 300,
"max_overlap": 50,
"prepend_heading_context": true
}
}
},
"assertions": [
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "count_min",
"field": "chunks",
"value": 2
},
{
"type": "is_true",
"field": "chunks_have_content"
},
{
"type": "is_true",
"field": "chunks_have_heading_context"
},
{
"type": "is_true",
"field": "first_chunk_starts_with_heading"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,37 @@
{
"id": "config_document_structure_with_headings",
"description": "Tests document structure with DOCX heading-driven nesting",
"tags": [
"contract",
"document_structure"
],
"call": "extract_file_sync",
"input": {
"path": "docx/fake.docx",
"config": {
"include_document_structure": true
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
},
{
"type": "not_empty",
"field": "document"
},
{
"type": "count_min",
"field": "document.nodes",
"value": 1
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,36 @@
{
"id": "config_element_types",
"description": "Tests element-based result format with element type assertions on DOCX",
"tags": [
"contract",
"config",
"result_format"
],
"call": "extract_file_sync",
"input": {
"path": "docx/unit_test_headers.docx",
"config": {
"result_format": "element_based"
}
},
"assertions": [
{
"type": "contains_any",
"field": "mime_type",
"values": [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
]
},
{
"type": "count_min",
"field": "elements",
"value": 1
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,32 @@
{
"id": "config_embedding_plugin",
"description": "Tests EmbeddingModelType::Plugin variant deserialization in ChunkingConfig — config accepts the plugin variant shape; actual dispatch requires a host-language backend registered via register_embedding_backend at runtime",
"tags": ["contract", "config", "embeddings", "plugin"],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"chunking": {
"max_chars": 500,
"max_overlap": 50,
"embedding": {
"model": {
"type": "plugin",
"name": "test-plugin-backend"
},
"normalize": true,
"max_embed_duration_secs": 30
}
}
}
},
"assertions": [
{
"type": "not_error"
}
],
"skip": {
"languages": ["python", "rust", "node", "go", "ruby", "elixir", "wasm", "java", "csharp", "php", "r", "dart", "kotlin_android", "swift", "zig"],
"reason": "EmbeddingModelType::Plugin requires a host-language backend registered via register_embedding_backend before dispatch; the e2e harness cannot register one. This fixture validates config round-trip (the {\"type\":\"plugin\",\"name\":...} shape is accepted by every binding's EmbeddingConfig)."
}
}

View File

@@ -0,0 +1,34 @@
{
"id": "config_extraction_timeout",
"description": "Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions",
"tags": [
"contract",
"config",
"timeout"
],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"extraction_timeout_secs": 300
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,46 @@
{
"id": "config_keywords",
"description": "Tests keyword extraction via YAKE algorithm",
"tags": [
"contract",
"config",
"keywords"
],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"keywords": {
"algorithm": "yake",
"max_keywords": 10
}
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "not_empty",
"field": "keywords"
},
{
"type": "count_min",
"field": "keywords",
"value": 1
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,52 @@
{
"id": "config_llm_structured_extraction",
"description": "Tests structured extraction via liter-llm with JSON schema",
"tags": ["contract", "config", "liter-llm", "structured-extraction"],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"structured_extraction": {
"schema": {
"type": "object",
"properties": {
"title": {
"type": "string"
},
"date": {
"type": "string"
},
"summary": {
"type": "string"
}
},
"required": ["title"]
},
"schema_name": "memo_data",
"llm": {
"model": "openai/gpt-4o"
}
}
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "not_empty",
"field": "structured_output"
}
],
"skip": {
"languages": ["python", "rust", "node", "go", "ruby", "elixir", "wasm", "java", "csharp", "php", "r", "dart", "kotlin_android", "swift", "zig"],
"reason": "Requires liter-llm feature and KREUZBERG_LLM_API_KEY env var; runtime-only skip"
}
}

View File

@@ -0,0 +1,43 @@
{
"id": "config_pages",
"description": "Tests page extraction and page marker configuration",
"tags": [
"contract",
"config"
],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"pages": {
"extract_pages": true,
"insert_page_markers": true
}
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "contains_any",
"field": "content",
"values": [
"PAGE"
]
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,48 @@
{
"id": "config_quality_enabled",
"description": "Tests quality scoring produces a score value in [0.0, 1.0]",
"tags": [
"contract",
"config",
"quality"
],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"enable_quality_processing": true
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "not_empty",
"field": "quality_score"
},
{
"type": "greater_than_or_equal",
"field": "quality_score",
"value": 0.0
},
{
"type": "less_than_or_equal",
"field": "quality_score",
"value": 1.0
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,41 @@
{
"id": "config_security_limits",
"description": "Tests archive extraction with custom security limits",
"tags": [
"contract",
"config",
"security"
],
"call": "extract_file_sync",
"input": {
"path": "archives/documents.zip",
"config": {
"security_limits": {
"max_archive_size": 104857600,
"max_compression_ratio": 50,
"max_files_in_archive": 100
}
}
},
"assertions": [
{
"type": "contains_any",
"field": "mime_type",
"values": [
"application/zip",
"application/x-zip-compressed"
]
},
{
"type": "min_length",
"field": "content",
"value": 10
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,51 @@
{
"id": "config_tree_sitter",
"description": "Tests tree-sitter configuration round-trip",
"tags": [
"contract",
"config",
"tree-sitter"
],
"call": "extract_file_sync",
"input": {
"path": "code/hello.py",
"config": {
"tree_sitter": {
"languages": [
"python",
"rust"
],
"groups": [
"web"
],
"process": {
"structure": true,
"imports": true,
"exports": true,
"comments": false,
"docstrings": false,
"symbols": false,
"diagnostics": false
}
}
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "text/x-source-code"
},
{
"type": "min_length",
"field": "content",
"value": 5
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}

View File

@@ -0,0 +1,30 @@
{
"id": "output_format_bytes_markdown",
"description": "Tests markdown output format via bytes extraction API",
"tags": ["contract", "output_format", "bytes"],
"call": "extract_bytes_sync",
"input": {
"data": "pdf/fake_memo.pdf",
"mime_type": "application/pdf",
"config": {
"output_format": "markdown"
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "equals",
"field": "metadata.output_format",
"value": "markdown"
}
]
}

View File

@@ -0,0 +1,38 @@
{
"id": "output_format_markdown",
"description": "Tests Markdown output format",
"tags": [
"contract",
"output_format"
],
"call": "extract_file_sync",
"input": {
"path": "pdf/fake_memo.pdf",
"config": {
"output_format": "markdown"
}
},
"assertions": [
{
"type": "equals",
"field": "mime_type",
"value": "application/pdf"
},
{
"type": "min_length",
"field": "content",
"value": 10
},
{
"type": "equals",
"field": "metadata.output_format",
"value": "markdown"
}
],
"skip": {
"languages": [
"wasm"
],
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
}
}