This commit is contained in:
32
fixtures/smoke/docx_basic.json
Normal file
32
fixtures/smoke/docx_basic.json
Normal file
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"id": "smoke_docx_basic",
|
||||
"category": "smoke",
|
||||
"description": "Smoke test: DOCX with formatted text",
|
||||
"tags": ["smoke", "office", "docx"],
|
||||
"input": {
|
||||
"path": "docx/fake.docx",
|
||||
"mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"config": {}
|
||||
},
|
||||
"assertions": [
|
||||
{
|
||||
"type": "equals",
|
||||
"field": "mime_type",
|
||||
"value": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
},
|
||||
{
|
||||
"type": "min_length",
|
||||
"field": "content",
|
||||
"value": 20
|
||||
},
|
||||
{
|
||||
"type": "contains_any",
|
||||
"field": "content",
|
||||
"values": ["Lorem", "ipsum", "document", "text"]
|
||||
}
|
||||
],
|
||||
"skip": {
|
||||
"languages": ["wasm"],
|
||||
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
|
||||
}
|
||||
}
|
||||
32
fixtures/smoke/html_basic.json
Normal file
32
fixtures/smoke/html_basic.json
Normal file
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"id": "smoke_html_basic",
|
||||
"category": "smoke",
|
||||
"description": "Smoke test: HTML table extraction",
|
||||
"tags": ["smoke", "html"],
|
||||
"input": {
|
||||
"path": "html/simple_table.html",
|
||||
"mime_type": "text/html",
|
||||
"config": {}
|
||||
},
|
||||
"assertions": [
|
||||
{
|
||||
"type": "equals",
|
||||
"field": "mime_type",
|
||||
"value": "text/html"
|
||||
},
|
||||
{
|
||||
"type": "min_length",
|
||||
"field": "content",
|
||||
"value": 10
|
||||
},
|
||||
{
|
||||
"type": "contains_any",
|
||||
"field": "content",
|
||||
"values": ["Sample Data Table", "Laptop", "Electronics", "Product"]
|
||||
}
|
||||
],
|
||||
"skip": {
|
||||
"languages": ["wasm"],
|
||||
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
|
||||
}
|
||||
}
|
||||
24
fixtures/smoke/image_png.json
Normal file
24
fixtures/smoke/image_png.json
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"id": "smoke_image_png",
|
||||
"category": "smoke",
|
||||
"description": "Smoke test: PNG image (without OCR, metadata only)",
|
||||
"tags": ["smoke", "image", "png"],
|
||||
"input": {
|
||||
"path": "images/sample.png",
|
||||
"mime_type": "image/png",
|
||||
"config": {
|
||||
"disable_ocr": true
|
||||
}
|
||||
},
|
||||
"assertions": [
|
||||
{
|
||||
"type": "equals",
|
||||
"field": "mime_type",
|
||||
"value": "image/png"
|
||||
}
|
||||
],
|
||||
"skip": {
|
||||
"languages": ["wasm"],
|
||||
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
|
||||
}
|
||||
}
|
||||
27
fixtures/smoke/json_basic.json
Normal file
27
fixtures/smoke/json_basic.json
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"id": "smoke_json_basic",
|
||||
"category": "smoke",
|
||||
"description": "Smoke test: JSON file extraction",
|
||||
"tags": ["smoke", "structured", "json"],
|
||||
"input": {
|
||||
"path": "json/simple.json",
|
||||
"mime_type": "application/json",
|
||||
"config": {}
|
||||
},
|
||||
"assertions": [
|
||||
{
|
||||
"type": "equals",
|
||||
"field": "mime_type",
|
||||
"value": "application/json"
|
||||
},
|
||||
{
|
||||
"type": "min_length",
|
||||
"field": "content",
|
||||
"value": 5
|
||||
}
|
||||
],
|
||||
"skip": {
|
||||
"languages": ["wasm"],
|
||||
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
|
||||
}
|
||||
}
|
||||
29
fixtures/smoke/ocr_image_png.json
Normal file
29
fixtures/smoke/ocr_image_png.json
Normal file
@@ -0,0 +1,29 @@
|
||||
{
|
||||
"id": "ocr_image_png",
|
||||
"category": "smoke",
|
||||
"description": "OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.",
|
||||
"tags": ["smoke", "ocr", "image", "png"],
|
||||
"call": "extract_bytes",
|
||||
"input": {
|
||||
"data": "images/test_hello_world.png",
|
||||
"mime_type": "image/png",
|
||||
"config": {}
|
||||
},
|
||||
"assertions": [
|
||||
{
|
||||
"type": "equals",
|
||||
"field": "mime_type",
|
||||
"value": "image/png"
|
||||
},
|
||||
{
|
||||
"type": "min_length",
|
||||
"field": "content",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"type": "contains_any",
|
||||
"field": "content",
|
||||
"values": ["Hello", "World", "hello", "world"]
|
||||
}
|
||||
]
|
||||
}
|
||||
32
fixtures/smoke/pdf_basic.json
Normal file
32
fixtures/smoke/pdf_basic.json
Normal file
@@ -0,0 +1,32 @@
|
||||
{
|
||||
"id": "smoke_pdf_basic",
|
||||
"category": "smoke",
|
||||
"description": "Smoke test: PDF with simple text extraction",
|
||||
"tags": ["smoke", "pdf", "basic"],
|
||||
"input": {
|
||||
"path": "pdf/fake_memo.pdf",
|
||||
"mime_type": "application/pdf",
|
||||
"config": {}
|
||||
},
|
||||
"assertions": [
|
||||
{
|
||||
"type": "equals",
|
||||
"field": "mime_type",
|
||||
"value": "application/pdf"
|
||||
},
|
||||
{
|
||||
"type": "min_length",
|
||||
"field": "content",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"type": "contains_any",
|
||||
"field": "content",
|
||||
"values": ["May 5, 2023", "To Whom it May Concern"]
|
||||
}
|
||||
],
|
||||
"skip": {
|
||||
"languages": ["wasm"],
|
||||
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
|
||||
}
|
||||
}
|
||||
27
fixtures/smoke/txt_basic.json
Normal file
27
fixtures/smoke/txt_basic.json
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"id": "smoke_txt_basic",
|
||||
"category": "smoke",
|
||||
"description": "Smoke test: Plain text file",
|
||||
"tags": ["smoke", "text", "plaintext"],
|
||||
"input": {
|
||||
"path": "text/report.txt",
|
||||
"mime_type": "text/plain",
|
||||
"config": {}
|
||||
},
|
||||
"assertions": [
|
||||
{
|
||||
"type": "equals",
|
||||
"field": "mime_type",
|
||||
"value": "text/plain"
|
||||
},
|
||||
{
|
||||
"type": "min_length",
|
||||
"field": "content",
|
||||
"value": 5
|
||||
}
|
||||
],
|
||||
"skip": {
|
||||
"languages": ["wasm"],
|
||||
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
|
||||
}
|
||||
}
|
||||
57
fixtures/smoke/xlsx_basic.json
Normal file
57
fixtures/smoke/xlsx_basic.json
Normal file
@@ -0,0 +1,57 @@
|
||||
{
|
||||
"id": "smoke_xlsx_basic",
|
||||
"category": "smoke",
|
||||
"description": "Smoke test: XLSX with basic spreadsheet data including tables",
|
||||
"tags": ["smoke", "office", "xlsx", "tables"],
|
||||
"input": {
|
||||
"path": "xlsx/stanley_cups.xlsx",
|
||||
"mime_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
"config": {}
|
||||
},
|
||||
"assertions": [
|
||||
{
|
||||
"type": "equals",
|
||||
"field": "mime_type",
|
||||
"value": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
},
|
||||
{
|
||||
"type": "min_length",
|
||||
"field": "content",
|
||||
"value": 100
|
||||
},
|
||||
{
|
||||
"type": "contains_all",
|
||||
"field": "content",
|
||||
"values": [
|
||||
"Team",
|
||||
"Location",
|
||||
"Stanley Cups",
|
||||
"Blues",
|
||||
"Flyers",
|
||||
"Maple Leafs",
|
||||
"STL",
|
||||
"PHI",
|
||||
"TOR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "count_min",
|
||||
"field": "tables",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"type": "greater_than_or_equal",
|
||||
"field": "metadata.format.excel.sheet_count",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"type": "contains_all",
|
||||
"field": "metadata.format.excel.sheet_names",
|
||||
"values": ["Stanley Cups"]
|
||||
}
|
||||
],
|
||||
"skip": {
|
||||
"languages": ["wasm"],
|
||||
"reason": "WASM cannot access filesystem; use extractBytes with file content instead"
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user