This commit is contained in:
16
tools/benchmark-harness/fixtures/xlsx/excel_multi_sheet.json
Normal file
16
tools/benchmark-harness/fixtures/xlsx/excel_multi_sheet.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/xlsx/excel_multi_sheet.xlsx",
|
||||
"file_type": "xlsx",
|
||||
"file_size": 6166,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "xlsx test: excel_multi_sheet",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/xlsx/excel_multi_sheet.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsx/excel_multi_sheet.txt"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/xlsx/stanley_cups.json
Normal file
16
tools/benchmark-harness/fixtures/xlsx/stanley_cups.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/xlsx/stanley_cups.xlsx",
|
||||
"file_type": "xlsx",
|
||||
"file_size": 6339,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "xlsx test: stanley_cups",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/xlsx/stanley_cups.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsx/stanley_cups.txt"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/xlsx/test.json
Normal file
16
tools/benchmark-harness/fixtures/xlsx/test.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/xlsx/test.xlsx",
|
||||
"file_type": "xlsx",
|
||||
"file_size": 11562,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsx/test.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/xlsx/test.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/xlsx/test_01.json
Normal file
16
tools/benchmark-harness/fixtures/xlsx/test_01.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/xlsx/test_01.xlsx",
|
||||
"file_type": "xlsx",
|
||||
"file_size": 170052,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "xlsx test: test_01",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/xlsx/test_01.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsx/test_01.txt"
|
||||
}
|
||||
}
|
||||
BIN
tools/benchmark-harness/fixtures/xlsx/test_01.xlsm
Normal file
BIN
tools/benchmark-harness/fixtures/xlsx/test_01.xlsm
Normal file
Binary file not shown.
BIN
tools/benchmark-harness/fixtures/xlsx/test_xla.xla
Normal file
BIN
tools/benchmark-harness/fixtures/xlsx/test_xla.xla
Normal file
Binary file not shown.
BIN
tools/benchmark-harness/fixtures/xlsx/test_xlam.xlam
Normal file
BIN
tools/benchmark-harness/fixtures/xlsx/test_xlam.xlam
Normal file
Binary file not shown.
BIN
tools/benchmark-harness/fixtures/xlsx/test_xlsb.xlsb
Normal file
BIN
tools/benchmark-harness/fixtures/xlsx/test_xlsb.xlsb
Normal file
Binary file not shown.
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"document": "../../../../test_documents/xlsx/test_01.xlsm",
|
||||
"file_type": "xlsm",
|
||||
"file_size": 162186,
|
||||
"expected_frameworks": ["kreuzberg", "tika"],
|
||||
"metadata": {
|
||||
"description": "Excel macro-enabled workbook (.xlsm format) - converted from test_01.xlsx",
|
||||
"category": "structured",
|
||||
"size_category": "medium",
|
||||
"excel_variant": "macro-enabled",
|
||||
"notes": "XLSM files support VBA macros and advanced Excel features"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsm/xlsm_macro_enabled.txt",
|
||||
"source": "openpyxl"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/xlsx/xlsx_01.json
Normal file
16
tools/benchmark-harness/fixtures/xlsx/xlsx_01.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/xlsx/xlsx_01.xlsx",
|
||||
"file_type": "xlsx",
|
||||
"file_size": 170934,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsx/xlsx_01.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/xlsx/xlsx_01.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/xlsx/xlsx_03_chartsheet.xlsx",
|
||||
"file_type": "xlsx",
|
||||
"file_size": 10491,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsx/xlsx_03_chartsheet.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/xlsx/xlsx_03_chartsheet.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/xlsx/xlsx_04_inflated.json
Normal file
16
tools/benchmark-harness/fixtures/xlsx/xlsx_04_inflated.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/xlsx/xlsx_04_inflated.xlsx",
|
||||
"file_type": "xlsx",
|
||||
"file_size": 171916,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsx/xlsx_04_inflated.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/xlsx/xlsx_04_inflated.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/xlsx/xlsx_05_table_with_title.xlsx",
|
||||
"file_type": "xlsx",
|
||||
"file_size": 6335,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsx/xlsx_05_table_with_title.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/xlsx/xlsx_05_table_with_title.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/xlsx/xlsx_06_edge_cases_.xlsx",
|
||||
"file_type": "xlsx",
|
||||
"file_size": 9504,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsx/xlsx_06_edge_cases_.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/xlsx/xlsx_06_edge_cases_.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/xlsx/xlsx_07_gap_tolerance_.xlsx",
|
||||
"file_type": "xlsx",
|
||||
"file_size": 16217,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/xlsx/xlsx_07_gap_tolerance_.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/xlsx/xlsx_07_gap_tolerance_.md"
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user