This commit is contained in:
15
tools/benchmark-harness/fixtures/md/2203.01017v2.json
Normal file
15
tools/benchmark-harness/fixtures/md/2203.01017v2.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/2203.01017v2.md",
|
||||
"file_type": "md",
|
||||
"file_size": 54216,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/2203.01017v2.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/2206.01062.json
Normal file
15
tools/benchmark-harness/fixtures/md/2206.01062.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/2206.01062.md",
|
||||
"file_type": "md",
|
||||
"file_size": 51516,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/2206.01062.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/2305.03393v1-pg9.json
Normal file
15
tools/benchmark-harness/fixtures/md/2305.03393v1-pg9.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/2305.03393v1-pg9.md",
|
||||
"file_type": "md",
|
||||
"file_size": 2830,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/2305.03393v1-pg9.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/2305.03393v1.json
Normal file
15
tools/benchmark-harness/fixtures/md/2305.03393v1.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/2305.03393v1.md",
|
||||
"file_type": "md",
|
||||
"file_size": 32105,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/2305.03393v1.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/amt_handbook_sample.json
Normal file
15
tools/benchmark-harness/fixtures/md/amt_handbook_sample.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/amt_handbook_sample.md",
|
||||
"file_type": "md",
|
||||
"file_size": 3620,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/amt_handbook_sample.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/blocks.md.json
Normal file
15
tools/benchmark-harness/fixtures/md/blocks.md.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/blocks.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 388,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/blocks.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/code_and_formula.json
Normal file
15
tools/benchmark-harness/fixtures/md/code_and_formula.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/code_and_formula.md",
|
||||
"file_type": "md",
|
||||
"file_size": 5538,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/code_and_formula.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/csv-comma-in-cell.csv.md",
|
||||
"file_type": "md",
|
||||
"file_size": 156,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/csv-comma-in-cell.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/csv-comma.csv.json
Normal file
15
tools/benchmark-harness/fixtures/md/csv-comma.csv.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/csv-comma.csv.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1911,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/csv-comma.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/csv-inconsistent-header.csv.md",
|
||||
"file_type": "md",
|
||||
"file_size": 150,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/csv-inconsistent-header.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/csv-pipe.csv.json
Normal file
15
tools/benchmark-harness/fixtures/md/csv-pipe.csv.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/csv-pipe.csv.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1939,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/csv-pipe.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/csv-semicolon.csv.json
Normal file
15
tools/benchmark-harness/fixtures/md/csv-semicolon.csv.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/csv-semicolon.csv.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1904,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/csv-semicolon.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/csv-tab.csv.json
Normal file
15
tools/benchmark-harness/fixtures/md/csv-tab.csv.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/csv-tab.csv.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1854,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/csv-tab.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/csv-too-few-columns.csv.md",
|
||||
"file_type": "md",
|
||||
"file_size": 156,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/csv-too-few-columns.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/csv-too-many-columns.csv.md",
|
||||
"file_type": "md",
|
||||
"file_size": 186,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/csv-too-many-columns.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/deepseek_example.md.json
Normal file
15
tools/benchmark-harness/fixtures/md/deepseek_example.md.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/deepseek_example.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 3039,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/deepseek_example.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/deepseek_simple.md.json
Normal file
15
tools/benchmark-harness/fixtures/md/deepseek_simple.md.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/deepseek_simple.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1342,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/deepseek_simple.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/deepseek_title.md.json
Normal file
15
tools/benchmark-harness/fixtures/md/deepseek_title.md.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/deepseek_title.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 2950,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/deepseek_title.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/docx_grouped_images.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 335,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/docx_grouped_images.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/docx_rich_cells.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 2574,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/docx_rich_cells.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/drawingml.docx.json
Normal file
15
tools/benchmark-harness/fixtures/md/drawingml.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/drawingml.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 47,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/drawingml.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/duck.md.json
Normal file
15
tools/benchmark-harness/fixtures/md/duck.md.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/duck.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1041,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/duck.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/elife-56337.nxml.json
Normal file
15
tools/benchmark-harness/fixtures/md/elife-56337.nxml.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/elife-56337.nxml.md",
|
||||
"file_type": "md",
|
||||
"file_size": 77781,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/elife-56337.nxml.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/ending_with_table.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 462,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/ending_with_table.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/equations.docx.json
Normal file
15
tools/benchmark-harness/fixtures/md/equations.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/equations.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 2267,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/equations.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/escaped_characters.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 729,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/escaped_characters.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/example_01.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/example_01.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/example_01.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 358,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/example_01.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/example_01_images.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 358,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/example_01_images.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/example_02.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/example_02.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/example_02.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 241,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/example_02.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/example_03.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/example_03.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/example_03.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 611,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/example_03.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/example_04.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/example_04.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/example_04.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 484,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/example_04.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/example_05.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/example_05.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/example_05.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 475,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/example_05.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/example_06.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/example_06.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/example_06.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 245,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/example_06.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/example_07.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/example_07.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/example_07.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 202,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/example_07.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/example_08.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/example_08.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/example_08.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1174,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/example_08.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/formatting.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/formatting.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/formatting.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 934,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/formatting.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/html_code_snippets.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1051,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/html_code_snippets.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/html_heading_in_p.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 265,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/html_heading_in_p.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/html_rich_table_cells.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 3000,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/html_rich_table_cells.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/hyperlink_01.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/hyperlink_01.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/hyperlink_01.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 57,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/hyperlink_01.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/hyperlink_02.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/hyperlink_02.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/hyperlink_02.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 22,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/hyperlink_02.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/hyperlink_03.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/hyperlink_03.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/hyperlink_03.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 303,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/hyperlink_03.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/hyperlink_04.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/hyperlink_04.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/hyperlink_04.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 34,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/hyperlink_04.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/hyperlink_05.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/hyperlink_05.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/hyperlink_05.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 152,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/hyperlink_05.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/inline_and_formatting.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 941,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/inline_and_formatting.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/ipa20180000016.json
Normal file
15
tools/benchmark-harness/fixtures/md/ipa20180000016.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/ipa20180000016.md",
|
||||
"file_type": "md",
|
||||
"file_size": 67171,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/ipa20180000016.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/ipa20200022300.json
Normal file
15
tools/benchmark-harness/fixtures/md/ipa20200022300.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/ipa20200022300.md",
|
||||
"file_type": "md",
|
||||
"file_size": 48801,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/ipa20200022300.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/list_after_num_headers.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 164,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/list_after_num_headers.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/lorem_ipsum.docx.json
Normal file
15
tools/benchmark-harness/fixtures/md/lorem_ipsum.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/lorem_ipsum.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 3487,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/lorem_ipsum.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/mixed.md.json
Normal file
15
tools/benchmark-harness/fixtures/md/mixed.md.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/mixed.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 610,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/mixed.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/mixed_without_h1.md.json
Normal file
15
tools/benchmark-harness/fixtures/md/mixed_without_h1.md.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/mixed_without_h1.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 108,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/mixed_without_h1.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/multi_page.json
Normal file
15
tools/benchmark-harness/fixtures/md/multi_page.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/multi_page.md",
|
||||
"file_type": "md",
|
||||
"file_size": 9393,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/multi_page.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/nested.md.json
Normal file
15
tools/benchmark-harness/fixtures/md/nested.md.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/nested.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 477,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/nested.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/pa20010031492.json
Normal file
15
tools/benchmark-harness/fixtures/md/pa20010031492.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/pa20010031492.md",
|
||||
"file_type": "md",
|
||||
"file_size": 26311,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/pa20010031492.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/pftaps057006474.json
Normal file
15
tools/benchmark-harness/fixtures/md/pftaps057006474.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/pftaps057006474.md",
|
||||
"file_type": "md",
|
||||
"file_size": 25649,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/pftaps057006474.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/pg06442728.json
Normal file
15
tools/benchmark-harness/fixtures/md/pg06442728.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/pg06442728.md",
|
||||
"file_type": "md",
|
||||
"file_size": 29728,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/pg06442728.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/picture_classification.md",
|
||||
"file_type": "md",
|
||||
"file_size": 3458,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/picture_classification.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/pntd.0008301.nxml.json
Normal file
15
tools/benchmark-harness/fixtures/md/pntd.0008301.nxml.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/pntd.0008301.nxml.md",
|
||||
"file_type": "md",
|
||||
"file_size": 55251,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/pntd.0008301.nxml.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/pone.0234687.nxml.json
Normal file
15
tools/benchmark-harness/fixtures/md/pone.0234687.nxml.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/pone.0234687.nxml.md",
|
||||
"file_type": "md",
|
||||
"file_size": 69786,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/pone.0234687.nxml.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/powerpoint_bad_text.pptx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 121,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/powerpoint_bad_text.pptx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/powerpoint_issue_2663.pptx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 635,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/powerpoint_issue_2663.pptx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/powerpoint_sample.pptx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1184,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/powerpoint_sample.pptx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/powerpoint_with_image.pptx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 60,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/powerpoint_with_image.pptx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/redp5110_sampled.json
Normal file
15
tools/benchmark-harness/fixtures/md/redp5110_sampled.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/redp5110_sampled.md",
|
||||
"file_type": "md",
|
||||
"file_size": 40112,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/redp5110_sampled.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/right_to_left_01.json
Normal file
15
tools/benchmark-harness/fixtures/md/right_to_left_01.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/right_to_left_01.md",
|
||||
"file_type": "md",
|
||||
"file_size": 2514,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/right_to_left_01.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/right_to_left_02.json
Normal file
15
tools/benchmark-harness/fixtures/md/right_to_left_02.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/right_to_left_02.md",
|
||||
"file_type": "md",
|
||||
"file_size": 2004,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/right_to_left_02.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/right_to_left_03.json
Normal file
15
tools/benchmark-harness/fixtures/md/right_to_left_03.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/right_to_left_03.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1359,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/right_to_left_03.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/table_01.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/table_01.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/table_01.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 101,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/table_01.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/table_02.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/table_02.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/table_02.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 239,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/table_02.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/table_03.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/table_03.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/table_03.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 206,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/table_03.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/table_04.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/table_04.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/table_04.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 275,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/table_04.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/table_05.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/table_05.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/table_05.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 293,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/table_05.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/table_06.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/table_06.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/table_06.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 2858,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/table_06.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/table_with_equations.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 240,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/table_with_equations.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/table_with_heading_01.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 83,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/table_with_heading_01.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/table_with_heading_02.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 139,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/table_with_heading_02.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/tablecell.docx.json
Normal file
15
tools/benchmark-harness/fixtures/md/tablecell.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/tablecell.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 176,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/tablecell.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/test_01.asciidoc.json
Normal file
15
tools/benchmark-harness/fixtures/md/test_01.asciidoc.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/test_01.asciidoc.md",
|
||||
"file_type": "md",
|
||||
"file_size": 376,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/test_01.asciidoc.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/test_02.asciidoc.json
Normal file
15
tools/benchmark-harness/fixtures/md/test_02.asciidoc.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/test_02.asciidoc.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1987,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/test_02.asciidoc.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/test_03.asciidoc.json
Normal file
15
tools/benchmark-harness/fixtures/md/test_03.asciidoc.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/test_03.asciidoc.md",
|
||||
"file_type": "md",
|
||||
"file_size": 646,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/test_03.asciidoc.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/test_emf_docx.docx.json
Normal file
15
tools/benchmark-harness/fixtures/md/test_emf_docx.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/test_emf_docx.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 139,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/test_emf_docx.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/textbox.docx.json
Normal file
15
tools/benchmark-harness/fixtures/md/textbox.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/textbox.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1959,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/textbox.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/unit_test_01.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/unit_test_01.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/unit_test_01.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 107,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/unit_test_01.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/unit_test_formatting.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 500,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/unit_test_formatting.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/unit_test_headers.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 373,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/unit_test_headers.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/unit_test_headers_numbered.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 401,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/unit_test_headers_numbered.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/unit_test_lists.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 560,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/unit_test_lists.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/webvtt_example_01.vtt.md",
|
||||
"file_type": "md",
|
||||
"file_size": 515,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/webvtt_example_01.vtt.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/webvtt_example_02.vtt.md",
|
||||
"file_type": "md",
|
||||
"file_size": 117,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/webvtt_example_02.vtt.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/webvtt_example_03.vtt.md",
|
||||
"file_type": "md",
|
||||
"file_size": 348,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/webvtt_example_03.vtt.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/webvtt_example_04.vtt.md",
|
||||
"file_type": "md",
|
||||
"file_size": 272,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/webvtt_example_04.vtt.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/wiki.md.json
Normal file
15
tools/benchmark-harness/fixtures/md/wiki.md.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/wiki.md.md",
|
||||
"file_type": "md",
|
||||
"file_size": 4790,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/wiki.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/wiki_duck.html.json
Normal file
15
tools/benchmark-harness/fixtures/md/wiki_duck.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/wiki_duck.html.md",
|
||||
"file_type": "md",
|
||||
"file_size": 59879,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/wiki_duck.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/word_comments.json
Normal file
15
tools/benchmark-harness/fixtures/md/word_comments.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/word_comments.md",
|
||||
"file_type": "md",
|
||||
"file_size": 240,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/word_comments.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/word_image_anchors.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 175,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/word_image_anchors.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/word_sample.docx.json
Normal file
15
tools/benchmark-harness/fixtures/md/word_sample.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/word_sample.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1003,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/word_sample.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/word_sample.json
Normal file
15
tools/benchmark-harness/fixtures/md/word_sample.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/word_sample.md",
|
||||
"file_type": "md",
|
||||
"file_size": 997,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/word_sample.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/word_tables.docx.json
Normal file
15
tools/benchmark-harness/fixtures/md/word_tables.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/word_tables.docx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 2117,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/word_tables.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/md/xlsx_01.xlsx.json
Normal file
15
tools/benchmark-harness/fixtures/md/xlsx_01.xlsx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/xlsx_01.xlsx.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1753,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/xlsx_01.xlsx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/md/xlsx_02_sample_sales_data.xlsm.md",
|
||||
"file_type": "md",
|
||||
"file_size": 1342,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/md/xlsx_02_sample_sales_data.xlsm.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user