This commit is contained in:
15
tools/benchmark-harness/fixtures/json/2203.01017v2.json
Normal file
15
tools/benchmark-harness/fixtures/json/2203.01017v2.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/2203.01017v2.json",
|
||||
"file_type": "json",
|
||||
"file_size": 663249,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/2203.01017v2.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/2203.01017v2.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 477,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/2203.01017v2.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/2206.01062.json
Normal file
15
tools/benchmark-harness/fixtures/json/2206.01062.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/2206.01062.json",
|
||||
"file_type": "json",
|
||||
"file_size": 939124,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/2206.01062.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/2206.01062.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 272,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/2206.01062.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/2305.03393v1-pg9.json
Normal file
15
tools/benchmark-harness/fixtures/json/2305.03393v1-pg9.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/2305.03393v1-pg9.json",
|
||||
"file_type": "json",
|
||||
"file_size": 64383,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/2305.03393v1-pg9.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/2305.03393v1-pg9.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 33,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/2305.03393v1-pg9.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/2305.03393v1.json
Normal file
15
tools/benchmark-harness/fixtures/json/2305.03393v1.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/2305.03393v1.json",
|
||||
"file_type": "json",
|
||||
"file_size": 403019,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/2305.03393v1.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/2305.03393v1.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 412,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/2305.03393v1.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/amt_handbook_sample.json",
|
||||
"file_type": "json",
|
||||
"file_size": 24952,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/amt_handbook_sample.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/amt_handbook_sample.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 33,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/amt_handbook_sample.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/code_and_formula.json
Normal file
15
tools/benchmark-harness/fixtures/json/code_and_formula.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/code_and_formula.json",
|
||||
"file_type": "json",
|
||||
"file_size": 21078,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/code_and_formula.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/code_and_formula.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 61,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/code_and_formula.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/csv-comma-in-cell.csv.json",
|
||||
"file_type": "json",
|
||||
"file_size": 17304,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/csv-comma-in-cell.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/csv-comma.csv.json
Normal file
15
tools/benchmark-harness/fixtures/json/csv-comma.csv.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/csv-comma.csv.json",
|
||||
"file_type": "json",
|
||||
"file_size": 60861,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/csv-comma.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/csv-inconsistent-header.csv.json",
|
||||
"file_type": "json",
|
||||
"file_size": 16927,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/csv-inconsistent-header.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/csv-pipe.csv.json
Normal file
15
tools/benchmark-harness/fixtures/json/csv-pipe.csv.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/csv-pipe.csv.json",
|
||||
"file_type": "json",
|
||||
"file_size": 60848,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/csv-pipe.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/csv-semicolon.csv.json
Normal file
15
tools/benchmark-harness/fixtures/json/csv-semicolon.csv.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/csv-semicolon.csv.json",
|
||||
"file_type": "json",
|
||||
"file_size": 60859,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/csv-semicolon.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/csv-tab.csv.json
Normal file
15
tools/benchmark-harness/fixtures/json/csv-tab.csv.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/csv-tab.csv.json",
|
||||
"file_type": "json",
|
||||
"file_size": 60850,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/csv-tab.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/csv-too-few-columns.csv.json",
|
||||
"file_type": "json",
|
||||
"file_size": 16921,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/csv-too-few-columns.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/csv-too-many-columns.csv.json",
|
||||
"file_type": "json",
|
||||
"file_size": 19770,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/csv-too-many-columns.csv.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/deepseek_example.md.json",
|
||||
"file_type": "json",
|
||||
"file_size": 71809,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/deepseek_example.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/deepseek_simple.md.json",
|
||||
"file_type": "json",
|
||||
"file_size": 10372,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/deepseek_simple.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/deepseek_title.md.json
Normal file
15
tools/benchmark-harness/fixtures/json/deepseek_title.md.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/deepseek_title.md.json",
|
||||
"file_type": "json",
|
||||
"file_size": 19823,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/deepseek_title.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/docx_grouped_images.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 152115,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/docx_grouped_images.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/docx_rich_cells.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 82392,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/docx_rich_cells.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/drawingml.docx.json
Normal file
15
tools/benchmark-harness/fixtures/json/drawingml.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/drawingml.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 88806,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/drawingml.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/elife-56337.nxml.json
Normal file
15
tools/benchmark-harness/fixtures/json/elife-56337.nxml.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/elife-56337.nxml.json",
|
||||
"file_type": "json",
|
||||
"file_size": 352303,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/elife-56337.nxml.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/equations.docx.json
Normal file
15
tools/benchmark-harness/fixtures/json/equations.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/equations.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 20636,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/equations.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/escaped_characters.md.json",
|
||||
"file_type": "json",
|
||||
"file_size": 19472,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/escaped_characters.md.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/example_01.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/example_01.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/example_01.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 5851,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/example_01.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/example_01_images.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 717612,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/example_01_images.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/example_02.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/example_02.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/example_02.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 4056,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/example_02.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/example_03.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/example_03.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/example_03.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 17188,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/example_03.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/example_04.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/example_04.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/example_04.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 10366,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/example_04.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/example_05.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/example_05.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/example_05.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 10348,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/example_05.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/example_06.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/example_06.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/example_06.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 4814,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/example_06.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/example_07.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/example_07.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/example_07.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 7263,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/example_07.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/example_08.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/example_08.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/example_08.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 64838,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/example_08.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/formatting.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/formatting.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/formatting.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 25210,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/formatting.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/html_code_snippets.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 14947,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/html_code_snippets.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/html_heading_in_p.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 20611,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/html_heading_in_p.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/html_rich_table_cells.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 63348,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/html_rich_table_cells.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/hyperlink_01.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/hyperlink_01.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/hyperlink_01.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 2131,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/hyperlink_01.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/hyperlink_02.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/hyperlink_02.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/hyperlink_02.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 1951,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/hyperlink_02.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/hyperlink_03.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/hyperlink_03.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/hyperlink_03.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 6996,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/hyperlink_03.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/hyperlink_04.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/hyperlink_04.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/hyperlink_04.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 980,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/hyperlink_04.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/hyperlink_05.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/hyperlink_05.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/hyperlink_05.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 3265,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/hyperlink_05.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/ipa20180000016.json
Normal file
15
tools/benchmark-harness/fixtures/json/ipa20180000016.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/ipa20180000016.json",
|
||||
"file_type": "json",
|
||||
"file_size": 291123,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/ipa20180000016.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/ipa20200022300.json
Normal file
15
tools/benchmark-harness/fixtures/json/ipa20200022300.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/ipa20200022300.json",
|
||||
"file_type": "json",
|
||||
"file_size": 120441,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/ipa20200022300.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/list_after_num_headers.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 3681,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/list_after_num_headers.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/lorem_ipsum.docx.json
Normal file
15
tools/benchmark-harness/fixtures/json/lorem_ipsum.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/lorem_ipsum.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 10926,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/lorem_ipsum.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/multi_page.json
Normal file
15
tools/benchmark-harness/fixtures/json/multi_page.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/multi_page.json",
|
||||
"file_type": "json",
|
||||
"file_size": 53493,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/multi_page.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/multi_page.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 147,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/multi_page.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/pa20010031492.json
Normal file
15
tools/benchmark-harness/fixtures/json/pa20010031492.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/pa20010031492.json",
|
||||
"file_type": "json",
|
||||
"file_size": 99911,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/pa20010031492.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/pftaps057006474.json
Normal file
15
tools/benchmark-harness/fixtures/json/pftaps057006474.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/pftaps057006474.json",
|
||||
"file_type": "json",
|
||||
"file_size": 68409,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/pftaps057006474.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/pg06442728.json
Normal file
15
tools/benchmark-harness/fixtures/json/pg06442728.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/pg06442728.json",
|
||||
"file_type": "json",
|
||||
"file_size": 93078,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/pg06442728.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/picture_classification.json",
|
||||
"file_type": "json",
|
||||
"file_size": 14219,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/picture_classification.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/picture_classification.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 61,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/picture_classification.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/pntd.0008301.nxml.json
Normal file
15
tools/benchmark-harness/fixtures/json/pntd.0008301.nxml.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/pntd.0008301.nxml.json",
|
||||
"file_type": "json",
|
||||
"file_size": 311704,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/pntd.0008301.nxml.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/pone.0234687.nxml.json
Normal file
15
tools/benchmark-harness/fixtures/json/pone.0234687.nxml.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/pone.0234687.nxml.json",
|
||||
"file_type": "json",
|
||||
"file_size": 559991,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/pone.0234687.nxml.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/powerpoint_bad_text.pptx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 1869,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/powerpoint_bad_text.pptx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/powerpoint_issue_2663.pptx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 8989,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/powerpoint_issue_2663.pptx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/powerpoint_sample.pptx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 61244,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/powerpoint_sample.pptx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/powerpoint_with_image.pptx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 56295,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/powerpoint_with_image.pptx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/redp5110_sampled.json
Normal file
15
tools/benchmark-harness/fixtures/json/redp5110_sampled.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/redp5110_sampled.json",
|
||||
"file_type": "json",
|
||||
"file_size": 465656,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/redp5110_sampled.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/redp5110_sampled.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 524,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/redp5110_sampled.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/right_to_left_01.json
Normal file
15
tools/benchmark-harness/fixtures/json/right_to_left_01.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/right_to_left_01.json",
|
||||
"file_type": "json",
|
||||
"file_size": 15772,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/right_to_left_01.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/right_to_left_01.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 32,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/right_to_left_01.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/right_to_left_02.json
Normal file
15
tools/benchmark-harness/fixtures/json/right_to_left_02.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/right_to_left_02.json",
|
||||
"file_type": "json",
|
||||
"file_size": 15601,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/right_to_left_02.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/right_to_left_02.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 32,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/right_to_left_02.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/right_to_left_03.json
Normal file
15
tools/benchmark-harness/fixtures/json/right_to_left_03.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/right_to_left_03.json",
|
||||
"file_type": "json",
|
||||
"file_size": 27176,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/right_to_left_03.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/right_to_left_03.pages.meta.json",
|
||||
"file_type": "json",
|
||||
"file_size": 32,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/right_to_left_03.pages.meta.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/table_01.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/table_01.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/table_01.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 5307,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/table_01.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/table_02.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/table_02.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/table_02.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 6691,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/table_02.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/table_03.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/table_03.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/table_03.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 7038,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/table_03.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/table_04.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/table_04.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/table_04.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 7442,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/table_04.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/table_05.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/table_05.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/table_05.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 11053,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/table_05.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/table_06.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/table_06.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/table_06.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 22830,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/table_06.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/table_with_equations.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 4943,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/table_with_equations.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/table_with_heading_01.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 5001,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/table_with_heading_01.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/table_with_heading_02.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 6633,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/table_with_heading_02.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/tablecell.docx.json
Normal file
15
tools/benchmark-harness/fixtures/json/tablecell.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/tablecell.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 11513,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/tablecell.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/test.json
Normal file
15
tools/benchmark-harness/fixtures/json/test.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/json/test.json",
|
||||
"file_type": "json",
|
||||
"file_size": 229,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/test.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/test_emf_docx.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 137152,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/test_emf_docx.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/textbox.docx.json
Normal file
15
tools/benchmark-harness/fixtures/json/textbox.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/textbox.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 31501,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/textbox.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/unit_test_01.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/unit_test_01.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/unit_test_01.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 3174,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/unit_test_01.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/unit_test_formatting.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 21104,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/unit_test_formatting.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/unit_test_headers.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 17101,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/unit_test_headers.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/unit_test_headers_numbered.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 18102,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/unit_test_headers_numbered.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/unit_test_lists.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 24099,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/unit_test_lists.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/webvtt_example_01.vtt.json",
|
||||
"file_type": "json",
|
||||
"file_size": 7472,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/webvtt_example_01.vtt.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/webvtt_example_02.vtt.json",
|
||||
"file_type": "json",
|
||||
"file_size": 5704,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/webvtt_example_02.vtt.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/webvtt_example_03.vtt.json",
|
||||
"file_type": "json",
|
||||
"file_size": 9873,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/webvtt_example_03.vtt.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/webvtt_example_04.vtt.json",
|
||||
"file_type": "json",
|
||||
"file_size": 7851,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/webvtt_example_04.vtt.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/wiki_duck.html.json
Normal file
15
tools/benchmark-harness/fixtures/json/wiki_duck.html.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/wiki_duck.html.json",
|
||||
"file_type": "json",
|
||||
"file_size": 636994,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/wiki_duck.html.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/word_comments.json
Normal file
15
tools/benchmark-harness/fixtures/json/word_comments.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/word_comments.json",
|
||||
"file_type": "json",
|
||||
"file_size": 5686,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/word_comments.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/word_image_anchors.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 13525,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/word_image_anchors.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/word_sample.docx.json
Normal file
15
tools/benchmark-harness/fixtures/json/word_sample.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/word_sample.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 109876,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/word_sample.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/word_sample.json
Normal file
15
tools/benchmark-harness/fixtures/json/word_sample.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/word_sample.json",
|
||||
"file_type": "json",
|
||||
"file_size": 103166,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/word_sample.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/json/word_tables.docx.json
Normal file
15
tools/benchmark-harness/fixtures/json/word_tables.docx.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/json/word_tables.docx.json",
|
||||
"file_type": "json",
|
||||
"file_size": 76177,
|
||||
"expected_frameworks": ["kreuzberg", "markitdown", "tika"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/json/word_tables.docx.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user