This commit is contained in:
16
tools/benchmark-harness/fixtures/html/complex_table.json
Normal file
16
tools/benchmark-harness/fixtures/html/complex_table.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/html/complex_table.html",
|
||||
"file_type": "html",
|
||||
"file_size": 2697,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "HTML test document: complex_table",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/complex_table.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/html/complex_table.txt"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/example_01.json
Normal file
16
tools/benchmark-harness/fixtures/html/example_01.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/example_01.html",
|
||||
"file_type": "html",
|
||||
"file_size": 680,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/example_01.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/example_01.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/example_02.json
Normal file
16
tools/benchmark-harness/fixtures/html/example_02.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/example_02.html",
|
||||
"file_type": "html",
|
||||
"file_size": 461,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/example_02.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/example_02.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/example_03.json
Normal file
16
tools/benchmark-harness/fixtures/html/example_03.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/example_03.html",
|
||||
"file_type": "html",
|
||||
"file_size": 1932,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/example_03.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/example_03.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/example_04.json
Normal file
16
tools/benchmark-harness/fixtures/html/example_04.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/example_04.html",
|
||||
"file_type": "html",
|
||||
"file_size": 676,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/example_04.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/example_04.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/example_05.json
Normal file
16
tools/benchmark-harness/fixtures/html/example_05.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/example_05.html",
|
||||
"file_type": "html",
|
||||
"file_size": 469,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/example_05.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/example_05.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/example_06.json
Normal file
16
tools/benchmark-harness/fixtures/html/example_06.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/example_06.html",
|
||||
"file_type": "html",
|
||||
"file_size": 475,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/example_06.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/example_06.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/example_07.json
Normal file
16
tools/benchmark-harness/fixtures/html/example_07.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/example_07.html",
|
||||
"file_type": "html",
|
||||
"file_size": 1227,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/example_07.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/example_07.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/example_08.json
Normal file
16
tools/benchmark-harness/fixtures/html/example_08.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/example_08.html",
|
||||
"file_type": "html",
|
||||
"file_size": 2841,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/example_08.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/example_08.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/formatting.json
Normal file
16
tools/benchmark-harness/fixtures/html/formatting.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/formatting.html",
|
||||
"file_type": "html",
|
||||
"file_size": 1430,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/formatting.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/formatting.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/hip_13044_b.json
Normal file
16
tools/benchmark-harness/fixtures/html/hip_13044_b.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/html/hip_13044_b.html",
|
||||
"file_type": "html",
|
||||
"file_size": 278128,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "html test: hip_13044_b",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/hip_13044_b.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/html/hip_13044_b.txt"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/html.json
Normal file
16
tools/benchmark-harness/fixtures/html/html.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/html/html.html",
|
||||
"file_type": "html",
|
||||
"file_size": 1397,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "HTML test document: html",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/html.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/html/html.txt"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/html_code_snippets.html",
|
||||
"file_type": "html",
|
||||
"file_size": 1496,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/html_code_snippets.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/html_code_snippets.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/html_heading_in_p.json
Normal file
16
tools/benchmark-harness/fixtures/html/html_heading_in_p.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/html_heading_in_p.html",
|
||||
"file_type": "html",
|
||||
"file_size": 3567,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/html_heading_in_p.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/html_heading_in_p.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/html_rich_table_cells.html",
|
||||
"file_type": "html",
|
||||
"file_size": 4552,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/html_rich_table_cells.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/html_rich_table_cells.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/hyperlink_01.json
Normal file
16
tools/benchmark-harness/fixtures/html/hyperlink_01.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/hyperlink_01.html",
|
||||
"file_type": "html",
|
||||
"file_size": 262,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/hyperlink_01.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/hyperlink_01.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/hyperlink_03.json
Normal file
16
tools/benchmark-harness/fixtures/html/hyperlink_03.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/hyperlink_03.html",
|
||||
"file_type": "html",
|
||||
"file_size": 1590,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/hyperlink_03.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/hyperlink_03.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/hyperlink_04.json
Normal file
16
tools/benchmark-harness/fixtures/html/hyperlink_04.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/hyperlink_04.html",
|
||||
"file_type": "html",
|
||||
"file_size": 128,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/hyperlink_04.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/hyperlink_04.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/hyperlink_05.json
Normal file
16
tools/benchmark-harness/fixtures/html/hyperlink_05.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/hyperlink_05.html",
|
||||
"file_type": "html",
|
||||
"file_size": 1135,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/hyperlink_05.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/hyperlink_05.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/html/international_emergency_medicine.html",
|
||||
"file_type": "html",
|
||||
"file_size": 352033,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "html test: international_emergency_medicine",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/international_emergency_medicine.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/international_emergency_medicine.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/simple_table.json
Normal file
16
tools/benchmark-harness/fixtures/html/simple_table.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/html/simple_table.html",
|
||||
"file_type": "html",
|
||||
"file_size": 1490,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "HTML test document: simple_table",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/simple_table.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/html/simple_table.txt"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/sinthgunt.json
Normal file
16
tools/benchmark-harness/fixtures/html/sinthgunt.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/html/sinthgunt.html",
|
||||
"file_type": "html",
|
||||
"file_size": 125362,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "HTML test document: sinthgunt",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/sinthgunt.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/html/sinthgunt.txt"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/table_01.json
Normal file
16
tools/benchmark-harness/fixtures/html/table_01.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/table_01.html",
|
||||
"file_type": "html",
|
||||
"file_size": 544,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/table_01.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/table_01.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/table_02.json
Normal file
16
tools/benchmark-harness/fixtures/html/table_02.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/table_02.html",
|
||||
"file_type": "html",
|
||||
"file_size": 594,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/table_02.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/table_02.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/table_03.json
Normal file
16
tools/benchmark-harness/fixtures/html/table_03.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/table_03.html",
|
||||
"file_type": "html",
|
||||
"file_size": 675,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/table_03.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/table_03.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/table_04.json
Normal file
16
tools/benchmark-harness/fixtures/html/table_04.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/table_04.html",
|
||||
"file_type": "html",
|
||||
"file_size": 713,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/table_04.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/table_04.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/table_05.json
Normal file
16
tools/benchmark-harness/fixtures/html/table_05.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/table_05.html",
|
||||
"file_type": "html",
|
||||
"file_size": 856,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/table_05.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/table_05.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/table_06.json
Normal file
16
tools/benchmark-harness/fixtures/html/table_06.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/table_06.html",
|
||||
"file_type": "html",
|
||||
"file_size": 2261,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/table_06.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/table_06.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/table_with_heading_01.html",
|
||||
"file_type": "html",
|
||||
"file_size": 521,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/table_with_heading_01.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/table_with_heading_01.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/table_with_heading_02.html",
|
||||
"file_type": "html",
|
||||
"file_size": 650,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/table_with_heading_02.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/table_with_heading_02.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/taylor_swift.json
Normal file
16
tools/benchmark-harness/fixtures/html/taylor_swift.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/html/taylor_swift.html",
|
||||
"file_type": "html",
|
||||
"file_size": 4118693,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "html test: taylor_swift",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/taylor_swift.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/taylor_swift.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/test_blog.json
Normal file
16
tools/benchmark-harness/fixtures/html/test_blog.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/html/test_blog.html",
|
||||
"file_type": "html",
|
||||
"file_size": 25965,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/test_blog.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/test_blog.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/test_wikipedia.json
Normal file
16
tools/benchmark-harness/fixtures/html/test_wikipedia.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/html/test_wikipedia.html",
|
||||
"file_type": "html",
|
||||
"file_size": 394444,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/test_wikipedia.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/test_wikipedia.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/unit_test_01.json
Normal file
16
tools/benchmark-harness/fixtures/html/unit_test_01.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/unit_test_01.html",
|
||||
"file_type": "html",
|
||||
"file_size": 198,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/unit_test_01.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/unit_test_01.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/wiki_duck.json
Normal file
16
tools/benchmark-harness/fixtures/html/wiki_duck.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/wiki_duck.html",
|
||||
"file_type": "html",
|
||||
"file_size": 244709,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/wiki_duck.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/wiki_duck.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/html/word_tables.docx.json
Normal file
16
tools/benchmark-harness/fixtures/html/word_tables.docx.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/html/word_tables.docx.html",
|
||||
"file_type": "html",
|
||||
"file_size": 4748,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/html/word_tables.docx.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/html/word_tables.docx.md"
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user