This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/docx_grouped_images.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 207463,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/docx_grouped_images.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_grouped_images.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/docx_rich_cells.json
Normal file
16
tools/benchmark-harness/fixtures/docx/docx_rich_cells.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/docx_rich_cells.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 24320,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/docx_rich_cells.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_rich_cells.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/docx_tables.json
Normal file
16
tools/benchmark-harness/fixtures/docx/docx_tables.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/docx_tables.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 12725,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: docx_tables",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/docx_tables.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_tables.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/equations.json
Normal file
16
tools/benchmark-harness/fixtures/docx/equations.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/equations.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15814,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DOCX test document: equations",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/equations.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/equations.txt"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/extraction_test.json
Normal file
16
tools/benchmark-harness/fixtures/docx/extraction_test.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/extraction_test.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 11296,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: extraction_test",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/extraction_test.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/extraction_test.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/fake.json
Normal file
16
tools/benchmark-harness/fixtures/docx/fake.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/fake.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 36602,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: fake",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/fake.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/fake.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/issue_359_list_whitespace.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 9170,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: issue_359_list_whitespace",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/issue_359_list_whitespace.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/issue_359_list_whitespace.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/list_after_num_headers.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15698,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/list_after_num_headers.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/list_after_num_headers.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/lorem_ipsum.json
Normal file
16
tools/benchmark-harness/fixtures/docx/lorem_ipsum.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/lorem_ipsum.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 14817,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/lorem_ipsum.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/lorem_ipsum.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/rlink.json
Normal file
16
tools/benchmark-harness/fixtures/docx/rlink.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/docx/rlink.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 13708,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/rlink.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/rlink.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/sample_document.json
Normal file
16
tools/benchmark-harness/fixtures/docx/sample_document.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/sample_document.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 103966,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: sample_document",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/sample_document.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/sample_document.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/table_with_equations.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 14228,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/table_with_equations.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/table_with_equations.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/tablecell.json
Normal file
16
tools/benchmark-harness/fixtures/docx/tablecell.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/tablecell.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15180,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/tablecell.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/tablecell.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/test.json
Normal file
16
tools/benchmark-harness/fixtures/docx/test.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/docx/test.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 135824,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/test.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/test.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/test_emf_docx.json
Normal file
16
tools/benchmark-harness/fixtures/docx/test_emf_docx.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/test_emf_docx.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 426097,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/test_emf_docx.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/test_emf_docx.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/test_with_comment.json
Normal file
16
tools/benchmark-harness/fixtures/docx/test_with_comment.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/docx/test_with_comment.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 12971,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/test_with_comment.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/test_with_comment.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/textbox.json
Normal file
16
tools/benchmark-harness/fixtures/docx/textbox.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/textbox.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 49206,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/textbox.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/textbox.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_formatting.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 29099,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_formatting.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_formatting.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/unit_test_headers.json
Normal file
16
tools/benchmark-harness/fixtures/docx/unit_test_headers.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_headers.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 13903,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_headers.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_headers.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_headers_numbered.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 16880,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_headers_numbered.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_headers_numbered.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/unit_test_lists.json
Normal file
16
tools/benchmark-harness/fixtures/docx/unit_test_lists.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_lists.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15769,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_lists.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_lists.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/word_comments.json
Normal file
16
tools/benchmark-harness/fixtures/docx/word_comments.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_comments.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 37399,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_comments.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_comments.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_image_anchors.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 18560,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_image_anchors.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_image_anchors.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/word_sample.json
Normal file
16
tools/benchmark-harness/fixtures/docx/word_sample.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_sample.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 103966,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_sample.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_sample.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/word_tables.json
Normal file
16
tools/benchmark-harness/fixtures/docx/word_tables.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_tables.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 16404,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_tables.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_tables.md"
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user