This commit is contained in:
14
tools/benchmark-harness/fixtures/7z_archive.json
Normal file
14
tools/benchmark-harness/fixtures/7z_archive.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/archives/documents.7z",
|
||||
"file_type": "7z",
|
||||
"file_size": 216,
|
||||
"expected_frameworks": ["kreuzberg", "tika"],
|
||||
"metadata": {
|
||||
"description": "7-Zip archive with text documents",
|
||||
"category": "archive"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/7z/documents.txt",
|
||||
"source": "manual"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/asciidoc_tables.json
Normal file
15
tools/benchmark-harness/fixtures/asciidoc_tables.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/markdown/tables.asciidoc",
|
||||
"file_type": "asciidoc",
|
||||
"file_size": 1537,
|
||||
"expected_frameworks": ["kreuzberg", "docling"],
|
||||
"metadata": {
|
||||
"description": "AsciiDoc document with multiple table examples",
|
||||
"category": "markup",
|
||||
"size_class": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/asciidoc/asciidoc_tables.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/bib_comprehensive.json
Normal file
14
tools/benchmark-harness/fixtures/bib_comprehensive.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/bibtex/comprehensive.bib",
|
||||
"file_type": "bib",
|
||||
"file_size": 3568,
|
||||
"expected_frameworks": ["kreuzberg", "pandoc", "tika"],
|
||||
"metadata": {
|
||||
"description": "BibTeX bibliography file with multiple entries",
|
||||
"category": "academic"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/bib/bib_comprehensive.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/commonmark_sample.json
Normal file
14
tools/benchmark-harness/fixtures/commonmark_sample.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/markdown/sample.commonmark",
|
||||
"file_type": "commonmark",
|
||||
"file_size": 3036,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "CommonMark document with standard markdown elements including headers, lists, code blocks, links, emphasis, blockquotes, tables, and mixed formatting",
|
||||
"category": "text"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/commonmark/commonmark_sample.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/csv-comma-in-cell.json
Normal file
16
tools/benchmark-harness/fixtures/csv/csv-comma-in-cell.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-comma-in-cell.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 46,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-comma-in-cell.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-comma-in-cell.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/csv-comma.json
Normal file
16
tools/benchmark-harness/fixtures/csv/csv-comma.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-comma.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 1005,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-comma.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-comma.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-inconsistent-header.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 42,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-inconsistent-header.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-inconsistent-header.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/csv-pipe.json
Normal file
16
tools/benchmark-harness/fixtures/csv/csv-pipe.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-pipe.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 997,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-pipe.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-pipe.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/csv-semicolon.json
Normal file
16
tools/benchmark-harness/fixtures/csv/csv-semicolon.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-semicolon.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 997,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-semicolon.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-semicolon.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/csv-tab.json
Normal file
16
tools/benchmark-harness/fixtures/csv/csv-tab.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-tab.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 997,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-tab.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-tab.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-too-few-columns.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 44,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-too-few-columns.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-too-few-columns.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-too-many-columns.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 46,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-too-many-columns.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-too-many-columns.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/data_table.json
Normal file
16
tools/benchmark-harness/fixtures/csv/data_table.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/csv/data_table.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 476,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "csv test: data_table",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/data_table.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/data_table.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/stanley_cups.json
Normal file
16
tools/benchmark-harness/fixtures/csv/stanley_cups.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/csv/stanley_cups.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 91,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "csv test: stanley_cups",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/stanley_cups.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/stanley_cups.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/test_mskanji.json
Normal file
16
tools/benchmark-harness/fixtures/csv/test_mskanji.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/csv/test_mskanji.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 70,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/test_mskanji.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/test_mskanji.md"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/dbf_stations.json
Normal file
14
tools/benchmark-harness/fixtures/dbf_stations.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/dbf/stations.dbf",
|
||||
"file_type": "dbf",
|
||||
"file_size": 87623,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "dBASE file with station records",
|
||||
"category": "tables"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/dbf/dbf_stations.txt",
|
||||
"source": "manual"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/dbk_chapter.json
Normal file
16
tools/benchmark-harness/fixtures/dbk_chapter.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/docbook-chapter.dbk",
|
||||
"file_type": "dbk",
|
||||
"file_size": 1088,
|
||||
"expected_frameworks": ["kreuzberg", "pandoc"],
|
||||
"metadata": {
|
||||
"description": "DocBook XML chapter with recursive sections (DBK extension)",
|
||||
"category": "markup",
|
||||
"size_class": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.txt",
|
||||
"markdown_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/djot_tables.json
Normal file
14
tools/benchmark-harness/fixtures/djot_tables.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/markdown/tables.djot",
|
||||
"file_type": "djot",
|
||||
"file_size": 2102,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Djot markup with tables",
|
||||
"category": "markup"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/djot/djot_tables.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/doc/duplicate-paragraphs.doc",
|
||||
"file_type": "doc",
|
||||
"file_size": 18432,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/doc/duplicate-paragraphs.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/doc/duplicate-paragraphs.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/doc/fake-doc-emphasized-text.doc",
|
||||
"file_type": "doc",
|
||||
"file_size": 27648,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/doc/fake-doc-emphasized-text.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/doc/fake-doc-emphasized-text.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/doc/fake.json
Normal file
16
tools/benchmark-harness/fixtures/doc/fake.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/doc/fake.doc",
|
||||
"file_type": "doc",
|
||||
"file_size": 18432,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/doc/fake.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/doc/fake.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/doc/simple.json
Normal file
16
tools/benchmark-harness/fixtures/doc/simple.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/doc/simple.doc",
|
||||
"file_type": "doc",
|
||||
"file_size": 15872,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/doc/simple.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/doc/simple.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/doc/unit_test_lists.json
Normal file
16
tools/benchmark-harness/fixtures/doc/unit_test_lists.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/doc/unit_test_lists.doc",
|
||||
"file_type": "doc",
|
||||
"file_size": 16384,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "doc test: unit_test_lists",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/doc/unit_test_lists.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/doc/unit_test_lists.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docbook_chapter.json
Normal file
15
tools/benchmark-harness/fixtures/docbook_chapter.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/docbook-chapter.docbook",
|
||||
"file_type": "docbook",
|
||||
"file_size": 1088,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DocBook chapter with structured content",
|
||||
"category": "documentation"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.txt",
|
||||
"markdown_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/docbook_reader.json
Normal file
14
tools/benchmark-harness/fixtures/docbook_reader.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/docbook-reader.docbook",
|
||||
"file_type": "docbook",
|
||||
"file_size": 37139,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DocBook reader with larger content",
|
||||
"category": "documentation"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/docbook_reader.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docbook_tables4.json
Normal file
15
tools/benchmark-harness/fixtures/docbook_tables4.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/tables.docbook4",
|
||||
"file_type": "docbook",
|
||||
"file_size": 7502,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DocBook 4 table examples with simple, multiline, and headerless tables",
|
||||
"category": "documentation"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/tables.txt",
|
||||
"markdown_file": "../../../test_documents/ground_truth/docbook/tables.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docbook_tables5.json
Normal file
15
tools/benchmark-harness/fixtures/docbook_tables5.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/tables.docbook5",
|
||||
"file_type": "docbook",
|
||||
"file_size": 7502,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DocBook 5 table examples with simple, multiline, and headerless tables",
|
||||
"category": "documentation"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/tables.txt",
|
||||
"markdown_file": "../../../test_documents/ground_truth/docbook/tables.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docbook_xref.json
Normal file
15
tools/benchmark-harness/fixtures/docbook_xref.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/docbook-xref.docbook",
|
||||
"file_type": "docbook",
|
||||
"file_size": 3129,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DocBook cross-reference examples with XRef, links, figures, and tables",
|
||||
"category": "documentation"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/docbook-xref.txt",
|
||||
"markdown_file": "../../../test_documents/ground_truth/docbook/docbook-xref.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/docx_grouped_images.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 207463,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/docx_grouped_images.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_grouped_images.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/docx_rich_cells.json
Normal file
16
tools/benchmark-harness/fixtures/docx/docx_rich_cells.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/docx_rich_cells.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 24320,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/docx_rich_cells.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_rich_cells.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/docx_tables.json
Normal file
16
tools/benchmark-harness/fixtures/docx/docx_tables.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/docx_tables.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 12725,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: docx_tables",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/docx_tables.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_tables.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/equations.json
Normal file
16
tools/benchmark-harness/fixtures/docx/equations.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/equations.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15814,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DOCX test document: equations",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/equations.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/equations.txt"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/extraction_test.json
Normal file
16
tools/benchmark-harness/fixtures/docx/extraction_test.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/extraction_test.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 11296,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: extraction_test",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/extraction_test.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/extraction_test.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/fake.json
Normal file
16
tools/benchmark-harness/fixtures/docx/fake.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/fake.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 36602,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: fake",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/fake.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/fake.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/issue_359_list_whitespace.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 9170,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: issue_359_list_whitespace",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/issue_359_list_whitespace.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/issue_359_list_whitespace.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/list_after_num_headers.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15698,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/list_after_num_headers.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/list_after_num_headers.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/lorem_ipsum.json
Normal file
16
tools/benchmark-harness/fixtures/docx/lorem_ipsum.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/lorem_ipsum.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 14817,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/lorem_ipsum.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/lorem_ipsum.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/rlink.json
Normal file
16
tools/benchmark-harness/fixtures/docx/rlink.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/docx/rlink.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 13708,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/rlink.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/rlink.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/sample_document.json
Normal file
16
tools/benchmark-harness/fixtures/docx/sample_document.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/sample_document.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 103966,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: sample_document",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/sample_document.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/sample_document.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/table_with_equations.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 14228,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/table_with_equations.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/table_with_equations.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/tablecell.json
Normal file
16
tools/benchmark-harness/fixtures/docx/tablecell.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/tablecell.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15180,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/tablecell.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/tablecell.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/test.json
Normal file
16
tools/benchmark-harness/fixtures/docx/test.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/docx/test.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 135824,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/test.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/test.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/test_emf_docx.json
Normal file
16
tools/benchmark-harness/fixtures/docx/test_emf_docx.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/test_emf_docx.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 426097,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/test_emf_docx.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/test_emf_docx.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/test_with_comment.json
Normal file
16
tools/benchmark-harness/fixtures/docx/test_with_comment.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/docx/test_with_comment.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 12971,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/test_with_comment.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/test_with_comment.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/textbox.json
Normal file
16
tools/benchmark-harness/fixtures/docx/textbox.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/textbox.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 49206,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/textbox.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/textbox.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_formatting.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 29099,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_formatting.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_formatting.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/unit_test_headers.json
Normal file
16
tools/benchmark-harness/fixtures/docx/unit_test_headers.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_headers.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 13903,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_headers.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_headers.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_headers_numbered.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 16880,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_headers_numbered.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_headers_numbered.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/unit_test_lists.json
Normal file
16
tools/benchmark-harness/fixtures/docx/unit_test_lists.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_lists.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15769,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_lists.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_lists.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/word_comments.json
Normal file
16
tools/benchmark-harness/fixtures/docx/word_comments.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_comments.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 37399,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_comments.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_comments.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_image_anchors.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 18560,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_image_anchors.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_image_anchors.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/word_sample.json
Normal file
16
tools/benchmark-harness/fixtures/docx/word_sample.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_sample.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 103966,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_sample.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_sample.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/word_tables.json
Normal file
16
tools/benchmark-harness/fixtures/docx/word_tables.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_tables.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 16404,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_tables.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_tables.md"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docx_equations.json
Normal file
15
tools/benchmark-harness/fixtures/docx_equations.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docx/equations.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15017,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "DOCX with mathematical equations - 15KB document with complex formatting",
|
||||
"category": "docx-equations",
|
||||
"size_class": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docx/docx_equations.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docx_images.json
Normal file
15
tools/benchmark-harness/fixtures/docx_images.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docx/word_image_anchors.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 18560,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "DOCX with embedded images and anchors - 18KB document",
|
||||
"category": "docx-images",
|
||||
"size_class": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docx/docx_images.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docx_large_formatted.json
Normal file
15
tools/benchmark-harness/fixtures/docx_large_formatted.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docx/test_emf_docx.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 426097,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Large formatted DOCX - 416KB document with EMF graphics",
|
||||
"category": "docx-complex",
|
||||
"size_class": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docx/docx_large_formatted.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/docx_simple.json
Normal file
14
tools/benchmark-harness/fixtures/docx_simple.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/docx/lorem_ipsum.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 14817,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Simple DOCX - Lorem ipsum text",
|
||||
"category": "text"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docx/docx_simple.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-equals-attachment-filename.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 3297,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-equals-attachment-filename.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-inline-content-disposition.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 657,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-inline-content-disposition.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-no-html-content-1.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 7721,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-no-html-content-1.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-no-utf8-2008-07-16.062410.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 31978,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-no-utf8-2008-07-16.062410.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-no-utf8-2014-03-17.111517.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 14954,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-no-utf8-2014-03-17.111517.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-1.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 16085,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-1.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-2.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 26271,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-2.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-3.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 56028,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-3.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-4.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 34433,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-4.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-5.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 14567,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-5.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/email-with-image.json
Normal file
15
tools/benchmark-harness/fixtures/eml/email-with-image.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-with-image.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 296696,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-with-image.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-attachment.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1704,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-attachment.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/fake-email-b64.json
Normal file
15
tools/benchmark-harness/fixtures/eml/fake-email-b64.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-b64.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 979,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-b64.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/fake-email-header.json
Normal file
15
tools/benchmark-harness/fixtures/eml/fake-email-header.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-header.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1207,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-header.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-image-embedded.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 297126,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-image-embedded.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-malformed-encoding.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 898,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-malformed-encoding.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-utf-16-be.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1614,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-utf-16-be.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-utf-16-le.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1614,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-utf-16-le.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/fake-email-utf-16.json
Normal file
15
tools/benchmark-harness/fixtures/eml/fake-email-utf-16.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-utf-16.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1616,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-utf-16.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/fake-email.json
Normal file
15
tools/benchmark-harness/fixtures/eml/fake-email.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 807,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/fake-encrypted.json
Normal file
15
tools/benchmark-harness/fixtures/eml/fake-encrypted.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-encrypted.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 669,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-encrypted.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/family-day.json
Normal file
15
tools/benchmark-harness/fixtures/eml/family-day.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/family-day.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1291,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/family-day.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-attach-mp3.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-attach-mp3.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-attach-mp3.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 70911,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-attach-mp3.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-different-plain-html.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1397,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-different-plain-html.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-html-only.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-html-only.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-html-only.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 640,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-html-only.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-multi-to-cc-bcc.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 350,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-multi-to-cc-bcc.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-multipart-digest.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 721,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-multipart-digest.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-no-body.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-no-body.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-no-body.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 985,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-no-body.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-no-subject.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-no-subject.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-no-subject.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 162,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-no-subject.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-no-to.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-no-to.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-no-to.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 264,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-no-to.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-simple.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-simple.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-simple.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 452,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-simple.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-word-encoded-subject.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 261,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-word-encoded-subject.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/rfc822-no-date.json
Normal file
15
tools/benchmark-harness/fixtures/eml/rfc822-no-date.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/rfc822-no-date.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 232,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/rfc822-no-date.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/simple-rfc-822.json
Normal file
15
tools/benchmark-harness/fixtures/eml/simple-rfc-822.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/simple-rfc-822.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 679,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/simple-rfc-822.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/test-invalid-date.json
Normal file
15
tools/benchmark-harness/fixtures/eml/test-invalid-date.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/test-invalid-date.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 161,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/test-invalid-date.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/test-iso-8601-date.json
Normal file
15
tools/benchmark-harness/fixtures/eml/test-iso-8601-date.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/test-iso-8601-date.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 135,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/test-iso-8601-date.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/test-rfc2822-date.json
Normal file
15
tools/benchmark-harness/fixtures/eml/test-rfc2822-date.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/test-rfc2822-date.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 151,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/test-rfc2822-date.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/eml_attachments.json
Normal file
14
tools/benchmark-harness/fixtures/eml_attachments.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/email/mailgun_pdf_attachment.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1514,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Email with PDF attachment from Mailgun",
|
||||
"category": "attachments"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/eml/eml_attachments.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/eml_html_only.json
Normal file
14
tools/benchmark-harness/fixtures/eml_html_only.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/email/html_only.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1150,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "HTML-only email with formatted content",
|
||||
"category": "simple"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/eml/eml_html_only.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/eml_multipart.json
Normal file
14
tools/benchmark-harness/fixtures/eml_multipart.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/email/multipart_email.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 763,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Multipart email with both plain text and HTML alternatives",
|
||||
"category": "multipart"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/eml/eml_multipart.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/eml_simple.json
Normal file
14
tools/benchmark-harness/fixtures/eml_simple.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/email/plain_text_only.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 317,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Simple plain text email without attachments",
|
||||
"category": "simple"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/eml/eml_simple.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/enw_citation.json
Normal file
15
tools/benchmark-harness/fixtures/enw_citation.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/data_formats/sample.enw",
|
||||
"file_type": "enw",
|
||||
"file_size": 242,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "EndNote citation format with journal article metadata",
|
||||
"category": "citation",
|
||||
"size_class": "tiny"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/enw/enw_citation.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/epub/epub2_cover.json
Normal file
16
tools/benchmark-harness/fixtures/epub/epub2_cover.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/epub/epub2_cover.epub",
|
||||
"file_type": "epub",
|
||||
"file_size": 11794,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "epub test: epub2_cover",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/epub/epub2_cover.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/epub/epub2_cover.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user