Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/archives/documents.7z",
"file_type": "7z",
"file_size": 216,
"expected_frameworks": ["kreuzberg", "tika"],
"metadata": {
"description": "7-Zip archive with text documents",
"category": "archive"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/7z/documents.txt",
"source": "manual"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../test_documents/markdown/tables.asciidoc",
"file_type": "asciidoc",
"file_size": 1537,
"expected_frameworks": ["kreuzberg", "docling"],
"metadata": {
"description": "AsciiDoc document with multiple table examples",
"category": "markup",
"size_class": "small"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/asciidoc/asciidoc_tables.txt",
"source": "vision"
}
}

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/bibtex/comprehensive.bib",
"file_type": "bib",
"file_size": 3568,
"expected_frameworks": ["kreuzberg", "pandoc", "tika"],
"metadata": {
"description": "BibTeX bibliography file with multiple entries",
"category": "academic"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/bib/bib_comprehensive.txt",
"source": "vision"
}
}

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/markdown/sample.commonmark",
"file_type": "commonmark",
"file_size": 3036,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "CommonMark document with standard markdown elements including headers, lists, code blocks, links, emphasis, blockquotes, tables, and mixed formatting",
"category": "text"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/commonmark/commonmark_sample.txt",
"source": "vision"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/csv/csv-comma-in-cell.csv",
"file_type": "csv",
"file_size": 46,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/csv-comma-in-cell.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-comma-in-cell.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/csv/csv-comma.csv",
"file_type": "csv",
"file_size": 1005,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/csv-comma.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-comma.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/csv/csv-inconsistent-header.csv",
"file_type": "csv",
"file_size": 42,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/csv-inconsistent-header.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-inconsistent-header.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/csv/csv-pipe.csv",
"file_type": "csv",
"file_size": 997,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/csv-pipe.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-pipe.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/csv/csv-semicolon.csv",
"file_type": "csv",
"file_size": 997,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/csv-semicolon.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-semicolon.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/csv/csv-tab.csv",
"file_type": "csv",
"file_size": 997,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/csv-tab.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-tab.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/csv/csv-too-few-columns.csv",
"file_type": "csv",
"file_size": 44,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/csv-too-few-columns.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-too-few-columns.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/csv/csv-too-many-columns.csv",
"file_type": "csv",
"file_size": 46,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/csv-too-many-columns.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-too-many-columns.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/csv/data_table.csv",
"file_type": "csv",
"file_size": 476,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "csv test: data_table",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/data_table.txt",
"markdown_file": "../../../../test_documents/ground_truth/csv/data_table.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/csv/stanley_cups.csv",
"file_type": "csv",
"file_size": 91,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "csv test: stanley_cups",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/stanley_cups.txt",
"markdown_file": "../../../../test_documents/ground_truth/csv/stanley_cups.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/markitdown/csv/test_mskanji.csv",
"file_type": "csv",
"file_size": 70,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from markitdown test suite",
"source": "markitdown",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/csv/test_mskanji.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/csv/test_mskanji.md"
}
}

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/dbf/stations.dbf",
"file_type": "dbf",
"file_size": 87623,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "dBASE file with station records",
"category": "tables"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/dbf/dbf_stations.txt",
"source": "manual"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../test_documents/docbook/docbook-chapter.dbk",
"file_type": "dbk",
"file_size": 1088,
"expected_frameworks": ["kreuzberg", "pandoc"],
"metadata": {
"description": "DocBook XML chapter with recursive sections (DBK extension)",
"category": "markup",
"size_class": "small"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.txt",
"markdown_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/markdown/tables.djot",
"file_type": "djot",
"file_size": 2102,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Djot markup with tables",
"category": "markup"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/djot/djot_tables.txt",
"source": "vision"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/unstructured/doc/duplicate-paragraphs.doc",
"file_type": "doc",
"file_size": 18432,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/doc/duplicate-paragraphs.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/doc/duplicate-paragraphs.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/unstructured/doc/fake-doc-emphasized-text.doc",
"file_type": "doc",
"file_size": 27648,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/doc/fake-doc-emphasized-text.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/doc/fake-doc-emphasized-text.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/unstructured/doc/fake.doc",
"file_type": "doc",
"file_size": 18432,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/doc/fake.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/doc/fake.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/unstructured/doc/simple.doc",
"file_type": "doc",
"file_size": 15872,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/doc/simple.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/doc/simple.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/doc/unit_test_lists.doc",
"file_type": "doc",
"file_size": 16384,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "doc test: unit_test_lists",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/doc/unit_test_lists.txt",
"markdown_file": "../../../../test_documents/ground_truth/doc/unit_test_lists.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../test_documents/docbook/docbook-chapter.docbook",
"file_type": "docbook",
"file_size": 1088,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "DocBook chapter with structured content",
"category": "documentation"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.txt",
"markdown_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/docbook/docbook-reader.docbook",
"file_type": "docbook",
"file_size": 37139,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "DocBook reader with larger content",
"category": "documentation"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/docbook/docbook_reader.txt",
"source": "vision"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../test_documents/docbook/tables.docbook4",
"file_type": "docbook",
"file_size": 7502,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "DocBook 4 table examples with simple, multiline, and headerless tables",
"category": "documentation"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/docbook/tables.txt",
"markdown_file": "../../../test_documents/ground_truth/docbook/tables.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../test_documents/docbook/tables.docbook5",
"file_type": "docbook",
"file_size": 7502,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "DocBook 5 table examples with simple, multiline, and headerless tables",
"category": "documentation"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/docbook/tables.txt",
"markdown_file": "../../../test_documents/ground_truth/docbook/tables.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../test_documents/docbook/docbook-xref.docbook",
"file_type": "docbook",
"file_size": 3129,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "DocBook cross-reference examples with XRef, links, figures, and tables",
"category": "documentation"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/docbook/docbook-xref.txt",
"markdown_file": "../../../test_documents/ground_truth/docbook/docbook-xref.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/docx_grouped_images.docx",
"file_type": "docx",
"file_size": 207463,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/docx_grouped_images.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_grouped_images.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/docx_rich_cells.docx",
"file_type": "docx",
"file_size": 24320,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/docx_rich_cells.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_rich_cells.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/docx/docx_tables.docx",
"file_type": "docx",
"file_size": 12725,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "docx test: docx_tables",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/docx_tables.txt",
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_tables.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/equations.docx",
"file_type": "docx",
"file_size": 15814,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "DOCX test document: equations",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"markdown_file": "../../../../test_documents/ground_truth/docx/equations.md",
"source": "pandoc",
"text_file": "../../../../test_documents/ground_truth/docx/equations.txt"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/docx/extraction_test.docx",
"file_type": "docx",
"file_size": 11296,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "docx test: extraction_test",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/extraction_test.txt",
"markdown_file": "../../../../test_documents/ground_truth/docx/extraction_test.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/docx/fake.docx",
"file_type": "docx",
"file_size": 36602,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "docx test: fake",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/fake.txt",
"markdown_file": "../../../../test_documents/ground_truth/docx/fake.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/docx/issue_359_list_whitespace.docx",
"file_type": "docx",
"file_size": 9170,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "docx test: issue_359_list_whitespace",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/issue_359_list_whitespace.txt",
"markdown_file": "../../../../test_documents/ground_truth/docx/issue_359_list_whitespace.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/list_after_num_headers.docx",
"file_type": "docx",
"file_size": 15698,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/list_after_num_headers.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/list_after_num_headers.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/lorem_ipsum.docx",
"file_type": "docx",
"file_size": 14817,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/lorem_ipsum.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/lorem_ipsum.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/markitdown/docx/rlink.docx",
"file_type": "docx",
"file_size": 13708,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from markitdown test suite",
"source": "markitdown",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/rlink.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/rlink.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/docx/sample_document.docx",
"file_type": "docx",
"file_size": 103966,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "docx test: sample_document",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/sample_document.txt",
"markdown_file": "../../../../test_documents/ground_truth/docx/sample_document.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/table_with_equations.docx",
"file_type": "docx",
"file_size": 14228,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/table_with_equations.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/table_with_equations.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/tablecell.docx",
"file_type": "docx",
"file_size": 15180,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/tablecell.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/tablecell.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/markitdown/docx/test.docx",
"file_type": "docx",
"file_size": 135824,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from markitdown test suite",
"source": "markitdown",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/test.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/test.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/test_emf_docx.docx",
"file_type": "docx",
"file_size": 426097,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/test_emf_docx.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/test_emf_docx.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/markitdown/docx/test_with_comment.docx",
"file_type": "docx",
"file_size": 12971,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from markitdown test suite",
"source": "markitdown",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/test_with_comment.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/test_with_comment.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/textbox.docx",
"file_type": "docx",
"file_size": 49206,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/textbox.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/textbox.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/unit_test_formatting.docx",
"file_type": "docx",
"file_size": 29099,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_formatting.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_formatting.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/unit_test_headers.docx",
"file_type": "docx",
"file_size": 13903,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_headers.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_headers.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/unit_test_headers_numbered.docx",
"file_type": "docx",
"file_size": 16880,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_headers_numbered.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_headers_numbered.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/unit_test_lists.docx",
"file_type": "docx",
"file_size": 15769,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_lists.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_lists.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/word_comments.docx",
"file_type": "docx",
"file_size": 37399,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/word_comments.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/word_comments.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/word_image_anchors.docx",
"file_type": "docx",
"file_size": 18560,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/word_image_anchors.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/word_image_anchors.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/word_sample.docx",
"file_type": "docx",
"file_size": 103966,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/word_sample.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/word_sample.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/docx/word_tables.docx",
"file_type": "docx",
"file_size": 16404,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/docx/word_tables.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/docx/word_tables.md"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../test_documents/docx/equations.docx",
"file_type": "docx",
"file_size": 15017,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "DOCX with mathematical equations - 15KB document with complex formatting",
"category": "docx-equations",
"size_class": "small"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/docx/docx_equations.txt",
"source": "vision"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../test_documents/docx/word_image_anchors.docx",
"file_type": "docx",
"file_size": 18560,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "DOCX with embedded images and anchors - 18KB document",
"category": "docx-images",
"size_class": "small"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/docx/docx_images.txt",
"source": "vision"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../test_documents/docx/test_emf_docx.docx",
"file_type": "docx",
"file_size": 426097,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Large formatted DOCX - 416KB document with EMF graphics",
"category": "docx-complex",
"size_class": "medium"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/docx/docx_large_formatted.txt",
"source": "vision"
}
}

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/docx/lorem_ipsum.docx",
"file_type": "docx",
"file_size": 14817,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Simple DOCX - Lorem ipsum text",
"category": "text"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/docx/docx_simple.txt",
"source": "vision"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-equals-attachment-filename.eml",
"file_type": "eml",
"file_size": 3297,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-equals-attachment-filename.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-inline-content-disposition.eml",
"file_type": "eml",
"file_size": 657,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-inline-content-disposition.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-no-html-content-1.eml",
"file_type": "eml",
"file_size": 7721,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-no-html-content-1.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-no-utf8-2008-07-16.062410.eml",
"file_type": "eml",
"file_size": 31978,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-no-utf8-2008-07-16.062410.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-no-utf8-2014-03-17.111517.eml",
"file_type": "eml",
"file_size": 14954,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-no-utf8-2014-03-17.111517.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-1.eml",
"file_type": "eml",
"file_size": 16085,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-1.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-2.eml",
"file_type": "eml",
"file_size": 26271,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-2.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-3.eml",
"file_type": "eml",
"file_size": 56028,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-3.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-4.eml",
"file_type": "eml",
"file_size": 34433,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-4.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-5.eml",
"file_type": "eml",
"file_size": 14567,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-5.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/email-with-image.eml",
"file_type": "eml",
"file_size": 296696,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/email-with-image.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-attachment.eml",
"file_type": "eml",
"file_size": 1704,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-attachment.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-b64.eml",
"file_type": "eml",
"file_size": 979,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-b64.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-header.eml",
"file_type": "eml",
"file_size": 1207,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-header.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-image-embedded.eml",
"file_type": "eml",
"file_size": 297126,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-image-embedded.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-malformed-encoding.eml",
"file_type": "eml",
"file_size": 898,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-malformed-encoding.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-utf-16-be.eml",
"file_type": "eml",
"file_size": 1614,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-utf-16-be.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-utf-16-le.eml",
"file_type": "eml",
"file_size": 1614,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-utf-16-le.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-utf-16.eml",
"file_type": "eml",
"file_size": 1616,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-utf-16.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email.eml",
"file_type": "eml",
"file_size": 807,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/fake-email.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/fake-encrypted.eml",
"file_type": "eml",
"file_size": 669,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/fake-encrypted.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/family-day.eml",
"file_type": "eml",
"file_size": 1291,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/family-day.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/mime-attach-mp3.eml",
"file_type": "eml",
"file_size": 70911,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/mime-attach-mp3.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/mime-different-plain-html.eml",
"file_type": "eml",
"file_size": 1397,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/mime-different-plain-html.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/mime-html-only.eml",
"file_type": "eml",
"file_size": 640,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/mime-html-only.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/mime-multi-to-cc-bcc.eml",
"file_type": "eml",
"file_size": 350,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/mime-multi-to-cc-bcc.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/mime-multipart-digest.eml",
"file_type": "eml",
"file_size": 721,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/mime-multipart-digest.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/mime-no-body.eml",
"file_type": "eml",
"file_size": 985,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/mime-no-body.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/mime-no-subject.eml",
"file_type": "eml",
"file_size": 162,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/mime-no-subject.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/mime-no-to.eml",
"file_type": "eml",
"file_size": 264,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/mime-no-to.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/mime-simple.eml",
"file_type": "eml",
"file_size": 452,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/mime-simple.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/mime-word-encoded-subject.eml",
"file_type": "eml",
"file_size": 261,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/mime-word-encoded-subject.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/rfc822-no-date.eml",
"file_type": "eml",
"file_size": 232,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/rfc822-no-date.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/simple-rfc-822.eml",
"file_type": "eml",
"file_size": 679,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/simple-rfc-822.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/test-invalid-date.eml",
"file_type": "eml",
"file_size": 161,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/test-invalid-date.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/test-iso-8601-date.eml",
"file_type": "eml",
"file_size": 135,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/test-iso-8601-date.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../../test_documents/vendored/unstructured/eml/test-rfc2822-date.eml",
"file_type": "eml",
"file_size": 151,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Document from unstructured test suite",
"source": "unstructured",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/eml/test-rfc2822-date.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/email/mailgun_pdf_attachment.eml",
"file_type": "eml",
"file_size": 1514,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Email with PDF attachment from Mailgun",
"category": "attachments"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/eml/eml_attachments.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/email/html_only.eml",
"file_type": "eml",
"file_size": 1150,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "HTML-only email with formatted content",
"category": "simple"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/eml/eml_html_only.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/email/multipart_email.eml",
"file_type": "eml",
"file_size": 763,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Multipart email with both plain text and HTML alternatives",
"category": "multipart"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/eml/eml_multipart.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,14 @@
{
"document": "../../../test_documents/email/plain_text_only.eml",
"file_type": "eml",
"file_size": 317,
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
"metadata": {
"description": "Simple plain text email without attachments",
"category": "simple"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/eml/eml_simple.txt",
"source": "python_email"
}
}

View File

@@ -0,0 +1,15 @@
{
"document": "../../../test_documents/data_formats/sample.enw",
"file_type": "enw",
"file_size": 242,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "EndNote citation format with journal article metadata",
"category": "citation",
"size_class": "tiny"
},
"ground_truth": {
"text_file": "../../../test_documents/ground_truth/enw/enw_citation.txt",
"source": "vision"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/epub/epub2_cover.epub",
"file_type": "epub",
"file_size": 11794,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "epub test: epub2_cover",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/epub/epub2_cover.txt",
"markdown_file": "../../../../test_documents/ground_truth/epub/epub2_cover.md",
"source": "pandoc"
}
}

Some files were not shown because too many files have changed in this diff Show More