Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/2023-06-20-PV.pdf",
"file_type": "pdf",
"file_size": 35535,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/2023-06-20-PV.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/2023-06-20-PV.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/docling/pdf/2203.01017v2.pdf",
"file_type": "pdf",
"file_size": 7215177,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/2203.01017v2.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/2203.01017v2.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/docling/pdf/2206.01062.pdf",
"file_type": "pdf",
"file_size": 4310680,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/2206.01062.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/2206.01062.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/docling/pdf/2305.03393v1-pg9.pdf",
"file_type": "pdf",
"file_size": 162131,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/2305.03393v1-pg9.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/2305.03393v1-pg9.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/docling/pdf/2305.03393v1.pdf",
"file_type": "pdf",
"file_size": 4318934,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/2305.03393v1.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/2305.03393v1.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/4833695495684096.pdf",
"file_type": "pdf",
"file_size": 3759,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/4833695495684096.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/4833695495684096.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/markitdown/pdf/RECEIPT-2024-TXN-98765_retail_purchase.pdf",
"file_type": "pdf",
"file_size": 8459,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from markitdown test suite",
"source": "markitdown",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/RECEIPT-2024-TXN-98765_retail_purchase.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/RECEIPT-2024-TXN-98765_retail_purchase.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/markitdown/pdf/REPAIR-2022-INV-001_multipage.pdf",
"file_type": "pdf",
"file_size": 176629,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from markitdown test suite",
"source": "markitdown",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/REPAIR-2022-INV-001_multipage.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/REPAIR-2022-INV-001_multipage.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/markitdown/pdf/SPARSE-2024-INV-1234_borderless_table.pdf",
"file_type": "pdf",
"file_size": 19649,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from markitdown test suite",
"source": "markitdown",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/SPARSE-2024-INV-1234_borderless_table.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/SPARSE-2024-INV-1234_borderless_table.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/WARN-Report-for-7-1-2015-to-03-25-2016.pdf",
"file_type": "pdf",
"file_size": 478619,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/WARN-Report-for-7-1-2015-to-03-25-2016.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/WARN-Report-for-7-1-2015-to-03-25-2016.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/docling/pdf/amt_handbook_sample.pdf",
"file_type": "pdf",
"file_size": 673416,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/amt_handbook_sample.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/amt_handbook_sample.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/annotations-unicode-issues.pdf",
"file_type": "pdf",
"file_size": 57927,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/annotations-unicode-issues.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/annotations-unicode-issues.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/docling/pdf/code_and_formula.pdf",
"file_type": "pdf",
"file_size": 89031,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/code_and_formula.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/code_and_formula.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/cupertino_usd_4-6-16.pdf",
"file_type": "pdf",
"file_size": 80578,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/cupertino_usd_4-6-16.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/cupertino_usd_4-6-16.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/pdf/docling.pdf",
"file_type": "pdf",
"file_size": 5566575,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Docling technical paper PDF",
"source": "kreuzberg",
"size_category": "large"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/docling.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/docling.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/extra-attrs-example.pdf",
"file_type": "pdf",
"file_size": 15170,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/extra-attrs-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/extra-attrs-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/federal-register-2020-17221.pdf",
"file_type": "pdf",
"file_size": 713992,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/federal-register-2020-17221.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/federal-register-2020-17221.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/figure_structure.pdf",
"file_type": "pdf",
"file_size": 24694,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/figure_structure.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/figure_structure.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/hello_structure.pdf",
"file_type": "pdf",
"file_size": 2641,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/hello_structure.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/hello_structure.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/iso_21111_10_2021_road_vehicles_in_vehicle_ethernet_conformance_test_plans.pdf",
"file_type": "pdf",
"file_size": 2343170,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "ISO 21111-10:2021 Road vehicles — In-vehicle Ethernet — Conformance test plans. Table-heavy standards document.",
"source": "iso",
"size_category": "large"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/iso_21111_10_2021.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/iso_21111_10_2021.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-1114-dedupe-chars.pdf",
"file_type": "pdf",
"file_size": 43483,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-1114-dedupe-chars.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-1114-dedupe-chars.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-1181.pdf",
"file_type": "pdf",
"file_size": 13437,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-1181.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-1181.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-1279-example.pdf",
"file_type": "pdf",
"file_size": 37214,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-1279-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-1279-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-140-example.pdf",
"file_type": "pdf",
"file_size": 5251,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-140-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-140-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-316-example.pdf",
"file_type": "pdf",
"file_size": 138691,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-316-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-316-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-461-example.pdf",
"file_type": "pdf",
"file_size": 154889,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-461-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-461-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-463-example.pdf",
"file_type": "pdf",
"file_size": 88676,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-463-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-463-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-466-example.pdf",
"file_type": "pdf",
"file_size": 13569,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-466-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-466-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-53-example.pdf",
"file_type": "pdf",
"file_size": 133586,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-53-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-53-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-598-example.pdf",
"file_type": "pdf",
"file_size": 504551,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-598-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-598-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-842-example.pdf",
"file_type": "pdf",
"file_size": 287855,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-842-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-842-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-848.pdf",
"file_type": "pdf",
"file_size": 71805,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-848.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-848.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/pdf/searchable.pdf",
"file_type": "pdf",
"file_size": 18810,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Searchable PDF with extractable text content",
"source": "kreuzberg",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-905.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-905.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-982-example.pdf",
"file_type": "pdf",
"file_size": 1029061,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-982-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-982-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-987-test.pdf",
"file_type": "pdf",
"file_size": 8805,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/issue-987-test.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-987-test.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/la-precinct-bulletin-2014-p1.pdf",
"file_type": "pdf",
"file_size": 20188,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/la-precinct-bulletin-2014-p1.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/la-precinct-bulletin-2014-p1.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/line-char-render-example.pdf",
"file_type": "pdf",
"file_size": 13649,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/line-char-render-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/line-char-render-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/markitdown/pdf/masterformat_partial_numbering.pdf",
"file_type": "pdf",
"file_size": 2114,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from markitdown test suite",
"source": "markitdown",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/masterformat_partial_numbering.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/masterformat_partial_numbering.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/mcid_example.pdf",
"file_type": "pdf",
"file_size": 24694,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/mcid_example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/mcid_example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/docling/pdf/multi_page.pdf",
"file_type": "pdf",
"file_size": 128322,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/multi_page.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/multi_page.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/nics-background-checks-2015-11-rotated.pdf",
"file_type": "pdf",
"file_size": 90415,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nics-background-checks-2015-11-rotated.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nics-background-checks-2015-11-rotated.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/nics-background-checks-2015-11.pdf",
"file_type": "pdf",
"file_size": 90468,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nics-background-checks-2015-11.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nics-background-checks-2015-11.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_001.pdf",
"file_type": "pdf",
"file_size": 525343,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_001.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_001.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_002.pdf",
"file_type": "pdf",
"file_size": 126909,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_002.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_002.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_003.pdf",
"file_type": "pdf",
"file_size": 205324,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_003.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_003.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_004.pdf",
"file_type": "pdf",
"file_size": 63070,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_004.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_004.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_005.pdf",
"file_type": "pdf",
"file_size": 932335,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_005.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_005.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_006.pdf",
"file_type": "pdf",
"file_size": 80325,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_006.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_006.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_007.pdf",
"file_type": "pdf",
"file_size": 185508,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_007.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_007.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_008.pdf",
"file_type": "pdf",
"file_size": 143492,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_008.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_008.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_009.pdf",
"file_type": "pdf",
"file_size": 498806,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_009.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_009.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_010.pdf",
"file_type": "pdf",
"file_size": 107951,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_010.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_010.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_011.pdf",
"file_type": "pdf",
"file_size": 191109,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_011.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_011.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_012.pdf",
"file_type": "pdf",
"file_size": 141727,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_012.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_012.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_013.pdf",
"file_type": "pdf",
"file_size": 61858,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_013.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_013.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_014.pdf",
"file_type": "pdf",
"file_size": 848830,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_014.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_014.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_015.pdf",
"file_type": "pdf",
"file_size": 46652,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_015.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_015.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_016.pdf",
"file_type": "pdf",
"file_size": 454010,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_016.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_016.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_017.pdf",
"file_type": "pdf",
"file_size": 911122,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_017.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_017.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_018.pdf",
"file_type": "pdf",
"file_size": 516747,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_018.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_018.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_019.pdf",
"file_type": "pdf",
"file_size": 183646,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_019.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_019.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_020.pdf",
"file_type": "pdf",
"file_size": 115368,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_020.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_020.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_021.pdf",
"file_type": "pdf",
"file_size": 469199,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_021.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_021.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_022.pdf",
"file_type": "pdf",
"file_size": 34841,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_022.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_022.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_023.pdf",
"file_type": "pdf",
"file_size": 74186,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_023.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_023.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_024.pdf",
"file_type": "pdf",
"file_size": 46310,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_024.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_024.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_025.pdf",
"file_type": "pdf",
"file_size": 2094,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_025.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_025.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_026.pdf",
"file_type": "pdf",
"file_size": 44245,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_026.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_026.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_027.pdf",
"file_type": "pdf",
"file_size": 390183,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_027.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_027.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_028.pdf",
"file_type": "pdf",
"file_size": 269676,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_028.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_028.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_029.pdf",
"file_type": "pdf",
"file_size": 274635,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_029.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_029.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_030.pdf",
"file_type": "pdf",
"file_size": 772983,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_030.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_030.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_031.pdf",
"file_type": "pdf",
"file_size": 40157,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_031.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_031.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_032.pdf",
"file_type": "pdf",
"file_size": 112154,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_032.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_032.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_033.pdf",
"file_type": "pdf",
"file_size": 181946,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_033.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_033.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_034.pdf",
"file_type": "pdf",
"file_size": 219515,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_034.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_034.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_035.pdf",
"file_type": "pdf",
"file_size": 813601,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_035.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_035.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_036.pdf",
"file_type": "pdf",
"file_size": 399271,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_036.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_036.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_037.pdf",
"file_type": "pdf",
"file_size": 561676,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_037.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_037.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_039.pdf",
"file_type": "pdf",
"file_size": 69496,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_039.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_039.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_040.pdf",
"file_type": "pdf",
"file_size": 476639,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_040.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_040.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_041.pdf",
"file_type": "pdf",
"file_size": 733602,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_041.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_041.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_042.pdf",
"file_type": "pdf",
"file_size": 166420,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_042.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_042.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_043.pdf",
"file_type": "pdf",
"file_size": 119114,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_043.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_043.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_044.pdf",
"file_type": "pdf",
"file_size": 107812,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_044.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_044.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_045.pdf",
"file_type": "pdf",
"file_size": 129544,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_045.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_045.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_046.pdf",
"file_type": "pdf",
"file_size": 594787,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_046.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_046.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_047.pdf",
"file_type": "pdf",
"file_size": 420052,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_047.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_047.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_048.pdf",
"file_type": "pdf",
"file_size": 572026,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_048.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_048.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_049.pdf",
"file_type": "pdf",
"file_size": 122803,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_049.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_049.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/nougat_050.pdf",
"file_type": "pdf",
"file_size": 108353,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "Nougat benchmark PDF document",
"source": "nougat",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_050.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_050.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/page-boxes-example.pdf",
"file_type": "pdf",
"file_size": 1187,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/page-boxes-example.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/page-boxes-example.md"
}
}

View File

@@ -0,0 +1,28 @@
{
"document": "../../../../test_documents/vendored/pdfplumber/pdf/pdf_structure.pdf",
"file_type": "pdf",
"file_size": 34467,
"expected_frameworks": [
"kreuzberg",
"docling",
"markitdown",
"mineru",
"pdfminer",
"pdfplumber",
"pdftotext",
"pymupdf4llm",
"pypdf",
"tika",
"unstructured"
],
"metadata": {
"description": "Document from pdfplumber test suite",
"source": "pdfplumber",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/pdf_structure.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdf_structure.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/pdfa_001.pdf",
"file_type": "pdf",
"file_size": 44245,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "PDF/A benchmark document",
"source": "pdfa",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_001.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_001.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/pdfa_002.pdf",
"file_type": "pdf",
"file_size": 390183,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "PDF/A benchmark document",
"source": "pdfa",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_002.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_002.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/pdfa_003.pdf",
"file_type": "pdf",
"file_size": 269676,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "PDF/A benchmark document",
"source": "pdfa",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_003.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_003.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/pdfa_004.pdf",
"file_type": "pdf",
"file_size": 274635,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "PDF/A benchmark document",
"source": "pdfa",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_004.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_004.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/pdfa_005.pdf",
"file_type": "pdf",
"file_size": 772983,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "PDF/A benchmark document",
"source": "pdfa",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_005.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_005.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/pdfa_006.pdf",
"file_type": "pdf",
"file_size": 40157,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "PDF/A benchmark document",
"source": "pdfa",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_006.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_006.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/pdf/pdfa_007.pdf",
"file_type": "pdf",
"file_size": 112154,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "PDF/A benchmark document",
"source": "pdfa",
"size_category": "medium"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_007.txt",
"source": "mistral-pixtral",
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_007.md"
}
}

Some files were not shown because too many files have changed in this diff Show More