This commit is contained in:
28
tools/benchmark-harness/fixtures/pdf/2023-06-20-PV.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/2023-06-20-PV.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/2023-06-20-PV.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 35535,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/2023-06-20-PV.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/2023-06-20-PV.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/2203.01017v2.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/2203.01017v2.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/pdf/2203.01017v2.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 7215177,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/2203.01017v2.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/2203.01017v2.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/2206.01062.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/2206.01062.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/pdf/2206.01062.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 4310680,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/2206.01062.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/2206.01062.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/2305.03393v1-pg9.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/2305.03393v1-pg9.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/pdf/2305.03393v1-pg9.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 162131,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/2305.03393v1-pg9.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/2305.03393v1-pg9.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/2305.03393v1.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/2305.03393v1.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/pdf/2305.03393v1.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 4318934,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/2305.03393v1.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/2305.03393v1.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/4833695495684096.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/4833695495684096.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/4833695495684096.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 3759,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/4833695495684096.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/4833695495684096.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/pdf/RECEIPT-2024-TXN-98765_retail_purchase.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 8459,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/RECEIPT-2024-TXN-98765_retail_purchase.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/RECEIPT-2024-TXN-98765_retail_purchase.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/pdf/REPAIR-2022-INV-001_multipage.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 176629,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/REPAIR-2022-INV-001_multipage.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/REPAIR-2022-INV-001_multipage.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/pdf/SPARSE-2024-INV-1234_borderless_table.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 19649,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/SPARSE-2024-INV-1234_borderless_table.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/SPARSE-2024-INV-1234_borderless_table.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/WARN-Report-for-7-1-2015-to-03-25-2016.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 478619,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/WARN-Report-for-7-1-2015-to-03-25-2016.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/WARN-Report-for-7-1-2015-to-03-25-2016.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/pdf/amt_handbook_sample.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 673416,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/amt_handbook_sample.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/amt_handbook_sample.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/annotations-unicode-issues.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 57927,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/annotations-unicode-issues.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/annotations-unicode-issues.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/code_and_formula.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/code_and_formula.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/pdf/code_and_formula.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 89031,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/code_and_formula.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/code_and_formula.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/cupertino_usd_4-6-16.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 80578,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/cupertino_usd_4-6-16.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/cupertino_usd_4-6-16.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/docling.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/docling.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/docling.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 5566575,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Docling technical paper PDF",
|
||||
"source": "kreuzberg",
|
||||
"size_category": "large"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/docling.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/docling.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/extra-attrs-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 15170,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/extra-attrs-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/extra-attrs-example.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/federal-register-2020-17221.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 713992,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/federal-register-2020-17221.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/federal-register-2020-17221.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/figure_structure.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/figure_structure.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/figure_structure.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 24694,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/figure_structure.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/figure_structure.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/hello_structure.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/hello_structure.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/hello_structure.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 2641,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/hello_structure.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/hello_structure.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/iso_21111_10.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/iso_21111_10.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/iso_21111_10_2021_road_vehicles_in_vehicle_ethernet_conformance_test_plans.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 2343170,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "ISO 21111-10:2021 Road vehicles — In-vehicle Ethernet — Conformance test plans. Table-heavy standards document.",
|
||||
"source": "iso",
|
||||
"size_category": "large"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/iso_21111_10_2021.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/iso_21111_10_2021.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-1114-dedupe-chars.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 43483,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-1114-dedupe-chars.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-1114-dedupe-chars.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-1181.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-1181.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-1181.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 13437,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-1181.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-1181.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-1279-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-1279-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-1279-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 37214,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-1279-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-1279-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-140-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-140-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-140-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 5251,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-140-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-140-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-316-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-316-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-316-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 138691,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-316-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-316-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-461-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-461-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-461-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 154889,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-461-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-461-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-463-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-463-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-463-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 88676,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-463-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-463-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-466-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-466-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-466-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 13569,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-466-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-466-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-53-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-53-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-53-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 133586,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-53-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-53-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-598-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-598-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-598-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 504551,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-598-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-598-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-842-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-842-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-842-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 287855,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-842-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-842-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-848.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-848.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-848.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 71805,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-848.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-848.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-905.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-905.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/searchable.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 18810,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Searchable PDF with extractable text content",
|
||||
"source": "kreuzberg",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-905.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-905.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-982-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-982-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-982-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 1029061,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-982-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-982-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/issue-987-test.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/issue-987-test.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/issue-987-test.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 8805,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/issue-987-test.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/issue-987-test.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/la-precinct-bulletin-2014-p1.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 20188,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/la-precinct-bulletin-2014-p1.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/la-precinct-bulletin-2014-p1.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/line-char-render-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 13649,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/line-char-render-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/line-char-render-example.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/pdf/masterformat_partial_numbering.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 2114,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/masterformat_partial_numbering.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/masterformat_partial_numbering.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/mcid_example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/mcid_example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/mcid_example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 24694,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/mcid_example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/mcid_example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/multi_page.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/multi_page.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/pdf/multi_page.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 128322,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/multi_page.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/multi_page.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/nics-background-checks-2015-11-rotated.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 90415,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nics-background-checks-2015-11-rotated.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nics-background-checks-2015-11-rotated.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/nics-background-checks-2015-11.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 90468,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nics-background-checks-2015-11.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nics-background-checks-2015-11.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_001.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_001.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_001.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 525343,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_001.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_001.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_002.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_002.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_002.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 126909,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_002.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_002.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_003.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_003.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_003.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 205324,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_003.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_003.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_004.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_004.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_004.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 63070,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_004.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_004.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_005.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_005.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_005.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 932335,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_005.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_005.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_006.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_006.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_006.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 80325,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_006.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_006.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_007.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_007.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_007.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 185508,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_007.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_007.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_008.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_008.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_008.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 143492,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_008.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_008.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_009.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_009.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_009.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 498806,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_009.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_009.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_010.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_010.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_010.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 107951,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_010.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_010.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_011.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_011.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_011.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 191109,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_011.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_011.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_012.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_012.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_012.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 141727,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_012.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_012.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_013.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_013.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_013.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 61858,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_013.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_013.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_014.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_014.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_014.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 848830,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_014.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_014.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_015.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_015.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_015.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 46652,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_015.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_015.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_016.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_016.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_016.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 454010,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_016.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_016.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_017.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_017.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_017.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 911122,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_017.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_017.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_018.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_018.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_018.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 516747,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_018.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_018.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_019.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_019.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_019.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 183646,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_019.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_019.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_020.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_020.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_020.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 115368,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_020.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_020.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_021.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_021.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_021.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 469199,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_021.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_021.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_022.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_022.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_022.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 34841,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_022.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_022.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_023.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_023.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_023.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 74186,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_023.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_023.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_024.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_024.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_024.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 46310,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_024.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_024.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_025.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_025.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_025.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 2094,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_025.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_025.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_026.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_026.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_026.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 44245,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_026.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_026.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_027.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_027.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_027.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 390183,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_027.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_027.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_028.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_028.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_028.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 269676,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_028.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_028.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_029.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_029.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_029.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 274635,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_029.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_029.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_030.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_030.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_030.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 772983,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_030.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_030.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_031.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_031.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_031.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 40157,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_031.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_031.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_032.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_032.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_032.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 112154,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_032.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_032.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_033.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_033.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_033.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 181946,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_033.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_033.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_034.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_034.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_034.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 219515,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_034.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_034.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_035.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_035.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_035.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 813601,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_035.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_035.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_036.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_036.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_036.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 399271,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_036.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_036.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_037.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_037.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_037.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 561676,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_037.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_037.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_039.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_039.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_039.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 69496,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_039.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_039.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_040.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_040.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_040.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 476639,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_040.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_040.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_041.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_041.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_041.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 733602,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_041.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_041.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_042.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_042.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_042.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 166420,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_042.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_042.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_043.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_043.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_043.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 119114,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_043.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_043.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_044.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_044.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_044.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 107812,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_044.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_044.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_045.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_045.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_045.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 129544,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_045.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_045.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_046.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_046.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_046.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 594787,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_046.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_046.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_047.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_047.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_047.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 420052,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_047.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_047.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_048.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_048.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_048.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 572026,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_048.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_048.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_049.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_049.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_049.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 122803,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_049.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_049.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/nougat_050.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/nougat_050.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/nougat_050.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 108353,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Nougat benchmark PDF document",
|
||||
"source": "nougat",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/nougat_050.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/nougat_050.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/page-boxes-example.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/page-boxes-example.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/page-boxes-example.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 1187,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/page-boxes-example.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/page-boxes-example.md"
|
||||
}
|
||||
}
|
||||
28
tools/benchmark-harness/fixtures/pdf/pdf_structure.json
Normal file
28
tools/benchmark-harness/fixtures/pdf/pdf_structure.json
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/pdfplumber/pdf/pdf_structure.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 34467,
|
||||
"expected_frameworks": [
|
||||
"kreuzberg",
|
||||
"docling",
|
||||
"markitdown",
|
||||
"mineru",
|
||||
"pdfminer",
|
||||
"pdfplumber",
|
||||
"pdftotext",
|
||||
"pymupdf4llm",
|
||||
"pypdf",
|
||||
"tika",
|
||||
"unstructured"
|
||||
],
|
||||
"metadata": {
|
||||
"description": "Document from pdfplumber test suite",
|
||||
"source": "pdfplumber",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/pdf_structure.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdf_structure.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/pdfa_001.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/pdfa_001.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/pdfa_001.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 44245,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "PDF/A benchmark document",
|
||||
"source": "pdfa",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_001.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_001.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/pdfa_002.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/pdfa_002.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/pdfa_002.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 390183,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "PDF/A benchmark document",
|
||||
"source": "pdfa",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_002.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_002.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/pdfa_003.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/pdfa_003.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/pdfa_003.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 269676,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "PDF/A benchmark document",
|
||||
"source": "pdfa",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_003.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_003.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/pdfa_004.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/pdfa_004.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/pdfa_004.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 274635,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "PDF/A benchmark document",
|
||||
"source": "pdfa",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_004.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_004.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/pdfa_005.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/pdfa_005.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/pdfa_005.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 772983,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "PDF/A benchmark document",
|
||||
"source": "pdfa",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_005.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_005.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/pdfa_006.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/pdfa_006.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/pdfa_006.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 40157,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "PDF/A benchmark document",
|
||||
"source": "pdfa",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_006.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_006.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/pdf/pdfa_007.json
Normal file
16
tools/benchmark-harness/fixtures/pdf/pdfa_007.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/pdf/pdfa_007.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 112154,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "PDF/A benchmark document",
|
||||
"source": "pdfa",
|
||||
"size_category": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/pdf/pdfa_007.txt",
|
||||
"source": "mistral-pixtral",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/pdf/pdfa_007.md"
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user