Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/html/complex_table.html",
"file_type": "html",
"file_size": 2697,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "HTML test document: complex_table",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"markdown_file": "../../../../test_documents/ground_truth/html/complex_table.md",
"source": "pandoc",
"text_file": "../../../../test_documents/ground_truth/html/complex_table.txt"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/example_01.html",
"file_type": "html",
"file_size": 680,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/example_01.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/example_01.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/example_02.html",
"file_type": "html",
"file_size": 461,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/example_02.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/example_02.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/example_03.html",
"file_type": "html",
"file_size": 1932,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/example_03.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/example_03.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/example_04.html",
"file_type": "html",
"file_size": 676,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/example_04.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/example_04.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/example_05.html",
"file_type": "html",
"file_size": 469,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/example_05.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/example_05.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/example_06.html",
"file_type": "html",
"file_size": 475,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/example_06.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/example_06.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/example_07.html",
"file_type": "html",
"file_size": 1227,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/example_07.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/example_07.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/example_08.html",
"file_type": "html",
"file_size": 2841,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/example_08.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/example_08.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/formatting.html",
"file_type": "html",
"file_size": 1430,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/formatting.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/formatting.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/html/hip_13044_b.html",
"file_type": "html",
"file_size": 278128,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "HTML test document: hip_13044_b",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"markdown_file": "../../../../test_documents/ground_truth/html/hip_13044_b.md",
"source": "pandoc",
"text_file": "../../../../test_documents/ground_truth/html/hip_13044_b.txt"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/html/html.html",
"file_type": "html",
"file_size": 1397,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "HTML test document: html",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"markdown_file": "../../../../test_documents/ground_truth/html/html.md",
"source": "pandoc",
"text_file": "../../../../test_documents/ground_truth/html/html.txt"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/html_code_snippets.html",
"file_type": "html",
"file_size": 1496,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/html_code_snippets.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/html_code_snippets.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/html_heading_in_p.html",
"file_type": "html",
"file_size": 3567,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/html_heading_in_p.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/html_heading_in_p.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/html_rich_table_cells.html",
"file_type": "html",
"file_size": 4552,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/html_rich_table_cells.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/html_rich_table_cells.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/hyperlink_01.html",
"file_type": "html",
"file_size": 262,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/hyperlink_01.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/hyperlink_01.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/hyperlink_03.html",
"file_type": "html",
"file_size": 1590,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/hyperlink_03.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/hyperlink_03.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/hyperlink_04.html",
"file_type": "html",
"file_size": 128,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/hyperlink_04.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/hyperlink_04.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/hyperlink_05.html",
"file_type": "html",
"file_size": 1135,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/hyperlink_05.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/hyperlink_05.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/html/international_emergency_medicine.html",
"file_type": "html",
"file_size": 352033,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "HTML test document: international_emergency_medicine",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/international_emergency_medicine.txt",
"markdown_file": "../../../../test_documents/ground_truth/html/international_emergency_medicine.md",
"source": "pandoc"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/html/simple_table.html",
"file_type": "html",
"file_size": 1490,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "HTML test document: simple_table",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"markdown_file": "../../../../test_documents/ground_truth/html/simple_table.md",
"source": "pandoc",
"text_file": "../../../../test_documents/ground_truth/html/simple_table.txt"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/html/sinthgunt.html",
"file_type": "html",
"file_size": 125362,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "HTML test document: sinthgunt",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"markdown_file": "../../../../test_documents/ground_truth/html/sinthgunt.md",
"source": "pandoc",
"text_file": "../../../../test_documents/ground_truth/html/sinthgunt.txt"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/table_01.html",
"file_type": "html",
"file_size": 544,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/table_01.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/table_01.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/table_02.html",
"file_type": "html",
"file_size": 594,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/table_02.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/table_02.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/table_03.html",
"file_type": "html",
"file_size": 675,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/table_03.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/table_03.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/table_04.html",
"file_type": "html",
"file_size": 713,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/table_04.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/table_04.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/table_05.html",
"file_type": "html",
"file_size": 856,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/table_05.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/table_05.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/table_06.html",
"file_type": "html",
"file_size": 2261,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/table_06.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/table_06.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/table_with_heading_01.html",
"file_type": "html",
"file_size": 521,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/table_with_heading_01.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/table_with_heading_01.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/table_with_heading_02.html",
"file_type": "html",
"file_size": 650,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/table_with_heading_02.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/table_with_heading_02.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/html/taylor_swift.html",
"file_type": "html",
"file_size": 4118693,
"expected_frameworks": ["kreuzberg"],
"metadata": {
"description": "HTML test document: taylor_swift",
"source": "pandoc-generated",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/taylor_swift.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/taylor_swift.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/markitdown/html/test_blog.html",
"file_type": "html",
"file_size": 25965,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from markitdown test suite",
"source": "markitdown",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/test_blog.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/test_blog.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/markitdown/html/test_wikipedia.html",
"file_type": "html",
"file_size": 394444,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from markitdown test suite",
"source": "markitdown",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/test_wikipedia.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/test_wikipedia.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/unit_test_01.html",
"file_type": "html",
"file_size": 198,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/unit_test_01.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/unit_test_01.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/wiki_duck.html",
"file_type": "html",
"file_size": 244709,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/wiki_duck.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/wiki_duck.md"
}
}

View File

@@ -0,0 +1,16 @@
{
"document": "../../../../test_documents/vendored/docling/html/word_tables.docx.html",
"file_type": "html",
"file_size": 4748,
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
"metadata": {
"description": "Document from docling test suite",
"source": "docling",
"size_category": "small"
},
"ground_truth": {
"text_file": "../../../../test_documents/ground_truth/html/word_tables.docx.txt",
"source": "pandoc",
"markdown_file": "../../../../test_documents/ground_truth/html/word_tables.docx.md"
}
}