This commit is contained in:
55
tools/benchmark-harness/Cargo.toml
Normal file
55
tools/benchmark-harness/Cargo.toml
Normal file
@@ -0,0 +1,55 @@
|
||||
# Package metadata for the benchmark harness crate. Keys written as
# `{ workspace = true }` are inherited from the workspace manifest rather
# than being set here.
[package]
name = "benchmark-harness"
version = { workspace = true }
authors = { workspace = true }
# Binary selected by `cargo run` when no `--bin` is given.
default-run = "benchmark-harness"
edition = { workspace = true }
homepage = "https://kreuzberg.dev"
license = { workspace = true }
repository = { workspace = true }
rust-version = { workspace = true }
|
||||
|
||||
# Library target. The crate name uses an underscore, unlike the hyphenated
# package name.
[lib]
path = "src/lib.rs"
name = "benchmark_harness"
|
||||
|
||||
# CLI executable target; its name matches `default-run` in `[package]`.
[[bin]]
path = "src/main.rs"
name = "benchmark-harness"
|
||||
|
||||
# Opt-in instrumentation features; none are enabled by default.
[features]
default = []
# Enables the optional `tikv-jemallocator` and `tikv-jemalloc-ctl` dependencies.
memory-profiling = [
    "tikv-jemallocator",
    "tikv-jemalloc-ctl",
]
# Enables the optional `pprof` dependency.
profiling = ["pprof"]
|
||||
|
||||
# Runtime dependencies, kept in alphabetical order. Entries of the form
# `<name>.workspace = true` inherit their specification from the workspace
# manifest.
[dependencies]
ahash.workspace = true
async-trait.workspace = true
chrono.workspace = true
clap.workspace = true
futures.workspace = true
kreuzberg = { path = "../../crates/kreuzberg", features = ["full"] }
num_cpus.workspace = true
# Optional; enabled by the `profiling` feature.
pprof = { version = "0.15", features = ["flamegraph", "criterion"], optional = true }
pulldown-cmark = "0.13"
rayon.workspace = true
regex = "1"
serde.workspace = true
serde_json.workspace = true
shellexpand = "3"
sysinfo = "0.38"
tempfile.workspace = true
thiserror.workspace = true
# Optional; both jemalloc crates are enabled by the `memory-profiling` feature.
tikv-jemalloc-ctl = { version = "0.7", features = ["stats"], optional = true }
tikv-jemallocator = { version = "0.7", optional = true }
tokio = { workspace = true, features = ["full"] }
tracing.workspace = true
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
which = "8"
|
||||
|
||||
# No dev-only dependencies are declared in this table.
[dev-dependencies]
|
||||
394
tools/benchmark-harness/README.md
Normal file
394
tools/benchmark-harness/README.md
Normal file
@@ -0,0 +1,394 @@
|
||||
# Benchmark Harness
|
||||
|
||||
Rust CLI tool for comparative benchmarking of document extraction across 13 Kreuzberg language bindings and 12 reference frameworks. Measures performance (latency, throughput, memory) and scores extraction quality (TF1, a text-level F1, and SF1, a structural F1) against ground truth.
|
||||
|
||||
## Overview
|
||||
|
||||
The benchmark harness serves two distinct workflows:
|
||||
|
||||
- **CI benchmarking** -- automated cross-framework comparison triggered via GitHub Actions, producing aggregated results published as GitHub Releases.
|
||||
- **Local quality assessment** -- developer-facing pipeline comparison against ground truth for extraction quality triage and regression detection.
|
||||
|
||||
## Architecture
|
||||
|
||||
```text
|
||||
CLI (clap)
|
||||
|
|
||||
+-- run --> AdapterRegistry --> BenchmarkRunner --> results.json
|
||||
| |
|
||||
| +-- NativeAdapter (in-process Kreuzberg)
|
||||
| +-- SubprocessAdapter (persistent child process)
|
||||
| +-- BatchSubprocessAdapter (batch API)
|
||||
|
|
||||
+-- compare --> ComparisonConfig --> Pipeline extraction --> Quality scoring
|
||||
+-- pipeline-benchmark --> 6-path matrix --> TF1/SF1 scoring --> Triage tables
|
||||
+-- consolidate --> Load multi-job results --> Aggregate percentiles
|
||||
+-- validate-gt --> Fixture scan --> HTML cleanup --> Integrity report
|
||||
+-- survey --> Corpus-wide extraction stats
|
||||
+-- model-benchmark --> Layout model A/B comparison
|
||||
+-- embed-benchmark --> Embedding throughput measurement
|
||||
```
|
||||
|
||||
### Module Structure
|
||||
|
||||
| Module | Purpose |
|
||||
| ----------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `main.rs` | CLI entry point (clap subcommands) |
|
||||
| `adapter.rs` | `FrameworkAdapter` trait definition |
|
||||
| `adapters/` | Adapter implementations: subprocess (persistent/batch), native (in-process), kreuzberg factory functions for all languages |
|
||||
| `runner.rs` | Benchmark orchestration, iteration control, resource monitoring |
|
||||
| `quality.rs` | TF1: token-level bag-of-words F1 scoring |
|
||||
| `markdown_quality.rs` | SF1: structural block-level F1 scoring |
|
||||
| `comparison.rs` | Multi-pipeline extraction with quality guardrails |
|
||||
| `pipeline_benchmark.rs` | 6-path extraction matrix benchmark |
|
||||
| `corpus.rs`, `fixture.rs` | Fixture loading, filtering, validation |
|
||||
| `aggregate.rs`, `consolidate.rs` | Multi-job result merging and percentile aggregation |
|
||||
| `output.rs`, `stats.rs` | Result serialization and statistical analysis |
|
||||
| `validate_gt.rs` | Ground truth integrity checks and HTML-to-GFM cleanup |
|
||||
| `monitoring.rs` | CPU and memory sampling during benchmarks |
|
||||
| `profiling.rs`, `profile_report.rs` | Flamegraph generation (requires `profiling` feature) |
|
||||
| `survey.rs` | Corpus-wide extraction statistics |
|
||||
| `model_benchmark.rs` | Layout model A/B comparison |
|
||||
| `embed_benchmark.rs` | Embedding throughput benchmarks |
|
||||
| `sizes.rs` | Framework installation footprint measurement |
|
||||
|
||||
## Quality Scoring
|
||||
|
||||
### TF1 (Text F1)
|
||||
|
||||
Token-level bag-of-words F1 between extracted text and ground truth.
|
||||
|
||||
- Tokenization: lowercase, split on whitespace, keep alphanumeric tokens plus `.` and `,`
|
||||
- Separate numeric-token F1 for number-heavy documents (financial, scientific)
|
||||
- Combined score: `quality_score = 0.6 * f1_text + 0.4 * f1_numeric`
|
||||
|
||||
### SF1 (Structural F1)
|
||||
|
||||
Block-level matching between extracted markdown and ground truth markdown.
|
||||
|
||||
- **Block types:** Heading1-6, Paragraph, CodeBlock, Formula, Table, ListItem, Image
|
||||
- **Type weights:** Headings = 2.0, Code/Formula/Table = 1.5, ListItem = 1.0, Paragraph/Image = 0.5
|
||||
- **Matching:** Greedy 1:1 with fuzzy cross-type compatibility (e.g., bold paragraph matched to heading gets 0.4 compatibility score)
|
||||
- **Adjacent concatenation:** Consecutive blocks of the same type are merged before matching
|
||||
- **Order score:** Longest Increasing Subsequence (LIS) on matched block indices
|
||||
|
||||
### Combined Score
|
||||
|
||||
When markdown ground truth is available, both metrics are combined:
|
||||
|
||||
```text
|
||||
quality_score = 0.5 * f1_text + 0.2 * f1_numeric + 0.3 * f1_layout
|
||||
```
|
||||
|
||||
## Fixture Format
|
||||
|
||||
Fixtures are JSON files organized by format directory under `fixtures/`:
|
||||
|
||||
```json
|
||||
{
|
||||
"document": "relative/path/to/file.pdf",
|
||||
"file_type": "pdf",
|
||||
"file_size": 123456,
|
||||
"expected_frameworks": ["kreuzberg", "docling"],
|
||||
"metadata": {},
|
||||
"ground_truth": {
|
||||
"text_file": "relative/path/to/gt.txt",
|
||||
"markdown_file": "relative/path/to/gt.md",
|
||||
"source": "manual|vision|pdf_text_layer|pandoc|python-docx|..."
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Ground Truth Coverage
|
||||
|
||||
| Format | Fixtures | With Markdown GT |
|
||||
| ------ | -------- | ---------------- |
|
||||
| PDF | 159 | 158 |
|
||||
| HTML | 36 | 36 |
|
||||
| DOCX | 26 | 26 |
|
||||
| ODT | 19 | 19 |
|
||||
| RTF | 17 | 17 |
|
||||
| XLSX | 12 | 11 |
|
||||
| CSV | 11 | 11 |
|
||||
| EPUB | 8 | 8 |
|
||||
| PPTX | 8 | 8 |
|
||||
| Org | 6 | 6 |
|
||||
| DOC | 5 | 5 |
|
||||
| OPML | 4 | 4 |
|
||||
| RST | 3 | 3 |
|
||||
| XLS | 3 | 3 |
|
||||
| IPynb | 1 | 1 |
|
||||
| JATS | 1 | 1 |
|
||||
| LaTeX | 1 | 1 |
|
||||
|
||||
**Total:** 318 of the 320 fixtures listed above have markdown ground truth, across 17 formats.
|
||||
|
||||
## Frameworks
|
||||
|
||||
### Kreuzberg Bindings (13)
|
||||
|
||||
Each binding is benchmarked in both single-file (sequential, fair latency) and batch (concurrent, throughput) modes:
|
||||
|
||||
Rust, Python, Node.js, Ruby, Go, Java, C#, PHP, Elixir, R, WASM, C, Rust+PaddleOCR
|
||||
|
||||
### Reference Frameworks (12)
|
||||
|
||||
External document extraction tools benchmarked in single-file mode:
|
||||
|
||||
Docling, MarkItDown, Pandoc, Unstructured, Tika, PyMuPDF4LLM, PDFPlumber, MinerU, PyPDF, PDFMiner, PDFtoText, Playa-PDF
|
||||
|
||||
## Extraction Pipelines
|
||||
|
||||
The `compare` and `pipeline-benchmark` commands support these extraction paths:
|
||||
|
||||
| Pipeline | Description |
|
||||
| ------------------ | ---------------------------------------------- |
|
||||
| `baseline` | Native PDF text extraction (no OCR, no layout) |
|
||||
| `layout` | Native PDF with layout detection |
|
||||
| `tesseract` | Tesseract OCR with force_ocr |
|
||||
| `tesseract+layout` | Tesseract OCR with layout detection |
|
||||
| `paddle` | PaddleOCR mobile tier with force_ocr |
|
||||
| `paddle+layout` | PaddleOCR mobile tier with layout detection |
|
||||
| `paddle-server` | PaddleOCR server tier |
|
||||
| `docling` | Vendored Docling reference extraction |
|
||||
| `paddleocr-python` | Vendored PaddleOCR Python extraction |
|
||||
| `rapidocr` | Vendored RapidOCR extraction |
|
||||
|
||||
## CLI Reference
|
||||
|
||||
### `run` -- CI benchmark execution
|
||||
|
||||
Runs benchmarks using framework adapters with configurable iterations, warmup, and sharding.
|
||||
|
||||
```bash
|
||||
benchmark-harness run \
|
||||
-f fixtures/ \
|
||||
-F kreuzberg-rust,kreuzberg-python \
|
||||
-m batch \
|
||||
-o results/ \
|
||||
-i 3 -w 1
|
||||
```
|
||||
|
||||
| Flag | Description | Default |
|
||||
| ---------------------- | ---------------------------------------------- | ------------- |
|
||||
| `-f, --fixtures` | Fixture directory or file | required |
|
||||
| `-F, --frameworks` | Comma-separated framework names | all available |
|
||||
| `-o, --output` | Output directory | `results` |
|
||||
| `-m, --mode` | `single-file` or `batch` | `batch` |
|
||||
| `-i, --iterations` | Benchmark iterations | `3` |
|
||||
| `-w, --warmup` | Warmup iterations (discarded) | `1` |
|
||||
| `-c, --max-concurrent` | Max concurrent extractions | CPU count |
|
||||
| `-t, --timeout` | Timeout in seconds | `1800` |
|
||||
| `--ocr` | Enable OCR | `false` |
|
||||
| `--measure-quality` | Enable quality assessment | `false` |
|
||||
| `--shard` | Run fixture subset (`INDEX/TOTAL`, e.g. `1/3`) | none |
|
||||
|
||||
### `consolidate` -- Merge multi-job results
|
||||
|
||||
Combines benchmark results from parallel CI jobs into a single aggregated report with percentiles.
|
||||
|
||||
```bash
|
||||
benchmark-harness consolidate \
|
||||
--inputs dir1,dir2,dir3 \
|
||||
--output consolidated/
|
||||
```
|
||||
|
||||
### `compare` -- Local pipeline comparison
|
||||
|
||||
Compares extraction pipelines on the document corpus with quality scoring and optional guardrails.
|
||||
|
||||
```bash
|
||||
benchmark-harness compare \
|
||||
-f fixtures/ \
|
||||
--pipelines baseline,layout,paddle \
|
||||
--dump-outputs \
|
||||
--guardrails
|
||||
```
|
||||
|
||||
| Flag | Description |
|
||||
| ---------------- | ----------------------------------------------------- |
|
||||
| `--pipelines` | Comma-separated pipeline names |
|
||||
| `--dump-outputs` | Write extraction outputs to `/tmp/kreuzberg_compare/` |
|
||||
| `--guardrails` | Fail on quality regressions (non-zero exit) |
|
||||
| `--filter` | Only run documents matching this substring |
|
||||
|
||||
### `pipeline-benchmark` -- 6-path extraction matrix
|
||||
|
||||
Runs the selected pipelines (the 6 default paths unless `--paths` is given) across the corpus and produces a ranked triage table.
|
||||
|
||||
```bash
|
||||
benchmark-harness pipeline-benchmark \
|
||||
-f fixtures/ \
|
||||
--group tables \
|
||||
--sort-by sf1 \
|
||||
--bottom-n 10 \
|
||||
--triage-blocks
|
||||
```
|
||||
|
||||
| Flag | Description | Default |
|
||||
| ----------------- | -------------------------------------------------------------------------------------------- | ------------------- |
|
||||
| `--paths` | Comma-separated pipeline names | all 6 default paths |
|
||||
| `--doc` | Filter by document name substrings | none |
|
||||
| `--group` | Named benchmark group (`tables`, `structure`, `multicolumn`, `text-quality`, `ocr-fallback`) | none |
|
||||
| `--sort-by` | Sort metric: `sf1`, `tf1`, `time` | `sf1` |
|
||||
| `--bottom-n` | Show only the N worst-performing documents | none |
|
||||
| `--triage-blocks` | Print per-block-type F1 breakdown | `false` |
|
||||
| `--dump-outputs` | Write outputs to `/tmp/kreuzberg_pipeline/` | `false` |
|
||||
| `--json-output` | Write JSON results to file | none |
|
||||
| `--profile-dir` | Generate per-pipeline flamegraph SVGs | none |
|
||||
|
||||
### `validate-gt` -- Ground truth validation
|
||||
|
||||
Checks ground truth file integrity and optionally fixes HTML artifacts in markdown files.
|
||||
|
||||
```bash
|
||||
benchmark-harness validate-gt -f fixtures/ --fix
|
||||
```
|
||||
|
||||
### `survey` -- Corpus extraction statistics
|
||||
|
||||
Produces corpus-wide extraction statistics grouped by file type.
|
||||
|
||||
```bash
|
||||
benchmark-harness survey -f fixtures/ --types pdf,docx
|
||||
```
|
||||
|
||||
### `model-benchmark` -- Layout model A/B comparison
|
||||
|
||||
Compares two layout model presets across the fixture corpus.
|
||||
|
||||
```bash
|
||||
benchmark-harness model-benchmark -f fixtures/ --model-a fast --model-b accurate
|
||||
```
|
||||
|
||||
### `embed-benchmark` -- Embedding throughput
|
||||
|
||||
Benchmarks embedding throughput across all presets.
|
||||
|
||||
```bash
|
||||
benchmark-harness embed-benchmark
|
||||
```
|
||||
|
||||
### `list-fixtures` -- List loaded fixtures
|
||||
|
||||
```bash
|
||||
benchmark-harness list-fixtures -f fixtures/
|
||||
```
|
||||
|
||||
### `validate` -- Validate fixture JSON
|
||||
|
||||
```bash
|
||||
benchmark-harness validate -f fixtures/
|
||||
```
|
||||
|
||||
### `measure-framework-sizes` -- Installation footprints
|
||||
|
||||
Measures disk usage of all framework installations.
|
||||
|
||||
```bash
|
||||
benchmark-harness measure-framework-sizes --output sizes.json
|
||||
```
|
||||
|
||||
## CI Integration
|
||||
|
||||
The benchmark suite runs via `.github/workflows/benchmarks.yaml`, triggered by manual `workflow_dispatch`.
|
||||
|
||||
### Execution DAG
|
||||
|
||||
```text
|
||||
setup
|
||||
Build harness + FFI library + validate ground truth
|
||||
|
|
||||
v
|
||||
bench-{language} x {single-file, batch} (13 Kreuzberg binding jobs)
|
||||
|
|
||||
v
|
||||
kreuzberg-gate (wait for all Kreuzberg benchmarks)
|
||||
|
|
||||
v
|
||||
bench-{external} (12 reference framework jobs, some sharded)
|
||||
|
|
||||
v
|
||||
aggregate-and-release (consolidate all results -> GitHub Release)
|
||||
```
|
||||
|
||||
### Platform
|
||||
|
||||
- Primary: `ubuntu-24.04-arm`
|
||||
- Exception: WASM uses `ubuntu-24.04` (x86) due to V8 ARM compatibility issues
|
||||
|
||||
### Timeouts and Artifacts
|
||||
|
||||
- Per-job timeout: 6 hours (the per-document timeout is configurable separately)
|
||||
- Build artifacts retained: 7 days
|
||||
- Result artifacts retained: 30 days
|
||||
- Final output: aggregated JSON published as a GitHub Release
|
||||
|
||||
## Vendored Baselines
|
||||
|
||||
Pre-generated extraction outputs from reference tools are stored in `vendored/` for offline comparison:
|
||||
|
||||
| Directory | Source |
|
||||
| ---------------------------- | -------------------------------------------------- |
|
||||
| `vendored/docling/` | Docling extraction outputs |
|
||||
| `vendored/paddleocr-python/` | PaddleOCR Python outputs with timing (`.ms` files) |
|
||||
| `vendored/rapidocr/` | RapidOCR extraction outputs |
|
||||
|
||||
Regenerate with:
|
||||
|
||||
```bash
|
||||
python tools/benchmark-harness/scripts/generate_vendored_baselines.py
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
```bash
|
||||
# Build
|
||||
cargo build -p benchmark-harness
|
||||
|
||||
# Run tests
|
||||
cargo test -p benchmark-harness
|
||||
|
||||
# Lint
|
||||
cargo clippy -p benchmark-harness -- -D warnings
|
||||
|
||||
# Local pipeline comparison
|
||||
cargo run -p benchmark-harness -- compare \
|
||||
-f tools/benchmark-harness/fixtures/ \
|
||||
--pipelines baseline,layout \
|
||||
--dump-outputs
|
||||
|
||||
# Validate ground truth
|
||||
cargo run -p benchmark-harness -- validate-gt \
|
||||
-f tools/benchmark-harness/fixtures/
|
||||
|
||||
# Full pipeline benchmark with triage
|
||||
cargo run -p benchmark-harness -- pipeline-benchmark \
|
||||
-f tools/benchmark-harness/fixtures/ \
|
||||
--sort-by sf1 --bottom-n 20 --triage-blocks
|
||||
|
||||
# Corpus survey
|
||||
cargo run -p benchmark-harness -- survey \
|
||||
-f tools/benchmark-harness/fixtures/ --types pdf
|
||||
```
|
||||
|
||||
### Optional Features
|
||||
|
||||
| Feature | Description |
|
||||
| ------------------ | ----------------------------------------- |
|
||||
| `profiling` | Enables flamegraph generation via `pprof` |
|
||||
| `memory-profiling` | Enables jemalloc-based memory profiling |
|
||||
|
||||
Build with features:
|
||||
|
||||
```bash
|
||||
cargo build -p benchmark-harness --features profiling,memory-profiling
|
||||
```
|
||||
|
||||
### Tracing
|
||||
|
||||
The harness uses `tracing` with `RUST_LOG` env-filter support. For quality scoring diagnostics:
|
||||
|
||||
```bash
|
||||
RUST_LOG=benchmark_harness::markdown_quality=debug cargo run -p benchmark-harness -- compare ...
|
||||
```
|
||||
266
tools/benchmark-harness/SCHEMA.md
Normal file
266
tools/benchmark-harness/SCHEMA.md
Normal file
@@ -0,0 +1,266 @@
|
||||
# Aggregation Schema v2.4.0
|
||||
|
||||
This document describes the structure of `aggregated.json` produced by `benchmark-harness consolidate`.
|
||||
|
||||
## Top-level Shape
|
||||
|
||||
```json
|
||||
{
|
||||
"schema_version": "2.4.0",
|
||||
"by_framework_mode": {
|
||||
"<aggregate_key>": {
|
||||
/* FrameworkModeAggregation */
|
||||
}
|
||||
},
|
||||
"disk_sizes": {
|
||||
"framework": {
|
||||
/* DiskSizeInfo */
|
||||
}
|
||||
},
|
||||
"comparison": {
|
||||
/* ComparisonData */
|
||||
},
|
||||
"per_fixture_results": [
|
||||
/* PerFixtureRow[] */
|
||||
],
|
||||
"metadata": {
|
||||
/* ConsolidationMetadata */
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Output Format Discriminator
|
||||
|
||||
The `output_format` field determines which quality metrics and rankings apply to a framework's results:
|
||||
|
||||
- **`markdown`**: Supports all metrics including SF1 (structural F1), layout percentiles, and all ranking tables
|
||||
- **`plaintext`**: Text-only extraction; SF1 and layout percentiles are `null`; plaintext frameworks never appear in SF1 rankings
|
||||
|
||||
## by_framework_mode
|
||||
|
||||
Key format differs by framework family:
|
||||
|
||||
- **kreuzberg** (`kreuzberg-*`): `{framework_name}:{mode}` — the output format is already encoded
|
||||
in the framework name (e.g. `kreuzberg-markdown-baseline`), so repeating it in the key is
|
||||
redundant.
|
||||
- **competitors** (all other frameworks): `{framework}:{output_format}:{mode}` — format is not
|
||||
encoded in the name, so the key carries it explicitly.
|
||||
|
||||
Examples:
|
||||
|
||||
- `kreuzberg-markdown-baseline:single`
|
||||
- `kreuzberg-plaintext-paddle-ocr:batch`
|
||||
- `pdfplumber:plaintext:single`
|
||||
- `docling:markdown:single`
|
||||
|
||||
Each entry contains:
|
||||
|
||||
```json
|
||||
{
|
||||
"framework": "string", // Framework name without mode suffix
|
||||
"output_format": "markdown|plaintext", // Output format used
|
||||
"mode": "single|batch|...", // Execution mode
|
||||
"cold_start": {
|
||||
/* DurationPercentiles */
|
||||
}, // Optional, if cold start data available
|
||||
"by_file_type": {
|
||||
"pdf": {
|
||||
"file_type": "pdf",
|
||||
"no_ocr": {
|
||||
/* PerformancePercentiles */
|
||||
},
|
||||
"with_ocr": {
|
||||
/* PerformancePercentiles */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## PerformancePercentiles
|
||||
|
||||
Contains p50, p95, p99 for all metrics:
|
||||
|
||||
```json
|
||||
{
|
||||
"successful_sample_count": 42,
|
||||
"total_sample_count": 50,
|
||||
"framework_errors": 0,
|
||||
"harness_errors": 5,
|
||||
"timeouts": 3,
|
||||
"empty_content": 0,
|
||||
"error_details": {
|
||||
"error message": 2
|
||||
},
|
||||
"duration": { "p50": 100.5, "p95": 150.2, "p99": 199.9 },
|
||||
"throughput": { "p50": 5.2, "p95": 4.8, "p99": 3.1 },
|
||||
"memory": { "p50": 150.0, "p95": 200.0, "p99": 250.0 },
|
||||
"cpu": { "p50": 50.0, "p95": 75.0, "p99": 90.0 }, // Optional
|
||||
"extraction_duration": { "p50": 80.0, "p95": 120.0, "p99": 160.0 }, // Optional
|
||||
"quality": {
|
||||
/* QualityPercentiles */
|
||||
}, // Optional, if quality data available
|
||||
"success_rate_percent": 84.0
|
||||
}
|
||||
```
|
||||
|
||||
## QualityPercentiles
|
||||
|
||||
Includes p50, p95, p99 for all F1 metrics. Layout percentiles are `null` for plaintext-only frameworks:
|
||||
|
||||
```json
|
||||
{
|
||||
"f1_text_p50": 0.92,
|
||||
"f1_text_p95": 0.88,
|
||||
"f1_text_p99": 0.75,
|
||||
"f1_numeric_p50": 0.85,
|
||||
"f1_numeric_p95": 0.8,
|
||||
"f1_numeric_p99": 0.7,
|
||||
"f1_layout_p50": 0.78, // null for plaintext output format
|
||||
"f1_layout_p95": 0.72, // null for plaintext output format
|
||||
"f1_layout_p99": 0.65, // null for plaintext output format
|
||||
"quality_score_p50": 0.85,
|
||||
"quality_score_p95": 0.8,
|
||||
"quality_score_p99": 0.7
|
||||
}
|
||||
```
|
||||
|
||||
## PerFixtureRow
|
||||
|
||||
One row per unique combination of (framework, output_format, execution_mode, fixture_id, ocr):
|
||||
|
||||
```json
|
||||
{
|
||||
"framework": "kreuzberg-markdown-baseline",
|
||||
"output_format": "markdown",
|
||||
"execution_mode": "single",
|
||||
"ocr": false,
|
||||
"fixture_id": "sample_doc_1",
|
||||
"file_type": "pdf",
|
||||
"duration_ms": 125.4,
|
||||
"peak_memory_mb": 180.5,
|
||||
"f1_text": 0.92,
|
||||
"f1_layout": 0.78, // null for plaintext mode
|
||||
"f1_numeric": 0.85,
|
||||
"quality_score": 0.85,
|
||||
"correct": true,
|
||||
"success": true,
|
||||
"error_kind": null // "FrameworkError", "HarnessError", "Timeout", etc. if !success
|
||||
}
|
||||
```
|
||||
|
||||
## ComparisonData
|
||||
|
||||
Contains all cross-framework rankings split by output format for quality metrics:
|
||||
|
||||
```json
|
||||
{
|
||||
"performance_ranking": [
|
||||
/* RankedFramework[] */
|
||||
],
|
||||
"throughput_ranking": [
|
||||
/* RankedFramework[] */
|
||||
],
|
||||
"memory_ranking": [
|
||||
/* RankedFramework[] */
|
||||
],
|
||||
"cpu_ranking": [
|
||||
/* RankedFramework[] */
|
||||
],
|
||||
"quality_ranking": [
|
||||
/* RankedFramework[] */
|
||||
],
|
||||
"pdf_quality_ranking": [
|
||||
/* RankedFramework[] */
|
||||
],
|
||||
"pdf_tf1_ranking_markdown": [
|
||||
/* RankedFramework[] — markdown-only */
|
||||
],
|
||||
"pdf_tf1_ranking_plaintext": [
|
||||
/* RankedFramework[] — plaintext-only */
|
||||
],
|
||||
"pdf_sf1_ranking_markdown": [
|
||||
/* RankedFramework[] — markdown-only, never plaintext */
|
||||
],
|
||||
"deltas_vs_baseline": {
|
||||
"<aggregate_key>": {
|
||||
/* DeltaMetrics */
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### RankedFramework
|
||||
|
||||
```json
|
||||
{
|
||||
"framework_mode": "kreuzberg-markdown-baseline:single",
|
||||
"rank": 1,
|
||||
"value": 95.5, // The metric value (duration, throughput, etc.)
|
||||
"relative": 1.0 // Ratio relative to best (1.0 = best)
|
||||
}
|
||||
```
|
||||
|
||||
## Migration from v2.3.0 to v2.4.0
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
1. **Schema version**: Bumped to `"2.4.0"`
|
||||
2. **Kreuzberg aggregate key format**: Changed from `framework:output_format:mode` to
|
||||
`framework_name:mode` for all `kreuzberg-*` frameworks. Competitor key format
|
||||
(`framework:output_format:mode`) is unchanged.
|
||||
|
||||
### Kreuzberg Consolidation
|
||||
|
||||
Language-binding frameworks (`kreuzberg-py`, `kreuzberg-node`, `kreuzberg-rb`, `kreuzberg-go`,
|
||||
`kreuzberg-java`, `kreuzberg-csharp`, `kreuzberg-elixir`, `kreuzberg-php`, `kreuzberg-rust`, etc.)
|
||||
have been removed. They are replaced by three native pipelines run directly via the kreuzberg CLI:
|
||||
|
||||
| Pipeline | Markdown name | Plaintext name |
|
||||
| --------- | ------------------------------- | -------------------------------- |
|
||||
| Baseline | `kreuzberg-markdown-baseline` | `kreuzberg-plaintext-baseline` |
|
||||
| Layout | `kreuzberg-markdown-layout` | `kreuzberg-plaintext-layout` |
|
||||
| PaddleOCR | `kreuzberg-markdown-paddle-ocr` | `kreuzberg-plaintext-paddle-ocr` |
|
||||
|
||||
Batch variants append `-batch` to the framework name (e.g. `kreuzberg-markdown-baseline-batch`),
|
||||
which the harness normalises to aggregate key `kreuzberg-markdown-baseline:batch`.
|
||||
|
||||
### Key Format Rationale
|
||||
|
||||
The format component is implicit in the kreuzberg framework name itself. Duplicating it in the
|
||||
aggregate key (`kreuzberg-markdown-baseline:markdown:single`) would be redundant and confusing.
|
||||
Competitor names carry no format information, so they continue to need it in the key
|
||||
(`docling:markdown:single`).
|
||||
|
||||
## Migration from v2.2.0 to v2.3.0
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
1. **Schema version**: Bumped to `"2.3.0"`
|
||||
2. **Framework key format**: Changed from `framework:mode` to `framework:output_format:mode`
|
||||
3. **QualityPercentiles**: Added p95 and p99 percentiles for all F1 metrics; `f1_layout_*` fields are now optional (null for plaintext)
|
||||
4. **FrameworkModeAggregation**: Added `output_format` field
|
||||
5. **ComparisonData**: Replaced `pdf_tf1_ranking` with `pdf_tf1_ranking_markdown` and `pdf_tf1_ranking_plaintext`; `pdf_sf1_ranking` renamed to `pdf_sf1_ranking_markdown` (now markdown-only)
|
||||
|
||||
### New Fields
|
||||
|
||||
- `per_fixture_results`: Array of individual fixture results preserving per-file measurements
|
||||
- `PerFixtureRow`: New struct capturing individual extraction outcomes
|
||||
|
||||
### Plaintext-only Behavior
|
||||
|
||||
- Plaintext frameworks NEVER appear in `pdf_sf1_ranking_markdown`
|
||||
- Plaintext frameworks NEVER appear in `pdf_tf1_ranking_markdown` (they get their own `pdf_tf1_ranking_plaintext`)
|
||||
- SF1 and layout percentiles are `null` for plaintext output format
|
||||
- All performance rankings (speed, memory, throughput) include both formats without discrimination
|
||||
|
||||
## ConsolidationMetadata
|
||||
|
||||
```json
|
||||
{
|
||||
"total_results": 500,
|
||||
"framework_count": 5,
|
||||
"file_type_count": 8,
|
||||
"timestamp": "2025-05-09T10:15:30Z"
|
||||
}
|
||||
```
|
||||
3265
tools/benchmark-harness/baselines/initial_baseline.json
Normal file
3265
tools/benchmark-harness/baselines/initial_baseline.json
Normal file
File diff suppressed because it is too large
Load Diff
15
tools/benchmark-harness/build.rs
Normal file
15
tools/benchmark-harness/build.rs
Normal file
@@ -0,0 +1,15 @@
|
||||
use std::env;
|
||||
|
||||
/// Build script entry point.
///
/// Emits `cargo:rustc-link-arg` directives that add rpath entries to the
/// linked artifacts, choosing the platform-specific prefix from the `TARGET`
/// triple, and asks Cargo to re-run this script only when `build.rs` changes.
///
/// # Panics
///
/// Panics if the `TARGET` environment variable is not set.
fn main() {
    // Cargo provides TARGET to build scripts; its absence is treated as fatal.
    let triple = env::var("TARGET").unwrap();

    // Pick the rpath prefix for the target platform; other targets get none.
    // NOTE(review): `@loader_path` / `$ORIGIN` presumably resolve to the
    // directory of the loading binary at run time -- confirm against the
    // platform linker documentation.
    let rpath_prefix = if triple.contains("darwin") {
        Some("@loader_path")
    } else if triple.contains("linux") {
        Some("$ORIGIN")
    } else {
        None
    };

    if let Some(prefix) = rpath_prefix {
        // Both the bare prefix and its `/.` form are emitted, as before.
        println!("cargo:rustc-link-arg=-Wl,-rpath,{}", prefix);
        println!("cargo:rustc-link-arg=-Wl,-rpath,{}/.", prefix);
    }

    println!("cargo:rerun-if-changed=build.rs");
}
|
||||
14
tools/benchmark-harness/fixtures/7z_archive.json
Normal file
14
tools/benchmark-harness/fixtures/7z_archive.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/archives/documents.7z",
|
||||
"file_type": "7z",
|
||||
"file_size": 216,
|
||||
"expected_frameworks": ["kreuzberg", "tika"],
|
||||
"metadata": {
|
||||
"description": "7-Zip archive with text documents",
|
||||
"category": "archive"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/7z/documents.txt",
|
||||
"source": "manual"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/asciidoc_tables.json
Normal file
15
tools/benchmark-harness/fixtures/asciidoc_tables.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/markdown/tables.asciidoc",
|
||||
"file_type": "asciidoc",
|
||||
"file_size": 1537,
|
||||
"expected_frameworks": ["kreuzberg", "docling"],
|
||||
"metadata": {
|
||||
"description": "AsciiDoc document with multiple table examples",
|
||||
"category": "markup",
|
||||
"size_class": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/asciidoc/asciidoc_tables.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/bib_comprehensive.json
Normal file
14
tools/benchmark-harness/fixtures/bib_comprehensive.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/bibtex/comprehensive.bib",
|
||||
"file_type": "bib",
|
||||
"file_size": 3568,
|
||||
"expected_frameworks": ["kreuzberg", "pandoc", "tika"],
|
||||
"metadata": {
|
||||
"description": "BibTeX bibliography file with multiple entries",
|
||||
"category": "academic"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/bib/bib_comprehensive.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/commonmark_sample.json
Normal file
14
tools/benchmark-harness/fixtures/commonmark_sample.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/markdown/sample.commonmark",
|
||||
"file_type": "commonmark",
|
||||
"file_size": 3036,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "CommonMark document with standard markdown elements including headers, lists, code blocks, links, emphasis, blockquotes, tables, and mixed formatting",
|
||||
"category": "text"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/commonmark/commonmark_sample.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/csv-comma-in-cell.json
Normal file
16
tools/benchmark-harness/fixtures/csv/csv-comma-in-cell.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-comma-in-cell.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 46,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-comma-in-cell.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-comma-in-cell.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/csv-comma.json
Normal file
16
tools/benchmark-harness/fixtures/csv/csv-comma.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-comma.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 1005,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-comma.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-comma.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-inconsistent-header.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 42,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-inconsistent-header.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-inconsistent-header.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/csv-pipe.json
Normal file
16
tools/benchmark-harness/fixtures/csv/csv-pipe.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-pipe.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 997,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-pipe.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-pipe.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/csv-semicolon.json
Normal file
16
tools/benchmark-harness/fixtures/csv/csv-semicolon.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-semicolon.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 997,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-semicolon.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-semicolon.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/csv-tab.json
Normal file
16
tools/benchmark-harness/fixtures/csv/csv-tab.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-tab.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 997,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-tab.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-tab.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-too-few-columns.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 44,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-too-few-columns.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-too-few-columns.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/csv/csv-too-many-columns.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 46,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/csv-too-many-columns.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/csv-too-many-columns.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/data_table.json
Normal file
16
tools/benchmark-harness/fixtures/csv/data_table.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/csv/data_table.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 476,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "csv test: data_table",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/data_table.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/data_table.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/stanley_cups.json
Normal file
16
tools/benchmark-harness/fixtures/csv/stanley_cups.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/csv/stanley_cups.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 91,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "csv test: stanley_cups",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/stanley_cups.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/stanley_cups.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/csv/test_mskanji.json
Normal file
16
tools/benchmark-harness/fixtures/csv/test_mskanji.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/csv/test_mskanji.csv",
|
||||
"file_type": "csv",
|
||||
"file_size": 70,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/csv/test_mskanji.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/csv/test_mskanji.md"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/dbf_stations.json
Normal file
14
tools/benchmark-harness/fixtures/dbf_stations.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/dbf/stations.dbf",
|
||||
"file_type": "dbf",
|
||||
"file_size": 87623,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "dBASE file with station records",
|
||||
"category": "tables"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/dbf/dbf_stations.txt",
|
||||
"source": "manual"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/dbk_chapter.json
Normal file
16
tools/benchmark-harness/fixtures/dbk_chapter.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/docbook-chapter.dbk",
|
||||
"file_type": "dbk",
|
||||
"file_size": 1088,
|
||||
"expected_frameworks": ["kreuzberg", "pandoc"],
|
||||
"metadata": {
|
||||
"description": "DocBook XML chapter with recursive sections (DBK extension)",
|
||||
"category": "markup",
|
||||
"size_class": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.txt",
|
||||
"markdown_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/djot_tables.json
Normal file
14
tools/benchmark-harness/fixtures/djot_tables.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/markdown/tables.djot",
|
||||
"file_type": "djot",
|
||||
"file_size": 2102,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "Djot markup with tables",
|
||||
"category": "markup"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/djot/djot_tables.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/doc/duplicate-paragraphs.doc",
|
||||
"file_type": "doc",
|
||||
"file_size": 18432,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/doc/duplicate-paragraphs.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/doc/duplicate-paragraphs.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/doc/fake-doc-emphasized-text.doc",
|
||||
"file_type": "doc",
|
||||
"file_size": 27648,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/doc/fake-doc-emphasized-text.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/doc/fake-doc-emphasized-text.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/doc/fake.json
Normal file
16
tools/benchmark-harness/fixtures/doc/fake.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/doc/fake.doc",
|
||||
"file_type": "doc",
|
||||
"file_size": 18432,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/doc/fake.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/doc/fake.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/doc/simple.json
Normal file
16
tools/benchmark-harness/fixtures/doc/simple.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/doc/simple.doc",
|
||||
"file_type": "doc",
|
||||
"file_size": 15872,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/doc/simple.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/doc/simple.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/doc/unit_test_lists.json
Normal file
16
tools/benchmark-harness/fixtures/doc/unit_test_lists.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/doc/unit_test_lists.doc",
|
||||
"file_type": "doc",
|
||||
"file_size": 16384,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "doc test: unit_test_lists",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/doc/unit_test_lists.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/doc/unit_test_lists.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docbook_chapter.json
Normal file
15
tools/benchmark-harness/fixtures/docbook_chapter.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/docbook-chapter.docbook",
|
||||
"file_type": "docbook",
|
||||
"file_size": 1088,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DocBook chapter with structured content",
|
||||
"category": "documentation"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.txt",
|
||||
"markdown_file": "../../../test_documents/ground_truth/docbook/docbook-chapter.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/docbook_reader.json
Normal file
14
tools/benchmark-harness/fixtures/docbook_reader.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/docbook-reader.docbook",
|
||||
"file_type": "docbook",
|
||||
"file_size": 37139,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DocBook reader with larger content",
|
||||
"category": "documentation"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/docbook_reader.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docbook_tables4.json
Normal file
15
tools/benchmark-harness/fixtures/docbook_tables4.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/tables.docbook4",
|
||||
"file_type": "docbook",
|
||||
"file_size": 7502,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DocBook 4 table examples with simple, multiline, and headerless tables",
|
||||
"category": "documentation"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/tables.txt",
|
||||
"markdown_file": "../../../test_documents/ground_truth/docbook/tables.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docbook_tables5.json
Normal file
15
tools/benchmark-harness/fixtures/docbook_tables5.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/tables.docbook5",
|
||||
"file_type": "docbook",
|
||||
"file_size": 7502,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DocBook 5 table examples with simple, multiline, and headerless tables",
|
||||
"category": "documentation"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/tables.txt",
|
||||
"markdown_file": "../../../test_documents/ground_truth/docbook/tables.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docbook_xref.json
Normal file
15
tools/benchmark-harness/fixtures/docbook_xref.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docbook/docbook-xref.docbook",
|
||||
"file_type": "docbook",
|
||||
"file_size": 3129,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DocBook cross-reference examples with XRef, links, figures, and tables",
|
||||
"category": "documentation"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docbook/docbook-xref.txt",
|
||||
"markdown_file": "../../../test_documents/ground_truth/docbook/docbook-xref.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/docx_grouped_images.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 207463,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/docx_grouped_images.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_grouped_images.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/docx_rich_cells.json
Normal file
16
tools/benchmark-harness/fixtures/docx/docx_rich_cells.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/docx_rich_cells.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 24320,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/docx_rich_cells.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_rich_cells.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/docx_tables.json
Normal file
16
tools/benchmark-harness/fixtures/docx/docx_tables.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/docx_tables.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 12725,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: docx_tables",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/docx_tables.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/docx_tables.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/equations.json
Normal file
16
tools/benchmark-harness/fixtures/docx/equations.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/equations.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15814,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "DOCX test document: equations",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/equations.md",
|
||||
"source": "pandoc",
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/equations.txt"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/extraction_test.json
Normal file
16
tools/benchmark-harness/fixtures/docx/extraction_test.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/extraction_test.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 11296,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: extraction_test",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/extraction_test.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/extraction_test.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/fake.json
Normal file
16
tools/benchmark-harness/fixtures/docx/fake.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/fake.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 36602,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: fake",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/fake.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/fake.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/issue_359_list_whitespace.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 9170,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: issue_359_list_whitespace",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/issue_359_list_whitespace.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/issue_359_list_whitespace.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/list_after_num_headers.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15698,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/list_after_num_headers.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/list_after_num_headers.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/lorem_ipsum.json
Normal file
16
tools/benchmark-harness/fixtures/docx/lorem_ipsum.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/lorem_ipsum.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 14817,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/lorem_ipsum.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/lorem_ipsum.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/rlink.json
Normal file
16
tools/benchmark-harness/fixtures/docx/rlink.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/docx/rlink.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 13708,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/rlink.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/rlink.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/sample_document.json
Normal file
16
tools/benchmark-harness/fixtures/docx/sample_document.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/docx/sample_document.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 103966,
|
||||
"expected_frameworks": ["kreuzberg"],
|
||||
"metadata": {
|
||||
"description": "docx test: sample_document",
|
||||
"source": "pandoc-generated",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/sample_document.txt",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/sample_document.md",
|
||||
"source": "pandoc"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/table_with_equations.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 14228,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/table_with_equations.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/table_with_equations.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/tablecell.json
Normal file
16
tools/benchmark-harness/fixtures/docx/tablecell.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/tablecell.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15180,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/tablecell.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/tablecell.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/test.json
Normal file
16
tools/benchmark-harness/fixtures/docx/test.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/docx/test.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 135824,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/test.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/test.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/test_emf_docx.json
Normal file
16
tools/benchmark-harness/fixtures/docx/test_emf_docx.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/test_emf_docx.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 426097,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/test_emf_docx.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/test_emf_docx.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/test_with_comment.json
Normal file
16
tools/benchmark-harness/fixtures/docx/test_with_comment.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/markitdown/docx/test_with_comment.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 12971,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from markitdown test suite",
|
||||
"source": "markitdown",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/test_with_comment.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/test_with_comment.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/textbox.json
Normal file
16
tools/benchmark-harness/fixtures/docx/textbox.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/textbox.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 49206,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/textbox.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/textbox.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_formatting.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 29099,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_formatting.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_formatting.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/unit_test_headers.json
Normal file
16
tools/benchmark-harness/fixtures/docx/unit_test_headers.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_headers.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 13903,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_headers.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_headers.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_headers_numbered.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 16880,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_headers_numbered.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_headers_numbered.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/unit_test_lists.json
Normal file
16
tools/benchmark-harness/fixtures/docx/unit_test_lists.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/unit_test_lists.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15769,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/unit_test_lists.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/unit_test_lists.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/word_comments.json
Normal file
16
tools/benchmark-harness/fixtures/docx/word_comments.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_comments.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 37399,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_comments.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_comments.md"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_image_anchors.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 18560,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_image_anchors.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_image_anchors.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/word_sample.json
Normal file
16
tools/benchmark-harness/fixtures/docx/word_sample.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_sample.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 103966,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_sample.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_sample.md"
|
||||
}
|
||||
}
|
||||
16
tools/benchmark-harness/fixtures/docx/word_tables.json
Normal file
16
tools/benchmark-harness/fixtures/docx/word_tables.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/docling/docx/word_tables.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 16404,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from docling test suite",
|
||||
"source": "docling",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/docx/word_tables.txt",
|
||||
"source": "pandoc",
|
||||
"markdown_file": "../../../../test_documents/ground_truth/docx/word_tables.md"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docx_equations.json
Normal file
15
tools/benchmark-harness/fixtures/docx_equations.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docx/equations.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 15017,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "DOCX with mathematical equations - 15KB document with complex formatting",
|
||||
"category": "docx-equations",
|
||||
"size_class": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docx/docx_equations.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docx_images.json
Normal file
15
tools/benchmark-harness/fixtures/docx_images.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docx/word_image_anchors.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 18560,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "DOCX with embedded images and anchors - 18KB document",
|
||||
"category": "docx-images",
|
||||
"size_class": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docx/docx_images.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/docx_large_formatted.json
Normal file
15
tools/benchmark-harness/fixtures/docx_large_formatted.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../test_documents/docx/test_emf_docx.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 426097,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Large formatted DOCX - 416KB document with EMF graphics",
|
||||
"category": "docx-complex",
|
||||
"size_class": "medium"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docx/docx_large_formatted.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/docx_simple.json
Normal file
14
tools/benchmark-harness/fixtures/docx_simple.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/docx/lorem_ipsum.docx",
|
||||
"file_type": "docx",
|
||||
"file_size": 14817,
|
||||
"expected_frameworks": ["kreuzberg", "docling", "markitdown", "pandoc", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Simple DOCX - Lorem ipsum text",
|
||||
"category": "text"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/docx/docx_simple.txt",
|
||||
"source": "vision"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-equals-attachment-filename.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 3297,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-equals-attachment-filename.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-inline-content-disposition.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 657,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-inline-content-disposition.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-no-html-content-1.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 7721,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-no-html-content-1.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-no-utf8-2008-07-16.062410.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 31978,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-no-utf8-2008-07-16.062410.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-no-utf8-2014-03-17.111517.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 14954,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-no-utf8-2014-03-17.111517.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-1.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 16085,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-1.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-2.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 26271,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-2.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-3.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 56028,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-3.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-4.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 34433,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-4.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-replace-mime-encodings-error-5.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 14567,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-replace-mime-encodings-error-5.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/email-with-image.json
Normal file
15
tools/benchmark-harness/fixtures/eml/email-with-image.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/email-with-image.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 296696,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/email-with-image.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-attachment.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1704,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-attachment.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/fake-email-b64.json
Normal file
15
tools/benchmark-harness/fixtures/eml/fake-email-b64.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-b64.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 979,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-b64.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/fake-email-header.json
Normal file
15
tools/benchmark-harness/fixtures/eml/fake-email-header.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-header.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1207,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-header.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-image-embedded.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 297126,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-image-embedded.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-malformed-encoding.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 898,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-malformed-encoding.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-utf-16-be.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1614,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-utf-16-be.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-utf-16-le.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1614,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-utf-16-le.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/fake-email-utf-16.json
Normal file
15
tools/benchmark-harness/fixtures/eml/fake-email-utf-16.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email-utf-16.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1616,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email-utf-16.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/fake-email.json
Normal file
15
tools/benchmark-harness/fixtures/eml/fake-email.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-email.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 807,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-email.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/fake-encrypted.json
Normal file
15
tools/benchmark-harness/fixtures/eml/fake-encrypted.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/fake-encrypted.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 669,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/fake-encrypted.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/family-day.json
Normal file
15
tools/benchmark-harness/fixtures/eml/family-day.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/family-day.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1291,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/family-day.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-attach-mp3.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-attach-mp3.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-attach-mp3.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 70911,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-attach-mp3.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-different-plain-html.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1397,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-different-plain-html.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-html-only.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-html-only.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-html-only.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 640,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-html-only.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-multi-to-cc-bcc.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 350,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-multi-to-cc-bcc.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-multipart-digest.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 721,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-multipart-digest.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-no-body.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-no-body.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-no-body.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 985,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-no-body.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-no-subject.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-no-subject.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-no-subject.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 162,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-no-subject.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-no-to.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-no-to.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-no-to.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 264,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-no-to.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/mime-simple.json
Normal file
15
tools/benchmark-harness/fixtures/eml/mime-simple.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-simple.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 452,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-simple.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/mime-word-encoded-subject.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 261,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/mime-word-encoded-subject.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/rfc822-no-date.json
Normal file
15
tools/benchmark-harness/fixtures/eml/rfc822-no-date.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/rfc822-no-date.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 232,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/rfc822-no-date.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/simple-rfc-822.json
Normal file
15
tools/benchmark-harness/fixtures/eml/simple-rfc-822.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/simple-rfc-822.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 679,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/simple-rfc-822.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/test-invalid-date.json
Normal file
15
tools/benchmark-harness/fixtures/eml/test-invalid-date.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/test-invalid-date.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 161,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/test-invalid-date.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/test-iso-8601-date.json
Normal file
15
tools/benchmark-harness/fixtures/eml/test-iso-8601-date.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/test-iso-8601-date.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 135,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/test-iso-8601-date.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
15
tools/benchmark-harness/fixtures/eml/test-rfc2822-date.json
Normal file
15
tools/benchmark-harness/fixtures/eml/test-rfc2822-date.json
Normal file
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"document": "../../../../test_documents/vendored/unstructured/eml/test-rfc2822-date.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 151,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Document from unstructured test suite",
|
||||
"source": "unstructured",
|
||||
"size_category": "small"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../../test_documents/ground_truth/eml/test-rfc2822-date.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
14
tools/benchmark-harness/fixtures/eml_attachments.json
Normal file
14
tools/benchmark-harness/fixtures/eml_attachments.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"document": "../../../test_documents/email/mailgun_pdf_attachment.eml",
|
||||
"file_type": "eml",
|
||||
"file_size": 1514,
|
||||
"expected_frameworks": ["kreuzberg", "tika", "unstructured"],
|
||||
"metadata": {
|
||||
"description": "Email with PDF attachment from Mailgun",
|
||||
"category": "attachments"
|
||||
},
|
||||
"ground_truth": {
|
||||
"text_file": "../../../test_documents/ground_truth/eml/eml_attachments.txt",
|
||||
"source": "python_email"
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user