Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/crates/kreuzberg-cli/tests/commands_test.rs
+++ b/crates/kreuzberg-cli/tests/commands_test.rs
@@ -0,0 +1,937 @@
+//! Integration tests for CLI commands (extract, detect, batch).
+//!
+//! These tests verify that the CLI commands work correctly end-to-end,
+//! including input validation, file processing, and output formatting.
+
+use std::path::PathBuf;
+use std::process::Command;
+use tempfile::tempdir;
+
+/// Get the path to the kreuzberg binary.
+///
+/// Prefers `CARGO_BIN_EXE_kreuzberg`, which Cargo provides while compiling integration
+/// tests and which accounts for custom target directories, the active build profile and
+/// the platform's executable suffix. When that variable is unavailable at compile time,
+/// falls back to the conventional `target/debug` location two levels above this crate.
+fn get_binary_path() -> String {
+    match option_env!("CARGO_BIN_EXE_kreuzberg") {
+        Some(path) => path.to_string(),
+        None => format!(
+            "{}/../../target/debug/kreuzberg{}",
+            env!("CARGO_MANIFEST_DIR"),
+            std::env::consts::EXE_SUFFIX
+        ),
+    }
+}
+
+/// Get the test_documents directory path.
+///
+/// Resolved as the `test_documents` directory two levels above this crate's
+/// manifest directory. Panics if the manifest directory has fewer than two ancestors.
+fn get_test_documents_dir() -> PathBuf {
+    let crate_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR"));
+    // ancestors() yields the path itself first, so index 2 is the grandparent.
+    let workspace_root = crate_dir.ancestors().nth(2).unwrap();
+    workspace_root.join("test_documents")
+}
+
+/// Get a test file path relative to test_documents/.
+///
+/// The joined path is converted lossily to a `String`; existence is not checked here.
+fn get_test_file(relative_path: &str) -> String {
+    let full_path = get_test_documents_dir().join(relative_path);
+    full_path.to_string_lossy().into_owned()
+}
+
+/// Build the binary before running tests.
+///
+/// Every test calls this helper, so the `cargo build` invocation is guarded by a
+/// process-wide `Once`: the build runs a single time per test binary instead of once
+/// per test, and concurrent callers wait for it to finish. If the build fails, the
+/// `Once` is poisoned and every later caller panics as well.
+fn build_binary() {
+    static BUILD: std::sync::Once = std::sync::Once::new();
+
+    BUILD.call_once(|| {
+        let status = Command::new("cargo")
+            .args(["build", "--bin", "kreuzberg"])
+            .status()
+            .expect("Failed to build kreuzberg binary");
+
+        assert!(status.success(), "Failed to build kreuzberg binary");
+    });
+}
+
+/// Runs `extract` on the plain-text fixture and expects success with non-empty stdout.
+/// Returns early when the fixture file is absent.
+#[test]
+fn test_extract_text_file() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        tracing::debug!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("extract").arg(&fixture);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    assert!(result.status.success(), "Extract command failed: {}", err_text);
+
+    let printed = String::from_utf8_lossy(&result.stdout);
+    assert!(!printed.is_empty(), "Extract output should not be empty");
+}
+
+/// Runs `extract --format json` and checks the shape of the JSON envelope:
+/// top-level `result` and `extraction_time_ms`, and `content` / `mime_type` inside `result`.
+#[test]
+fn test_extract_with_json_output() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        tracing::debug!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("extract").arg(&fixture).args(["--format", "json"]);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    assert!(result.status.success(), "Extract command failed: {}", err_text);
+
+    let stdout = String::from_utf8_lossy(&result.stdout);
+    let envelope: serde_json::Value = serde_json::from_str(&stdout)
+        .unwrap_or_else(|_| panic!("Output should be valid JSON, got: {}", stdout));
+
+    // JSON output is wrapped in a timing envelope: { result: ExtractionResult, extraction_time_ms: f64 }
+    assert!(
+        envelope.get("result").is_some(),
+        "JSON envelope should have 'result' field"
+    );
+    assert!(
+        envelope.get("extraction_time_ms").is_some(),
+        "JSON envelope should have 'extraction_time_ms' field"
+    );
+
+    let inner = &envelope["result"];
+    assert!(inner.get("content").is_some(), "result should have 'content' field");
+    assert!(inner.get("mime_type").is_some(), "result should have 'mime_type' field");
+}
+
+/// Runs `extract` with chunking enabled (size 100, overlap 20) in JSON format and
+/// checks that `result.chunks` exists and is an array.
+#[test]
+fn test_extract_with_chunking() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        tracing::debug!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("extract")
+        .arg(&fixture)
+        .args(["--chunk", "true"])
+        .args(["--chunk-size", "100"])
+        .args(["--chunk-overlap", "20"])
+        .args(["--format", "json"]);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    assert!(result.status.success(), "Extract with chunking failed: {}", err_text);
+
+    let printed = String::from_utf8_lossy(&result.stdout);
+    let envelope: serde_json::Value = serde_json::from_str(&printed).expect("Should be valid JSON");
+
+    // JSON output is wrapped in an envelope; chunks live under result
+    let inner = &envelope["result"];
+    assert!(inner.get("chunks").is_some(), "result should have 'chunks' field");
+    assert!(inner["chunks"].is_array(), "'chunks' should be an array");
+}
+
+/// `extract` on a nonexistent path must exit unsuccessfully and print
+/// "File not found" on stderr.
+#[test]
+fn test_extract_file_not_found() {
+    build_binary();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "/nonexistent/file.txt"]);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    assert!(!result.status.success(), "Extract should fail for nonexistent file");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let mentions_missing = err_text.contains("File not found");
+    assert!(mentions_missing, "Error should mention file not found, got: {}", err_text);
+}
+
+/// `extract` on a directory must fail and stderr must say the path is not a (regular) file.
+#[test]
+fn test_extract_directory_not_file() {
+    build_binary();
+
+    // The TempDir guard must stay alive until the command has run.
+    let scratch = tempdir().expect("Failed to create temp dir");
+    let scratch_path = scratch.path().to_string_lossy().into_owned();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("extract").arg(&scratch_path);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    assert!(!result.status.success(), "Extract should fail for directory");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let accepted = ["not a file", "regular file"];
+    assert!(
+        accepted.iter().any(|needle| err_text.contains(needle)),
+        "Error should mention path is not a file, got: {}",
+        err_text
+    );
+}
+
+#[test]
+fn test_extract_invalid_chunk_size_zero() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !PathBuf::from(&test_file).exists() {
+        tracing::debug!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .args(["extract", test_file.as_str(), "--chunk-size", "0"])
+        .output()
+        .expect("Failed to execute extract command");
+
+    assert!(!output.status.success(), "Extract should fail for chunk size 0");
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    assert!(
+        stderr.contains("Invalid chunk size") || stderr.contains("must be greater than 0"),
+        "Error should mention invalid chunk size, got: {}",
+        stderr
+    );
+}
+
+/// `extract --chunk-size 2000000` on the text fixture must fail and mention the
+/// chunk-size limit on stderr.
+#[test]
+fn test_extract_invalid_chunk_size_too_large() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        tracing::debug!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("extract").arg(&fixture).args(["--chunk-size", "2000000"]);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    assert!(!result.status.success(), "Extract should fail for chunk size > 1M");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let accepted = ["Invalid chunk size", "1,000,000"];
+    assert!(
+        accepted.iter().any(|needle| err_text.contains(needle)),
+        "Error should mention chunk size limit, got: {}",
+        err_text
+    );
+}
+
+/// `extract` with `--chunk-overlap` equal to `--chunk-size` (both 100) must fail and
+/// report the overlap constraint on stderr.
+#[test]
+fn test_extract_invalid_overlap_equals_chunk_size() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        tracing::debug!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("extract")
+        .arg(&fixture)
+        .args(["--chunk-size", "100"])
+        .args(["--chunk-overlap", "100"]);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    assert!(
+        !result.status.success(),
+        "Extract should fail when overlap equals chunk size"
+    );
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let accepted = ["Invalid chunk overlap", "must be less than chunk size"];
+    assert!(
+        accepted.iter().any(|needle| err_text.contains(needle)),
+        "Error should mention overlap constraint, got: {}",
+        err_text
+    );
+}
+
+/// Runs `detect` on the text fixture and expects non-empty stdout that mentions a text MIME type.
+#[test]
+fn test_detect_mime_type() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        tracing::debug!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("detect").arg(&fixture);
+    let result = cmd.output().expect("Failed to execute detect command");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    assert!(result.status.success(), "Detect command failed: {}", err_text);
+
+    let printed = String::from_utf8_lossy(&result.stdout);
+    assert!(!printed.is_empty(), "Detect output should not be empty");
+
+    // Any occurrence of "text" is accepted, which also covers "text/plain".
+    let accepted = ["text/plain", "text"];
+    assert!(
+        accepted.iter().any(|needle| printed.contains(needle)),
+        "Should detect text MIME type, got: {}",
+        printed
+    );
+}
+
+/// Runs `detect --format json` and checks the output is JSON with `mime_type` and `path` fields.
+#[test]
+fn test_detect_with_json_output() {
+    build_binary();
+
+    let fixture = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&fixture).exists() {
+        tracing::debug!("Skipping test: {} not found", fixture);
+        return;
+    }
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("detect").arg(&fixture).args(["--format", "json"]);
+    let result = cmd.output().expect("Failed to execute detect command");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    assert!(result.status.success(), "Detect command failed: {}", err_text);
+
+    let stdout = String::from_utf8_lossy(&result.stdout);
+    let report: serde_json::Value = serde_json::from_str(&stdout)
+        .unwrap_or_else(|_| panic!("Output should be valid JSON, got: {}", stdout));
+
+    assert!(report.get("mime_type").is_some(), "JSON should have 'mime_type' field");
+    assert!(report.get("path").is_some(), "JSON should have 'path' field");
+}
+
+/// `detect` on a nonexistent path must exit unsuccessfully and print
+/// "File not found" on stderr.
+#[test]
+fn test_detect_file_not_found() {
+    build_binary();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["detect", "/nonexistent/file.txt"]);
+    let result = cmd.output().expect("Failed to execute detect command");
+
+    assert!(!result.status.success(), "Detect should fail for nonexistent file");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let mentions_missing = err_text.contains("File not found");
+    assert!(mentions_missing, "Error should mention file not found, got: {}", err_text);
+}
+
+/// Runs `batch --format json` with two inputs and expects a `results` array of length 2.
+///
+/// Both inputs resolve to the same fixture file, so this exercises the two-argument
+/// path rather than two distinct documents.
+#[test]
+fn test_batch_multiple_files() {
+    build_binary();
+
+    let first = get_test_file("text/simple.txt");
+    let second = get_test_file("text/simple.txt");
+
+    if !std::path::Path::new(&first).exists() {
+        tracing::debug!("Skipping test: {} not found", first);
+        return;
+    }
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("batch").arg(&first).arg(&second).args(["--format", "json"]);
+    let result = cmd.output().expect("Failed to execute batch command");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    assert!(result.status.success(), "Batch command failed: {}", err_text);
+
+    let stdout = String::from_utf8_lossy(&result.stdout);
+    let report: serde_json::Value = serde_json::from_str(&stdout)
+        .unwrap_or_else(|_| panic!("Output should be valid JSON, got: {}", stdout));
+
+    // Batch JSON output is wrapped in a timing envelope: { results: [...], total_ms, per_file_ms }
+    assert!(
+        report.get("results").is_some(),
+        "Batch envelope should have 'results' field"
+    );
+    let entries = report["results"].as_array();
+    assert!(entries.is_some(), "'results' should be a JSON array");
+    assert_eq!(entries.unwrap().len(), 2, "Should have 2 results");
+}
+
+/// `batch` with one existing and one nonexistent input must fail and report the
+/// missing or invalid file on stderr.
+#[test]
+fn test_batch_with_missing_file() {
+    build_binary();
+
+    let existing = get_test_file("text/simple.txt");
+
+    if !std::path::Path::new(&existing).exists() {
+        tracing::debug!("Skipping test: {} not found", existing);
+        return;
+    }
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("batch").arg(&existing).arg("/nonexistent/file.txt");
+    let result = cmd.output().expect("Failed to execute batch command");
+
+    assert!(!result.status.success(), "Batch should fail when one file is missing");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let accepted = ["File not found", "Invalid file"];
+    assert!(
+        accepted.iter().any(|needle| err_text.contains(needle)),
+        "Error should mention file not found, got: {}",
+        err_text
+    );
+}
+
+/// `extract --help` must succeed and show the command description plus the
+/// `--chunk-size` and `--chunk-overlap` flags.
+#[test]
+fn test_extract_help() {
+    build_binary();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("extract").arg("--help");
+    let output = cmd.output().expect("Failed to execute extract --help");
+
+    assert!(output.status.success());
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(stdout.contains("Extract text from a document"));
+    assert!(stdout.contains("--chunk-size"));
+    assert!(stdout.contains("--chunk-overlap"));
+}
+
+/// `detect --help` must succeed and include the text "Detect MIME type".
+#[test]
+fn test_detect_help() {
+    build_binary();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("detect").arg("--help");
+    let output = cmd.output().expect("Failed to execute detect --help");
+
+    assert!(output.status.success());
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(stdout.contains("Detect MIME type"));
+}
+
+/// `batch --help` must succeed and include the text "Batch extract from multiple documents".
+#[test]
+fn test_batch_help() {
+    build_binary();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("batch").arg("--help");
+    let output = cmd.output().expect("Failed to execute batch --help");
+
+    assert!(output.status.success());
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(stdout.contains("Batch extract from multiple documents"));
+}
+
+// ── Extract command flag parsing tests ──────────────────────────────
+
+/// `extract --help` must list every extraction override flag named below.
+/// The test fails on the first flag that is missing from the help text.
+#[test]
+fn test_extract_help_shows_all_extraction_override_flags() {
+    build_binary();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("extract").arg("--help");
+    let output = cmd.output().expect("Failed to execute extract --help");
+
+    assert!(output.status.success());
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+
+    // Flags that must appear in the help output, checked in this order.
+    #[rustfmt::skip]
+    let expected_flags = [
+        "--ocr", "--ocr-backend", "--ocr-language", "--force-ocr",
+        "--no-cache", "--ocr-auto-rotate",
+        "--chunk", "--chunk-size", "--chunk-overlap", "--chunking-tokenizer",
+        "--content-format", "--include-structure", "--quality", "--detect-language",
+        "--layout", "--layout-confidence", "--layout-table-model",
+        "--acceleration", "--max-concurrent", "--max-threads",
+        "--extract-pages", "--page-markers", "--extract-images", "--target-dpi",
+        "--pdf-password", "--token-reduction", "--msg-codepage",
+    ];
+
+    let first_missing = expected_flags.iter().find(|flag| !stdout.contains(**flag));
+    if let Some(flag) = first_missing {
+        panic!(
+            "Extract --help should show flag '{}', but it was not found in output:\n{}",
+            flag, stdout
+        );
+    }
+}
+
+// ── Batch command flag parity test ──────────────────────────────────
+
+/// Both `extract --help` and `batch --help` must list every flag in the shared set below.
+/// For each flag, `extract` is checked first and then `batch`.
+#[test]
+fn test_batch_has_same_extraction_flags_as_extract() {
+    build_binary();
+
+    let binary = get_binary_path();
+
+    let extract_output = Command::new(&binary)
+        .arg("extract")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute extract --help");
+
+    let batch_output = Command::new(&binary)
+        .arg("batch")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute batch --help");
+
+    assert!(extract_output.status.success());
+    assert!(batch_output.status.success());
+
+    let extract_help = String::from_utf8_lossy(&extract_output.stdout);
+    let batch_help = String::from_utf8_lossy(&batch_output.stdout);
+
+    // Extraction override flags expected on both commands.
+    #[rustfmt::skip]
+    let shared_flags = [
+        "--ocr", "--ocr-backend", "--ocr-language", "--force-ocr", "--no-cache",
+        "--chunk", "--chunk-size", "--chunk-overlap",
+        "--content-format", "--quality", "--detect-language",
+        "--layout", "--layout-confidence", "--layout-table-model",
+        "--acceleration", "--max-concurrent", "--max-threads",
+        "--extract-pages", "--page-markers", "--extract-images", "--target-dpi",
+        "--pdf-password", "--token-reduction", "--msg-codepage",
+    ];
+
+    for flag in shared_flags.iter().copied() {
+        if !extract_help.contains(flag) {
+            panic!("Extract should have flag '{}' but it's missing", flag);
+        }
+        if !batch_help.contains(flag) {
+            panic!("Batch should have flag '{}' (parity with extract) but it's missing", flag);
+        }
+    }
+}
+
+// ── Validation error tests ──────────────────────────────────────────
+//
+// NOTE: The CLI validates file existence *before* override validation,
+// so we must provide a real file to reach the override validation stage.
+
+/// Write a small `dummy.pdf` into a fresh temporary directory and return the
+/// directory guard together with the file's path as a `String`.
+///
+/// The returned `tempfile::TempDir` must be kept alive by the caller for as long as
+/// the file is needed; dropping it removes the directory and the file with it.
+fn create_temp_file() -> (tempfile::TempDir, String) {
+    let holder = tempdir().expect("Failed to create temp dir");
+    let dummy = holder.path().join("dummy.pdf");
+    std::fs::write(&dummy, b"dummy content").expect("Failed to write temp file");
+    let dummy_str = dummy.to_string_lossy().into_owned();
+    (holder, dummy_str)
+}
+
+/// `extract --chunk-size 0` on a temporary dummy file must fail and mention chunk size on stderr.
+#[test]
+fn test_extract_chunk_size_zero_error() {
+    build_binary();
+    let (_guard, dummy_path) = create_temp_file();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--chunk-size", "0"]).arg(&dummy_path);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    assert!(!result.status.success(), "Should fail when chunk size is 0");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let accepted = ["chunk size", "Chunk size", "Invalid chunk size"];
+    assert!(
+        accepted.iter().any(|needle| err_text.contains(needle)),
+        "Error should mention chunk size, got: {}",
+        err_text
+    );
+}
+
+/// `extract` with overlap 20 and chunk size 10 on a temporary dummy file must fail and
+/// mention the overlap on stderr.
+#[test]
+fn test_extract_chunk_overlap_exceeds_size_error() {
+    build_binary();
+    let (_guard, dummy_path) = create_temp_file();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("extract")
+        .args(["--chunk-size", "10"])
+        .args(["--chunk-overlap", "20"])
+        .arg(&dummy_path);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    assert!(!result.status.success(), "Should fail when overlap exceeds chunk size");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let accepted = ["overlap", "Overlap", "Invalid chunk overlap"];
+    assert!(
+        accepted.iter().any(|needle| err_text.contains(needle)),
+        "Error should mention overlap constraint, got: {}",
+        err_text
+    );
+}
+
+/// `extract --layout-confidence 2.0` must fail, whether through value validation or
+/// because the flag is unknown to a binary built without layout detection.
+#[test]
+fn test_extract_layout_confidence_out_of_range_error() {
+    build_binary();
+    let (_guard, dummy_path) = create_temp_file();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--layout-confidence", "2.0"]).arg(&dummy_path);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    // This flag is feature-gated behind layout-detection. If the binary was
+    // built without that feature, clap itself will reject the unknown flag.
+    assert!(
+        !result.status.success(),
+        "Should fail for layout confidence out of range"
+    );
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let accepted = ["confidence", "layout", "unexpected argument"];
+    assert!(
+        accepted.iter().any(|needle| err_text.contains(needle)),
+        "Error should mention confidence or layout, got: {}",
+        err_text
+    );
+}
+
+/// `extract --layout false --layout-confidence 0.5` must exit unsuccessfully.
+///
+/// NOTE(review): only the exit status is checked, not stderr, so any failure
+/// (including one unrelated to the flag combination) satisfies this test — confirm
+/// whether a stderr assertion should be added.
+#[test]
+fn test_extract_layout_false_with_confidence_error() {
+    build_binary();
+    let (_guard, dummy_path) = create_temp_file();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("extract")
+        .args(["--layout", "false"])
+        .args(["--layout-confidence", "0.5"])
+        .arg(&dummy_path);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    // If layout-detection feature is enabled, validation should reject this combination.
+    // If not enabled, clap rejects the unknown flags.
+    let failed = !result.status.success();
+    assert!(
+        failed,
+        "Should fail when --layout false is combined with --layout-confidence"
+    );
+}
+
+/// `extract --target-dpi 0` on a temporary dummy file must fail with a DPI-related
+/// (or generic "Invalid") message on stderr.
+#[test]
+fn test_extract_target_dpi_zero_error() {
+    build_binary();
+    let (_guard, dummy_path) = create_temp_file();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--target-dpi", "0"]).arg(&dummy_path);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    assert!(!result.status.success(), "Should fail when target DPI is 0");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let accepted = ["DPI", "dpi", "target", "Invalid"];
+    assert!(
+        accepted.iter().any(|needle| err_text.contains(needle)),
+        "Error should mention DPI range, got: {}",
+        err_text
+    );
+}
+
+// ── Completions test ────────────────────────────────────────────────
+
+/// `completions bash` must succeed, print something, and reference the `kreuzberg`
+/// command name in the generated script.
+#[test]
+fn test_completions_bash_produces_output() {
+    build_binary();
+
+    let output = Command::new(get_binary_path())
+        .args(["completions", "bash"])
+        .output()
+        .expect("Failed to execute completions command");
+
+    assert!(
+        output.status.success(),
+        "Completions command failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(!stdout.is_empty(), "Completions output should not be empty");
+    // bash completions should contain the command name
+    assert!(
+        stdout.contains("kreuzberg"),
+        "Bash completions should reference 'kreuzberg', got: {}",
+        // Truncate by characters: byte-slicing at a fixed offset can land inside a
+        // multi-byte character and panic while formatting the failure message.
+        stdout.chars().take(200).collect::<String>()
+    );
+}
+
+/// `completions zsh` must succeed and print a non-empty script.
+#[test]
+fn test_completions_zsh_produces_output() {
+    build_binary();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("completions").arg("zsh");
+    let result = cmd.output().expect("Failed to execute completions command");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    assert!(result.status.success(), "Completions command failed: {}", err_text);
+
+    let script = String::from_utf8_lossy(&result.stdout);
+    assert!(!script.is_empty(), "Zsh completions output should not be empty");
+}
+
+/// `completions fish` must succeed and print a non-empty script.
+#[test]
+fn test_completions_fish_produces_output() {
+    build_binary();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("completions").arg("fish");
+    let result = cmd.output().expect("Failed to execute completions command");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    assert!(result.status.success(), "Completions command failed: {}", err_text);
+
+    let script = String::from_utf8_lossy(&result.stdout);
+    assert!(!script.is_empty(), "Fish completions output should not be empty");
+}
+
+// ── Embed help test ─────────────────────────────────────────────────
+
+/// `embed --help` must list `--text`, `--preset`, `--format` and describe embedding
+/// generation. The test is skipped when the binary does not know the `embed`
+/// subcommand; any other failure is reported with the command's stderr.
+#[test]
+fn test_embed_help_shows_correct_flags() {
+    build_binary();
+
+    let output = Command::new(get_binary_path())
+        .args(["embed", "--help"])
+        .output()
+        .expect("Failed to execute embed --help");
+
+    // embed is feature-gated; if not compiled, clap will show an error
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        // If embed subcommand doesn't exist, skip the test
+        if stderr.contains("unrecognized subcommand") || stderr.contains("invalid subcommand") {
+            return;
+        }
+        // Any other failure is a real error: surface stderr instead of falling through
+        // to assertions on an empty stdout.
+        panic!("Embed --help failed: {}", stderr);
+    }
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(
+        stdout.contains("--text"),
+        "Embed help should show --text flag, got: {}",
+        stdout
+    );
+    assert!(
+        stdout.contains("--preset"),
+        "Embed help should show --preset flag, got: {}",
+        stdout
+    );
+    assert!(
+        stdout.contains("--format"),
+        "Embed help should show --format flag, got: {}",
+        stdout
+    );
+    assert!(
+        stdout.contains("Generate embeddings"),
+        "Embed help should describe embedding generation, got: {}",
+        stdout
+    );
+}
+
+// ── Chunk help test ─────────────────────────────────────────────────
+
+/// `chunk --help` must succeed, list its expected flags and describe text chunking.
+/// Checks run in the order listed and stop at the first missing item.
+#[test]
+fn test_chunk_help_shows_correct_flags() {
+    build_binary();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.arg("chunk").arg("--help");
+    let result = cmd.output().expect("Failed to execute chunk --help");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    assert!(result.status.success(), "Chunk --help failed: {}", err_text);
+
+    let help = String::from_utf8_lossy(&result.stdout);
+
+    let flags = ["--text", "--chunk-size", "--chunk-overlap", "--chunker-type", "--format"];
+    for flag in flags.iter().copied() {
+        if !help.contains(flag) {
+            panic!("Chunk help should show {} flag, got: {}", flag, help);
+        }
+    }
+
+    if !help.contains("Chunk text") {
+        panic!("Chunk help should describe text chunking, got: {}", help);
+    }
+}
+
+// ── Style module NO_COLOR test ──────────────────────────────────────
+
+/// With `NO_COLOR=1` in the environment, `detect` must succeed and its stdout must
+/// not contain the ANSI escape introducer `ESC[`. Skipped when the fixture is absent.
+#[test]
+fn test_no_color_env_disables_ansi_in_output() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !PathBuf::from(&test_file).exists() {
+        // Log the skip like the other fixture-dependent tests do, instead of returning silently.
+        tracing::debug!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    // Run with NO_COLOR set - output should have no ANSI escape sequences
+    let output = Command::new(get_binary_path())
+        .env("NO_COLOR", "1")
+        .args(["detect", &test_file])
+        .output()
+        .expect("Failed to execute detect command");
+
+    assert!(
+        output.status.success(),
+        "Detect failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(
+        !stdout.contains("\x1b["),
+        "Output should not contain ANSI escape sequences when NO_COLOR is set, got: {:?}",
+        stdout
+    );
+}
+
+// ── Additional validation edge cases ────────────────────────────────
+
+/// `extract --chunk-size 2000000` on a temporary dummy file must fail and mention
+/// the chunk size or its limit on stderr.
+#[test]
+fn test_extract_chunk_size_too_large_error() {
+    build_binary();
+    let (_guard, dummy_path) = create_temp_file();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--chunk-size", "2000000"]).arg(&dummy_path);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    assert!(!result.status.success(), "Should fail when chunk size exceeds limit");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let accepted = ["chunk size", "Chunk size", "1,000,000"];
+    assert!(
+        accepted.iter().any(|needle| err_text.contains(needle)),
+        "Error should mention chunk size limit, got: {}",
+        err_text
+    );
+}
+
+/// `extract --target-dpi 5000` on a temporary dummy file must fail with a DPI-related
+/// (or generic "Invalid") message on stderr.
+#[test]
+fn test_extract_target_dpi_too_high_error() {
+    build_binary();
+    let (_guard, dummy_path) = create_temp_file();
+
+    let mut cmd = Command::new(get_binary_path());
+    cmd.args(["extract", "--target-dpi", "5000"]).arg(&dummy_path);
+    let result = cmd.output().expect("Failed to execute extract command");
+
+    assert!(!result.status.success(), "Should fail when target DPI exceeds limit");
+
+    let err_text = String::from_utf8_lossy(&result.stderr);
+    let accepted = ["DPI", "dpi", "2400", "Invalid"];
+    assert!(
+        accepted.iter().any(|needle| err_text.contains(needle)),
+        "Error should mention DPI range, got: {}",
+        err_text
+    );
+}
--- a/crates/kreuzberg-cli/tests/config_discovery_test.rs
+++ b/crates/kreuzberg-cli/tests/config_discovery_test.rs
@@ -0,0 +1,617 @@
+//! Integration tests for CLI config file discovery.
+//!
+//! These tests verify that the CLI correctly discovers and loads configuration files
+//! in various formats (.toml, .yaml, .json) with case-insensitive extension
+//! matching, explicit --config flag support, and proper error handling.
+
+use std::fs;
+use std::path::PathBuf;
+use std::process::Command;
+use tempfile::tempdir;
+
+/// Get the path to the kreuzberg binary.
+///
+/// Prefers `CARGO_BIN_EXE_kreuzberg`, which cargo sets while compiling
+/// integration tests to the absolute path of the binary built for the active
+/// profile and target directory. If that variable is not available at compile
+/// time, falls back to the conventional `target/debug/kreuzberg` location
+/// relative to this crate's manifest directory.
+fn get_binary_path() -> String {
+    match option_env!("CARGO_BIN_EXE_kreuzberg") {
+        Some(exe) => exe.to_owned(),
+        None => format!("{}/../../target/debug/kreuzberg", env!("CARGO_MANIFEST_DIR")),
+    }
+}
+
+/// Build the binary before running tests.
+///
+/// The `cargo build` invocation is performed at most once per test binary: the
+/// outcome is cached, so later callers only re-check the stored result instead
+/// of spawning another build.
+///
+/// # Panics
+///
+/// Panics if `cargo` cannot be spawned or if the build exits unsuccessfully.
+fn build_binary() {
+    static BUILD_SUCCEEDED: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
+
+    // If spawning cargo panics inside the initializer, the cell stays empty and
+    // the next caller retries; a completed build (pass or fail) is remembered.
+    let succeeded = *BUILD_SUCCEEDED.get_or_init(|| {
+        Command::new("cargo")
+            .args(["build", "--bin", "kreuzberg"])
+            .status()
+            .expect("Failed to build kreuzberg binary")
+            .success()
+    });
+
+    assert!(succeeded, "Failed to build kreuzberg binary");
+}
+
+/// Resolve the `test_documents` directory located two levels above this
+/// crate's manifest directory.
+///
+/// # Panics
+///
+/// Panics if the manifest directory has fewer than two parent components.
+fn get_test_documents_dir() -> PathBuf {
+    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    let crates_dir = crate_dir.parent().unwrap();
+    let repo_root = crates_dir.parent().unwrap();
+    repo_root.join("test_documents")
+}
+
+/// Build the path of a fixture under `test_documents/` and return it as a
+/// `String` (non-UTF-8 path components are replaced lossily).
+fn get_test_file(relative_path: &str) -> String {
+    let fixture = get_test_documents_dir().join(relative_path);
+    fixture.to_string_lossy().into_owned()
+}
+
+/// Runs `extract` from a directory that contains a `.kreuzberg.toml` and
+/// expects the command to succeed. Returns early when the fixture is missing.
+#[test]
+fn test_discover_kreuzberg_toml_in_current_directory() {
+    build_binary();
+
+    let workdir = tempdir().unwrap();
+    let toml_body = "\nuse_cache = false\nenable_quality_processing = false\n";
+    fs::write(workdir.path().join(".kreuzberg.toml"), toml_body).unwrap();
+
+    let input = get_test_file("text/simple.txt");
+    let input_present = PathBuf::from(&input).exists();
+    if !input_present {
+        tracing::debug!("Skipping test: {} not found", input);
+        return;
+    }
+
+    let mut cli = Command::new(get_binary_path());
+    cli.current_dir(workdir.path()).arg("extract").arg(input.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    let stderr_text = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr_text);
+}
+
+/// Runs `extract` from a directory that contains a `.kreuzberg.yaml` and
+/// expects the command to succeed. Returns early when the fixture is missing.
+#[test]
+fn test_discover_kreuzberg_yaml_in_current_directory() {
+    build_binary();
+
+    let sandbox = tempdir().unwrap();
+    let yaml_file = sandbox.path().join(".kreuzberg.yaml");
+    let yaml_body = "\nuse_cache: false\nenable_quality_processing: false\n";
+    fs::write(&yaml_file, yaml_body).unwrap();
+
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(sample.as_str())
+        .current_dir(sandbox.path())
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    let succeeded = run.status.success();
+    assert!(succeeded, "Command failed: {}", String::from_utf8_lossy(&run.stderr));
+}
+
+/// Runs `extract` from a directory that contains a `.kreuzberg.yml` (short
+/// YAML extension) and expects the command to succeed.
+///
+/// The config file previously used the `.yaml` extension, which made this test
+/// a duplicate of `test_discover_kreuzberg_yaml_in_current_directory` and left
+/// the `.yml` spelling unexercised. Returns early when the fixture is missing.
+#[test]
+fn test_discover_kreuzberg_yml_in_current_directory() {
+    build_binary();
+
+    let dir = tempdir().unwrap();
+    let config_path = dir.path().join(".kreuzberg.yml");
+
+    fs::write(
+        &config_path,
+        r#"
+use_cache: false
+enable_quality_processing: false
+"#,
+    )
+    .unwrap();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !PathBuf::from(&test_file).exists() {
+        tracing::debug!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let output = Command::new(get_binary_path())
+        .current_dir(dir.path())
+        .args(["extract", test_file.as_str()])
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    assert!(
+        output.status.success(),
+        "Command failed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+}
+
+/// Runs `extract` from a directory that contains a `.kreuzberg.json` and
+/// expects the command to succeed. Returns early when the fixture is missing.
+#[test]
+fn test_discover_kreuzberg_json_in_current_directory() {
+    build_binary();
+
+    let json_body = r#"{
+    "use_cache": false,
+    "enable_quality_processing": false
+}"#;
+    let workdir = tempdir().unwrap();
+    fs::write(workdir.path().join(".kreuzberg.json"), json_body).unwrap();
+
+    let input = get_test_file("text/simple.txt");
+    if !PathBuf::from(&input).exists() {
+        tracing::debug!("Skipping test: {} not found", input);
+        return;
+    }
+
+    let mut cli = Command::new(get_binary_path());
+    cli.current_dir(workdir.path());
+    cli.arg("extract").arg(input.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    let stderr_text = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr_text);
+}
+
+/// Passes a config file named `custom.TOML` via `--config` and expects
+/// `extract` to succeed. Returns early when the fixture is missing.
+#[test]
+fn test_case_insensitive_toml_extension() {
+    build_binary();
+
+    let workdir = tempdir().unwrap();
+    let config_file = workdir.path().join("custom.TOML");
+    fs::write(&config_file, "\nuse_cache = false\n").unwrap();
+
+    let input = get_test_file("text/simple.txt");
+    if !PathBuf::from(&input).exists() {
+        tracing::debug!("Skipping test: {} not found", input);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy();
+    let mut cli = Command::new(get_binary_path());
+    cli.current_dir(workdir.path())
+        .arg("extract")
+        .arg("--config")
+        .arg(&*config_arg)
+        .arg(input.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    let stderr_text = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr_text);
+}
+
+/// Passes a config file named `custom.Yaml` via `--config` and expects
+/// `extract` to succeed. Returns early when the fixture is missing.
+#[test]
+fn test_case_insensitive_yaml_extension() {
+    build_binary();
+
+    let sandbox = tempdir().unwrap();
+    let yaml_file = sandbox.path().join("custom.Yaml");
+    fs::write(&yaml_file, "\nuse_cache: false\n").unwrap();
+
+    let sample = get_test_file("text/simple.txt");
+    let sample_present = PathBuf::from(&sample).exists();
+    if !sample_present {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let yaml_arg: String = yaml_file.to_string_lossy().into_owned();
+    let cli_args = ["extract", "--config", &yaml_arg, &sample];
+    let run = Command::new(get_binary_path())
+        .args(cli_args)
+        .current_dir(sandbox.path())
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    let succeeded = run.status.success();
+    assert!(succeeded, "Command failed: {}", String::from_utf8_lossy(&run.stderr));
+}
+
+/// Passes a config file named `custom.YML` via `--config` and expects
+/// `extract` to succeed. Returns early when the fixture is missing.
+#[test]
+fn test_case_insensitive_yml_extension() {
+    build_binary();
+
+    let workdir = tempdir().unwrap();
+    let config_file = workdir.path().join("custom.YML");
+    fs::write(&config_file, "\nuse_cache: false\n").unwrap();
+
+    let input = get_test_file("text/simple.txt");
+    if !PathBuf::from(&input).exists() {
+        tracing::debug!("Skipping test: {} not found", input);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy();
+    let mut cli = Command::new(get_binary_path());
+    cli.current_dir(workdir.path());
+    cli.arg("extract").arg("--config").arg(&*config_arg).arg(input.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    let stderr_text = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr_text);
+}
+
+/// Passes a config file named `custom.JSON` via `--config` and expects
+/// `extract` to succeed. Returns early when the fixture is missing.
+#[test]
+fn test_case_insensitive_json_extension() {
+    build_binary();
+
+    let json_body = r#"{
+    "use_cache": false
+}"#;
+    let sandbox = tempdir().unwrap();
+    let json_file = sandbox.path().join("custom.JSON");
+    fs::write(&json_file, json_body).unwrap();
+
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let json_arg: String = json_file.to_string_lossy().into_owned();
+    let run = Command::new(get_binary_path())
+        .args(["extract", "--config"])
+        .arg(json_arg.as_str())
+        .arg(sample.as_str())
+        .current_dir(sandbox.path())
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    let succeeded = run.status.success();
+    assert!(succeeded, "Command failed: {}", String::from_utf8_lossy(&run.stderr));
+}
+
+/// Supplies a TOML config through `--config` (without changing the working
+/// directory) and expects `extract` to succeed. Returns early when the fixture
+/// is missing.
+#[test]
+fn test_explicit_config_path_toml() {
+    build_binary();
+
+    let holder = tempdir().unwrap();
+    let config_file = holder.path().join("custom_config.toml");
+    let toml_body = "\nuse_cache = false\nenable_quality_processing = false\n";
+    fs::write(&config_file, toml_body).unwrap();
+
+    let input = get_test_file("text/simple.txt");
+    if !PathBuf::from(&input).exists() {
+        tracing::debug!("Skipping test: {} not found", input);
+        return;
+    }
+
+    let config_arg = config_file.to_string_lossy();
+    let mut cli = Command::new(get_binary_path());
+    cli.arg("extract").arg("--config").arg(&*config_arg).arg(input.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    let stderr_text = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr_text);
+}
+
+/// Supplies a YAML config through `--config` (without changing the working
+/// directory) and expects `extract` to succeed. Returns early when the fixture
+/// is missing.
+#[test]
+fn test_explicit_config_path_yaml() {
+    build_binary();
+
+    let holder = tempdir().unwrap();
+    let yaml_file = holder.path().join("custom_config.yaml");
+    let yaml_body = "\nuse_cache: false\nenable_quality_processing: false\n";
+    fs::write(&yaml_file, yaml_body).unwrap();
+
+    let sample = get_test_file("text/simple.txt");
+    let sample_present = PathBuf::from(&sample).exists();
+    if !sample_present {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let yaml_arg: String = yaml_file.to_string_lossy().into_owned();
+    let cli_args = ["extract", "--config", &yaml_arg, &sample];
+    let run = Command::new(get_binary_path())
+        .args(cli_args)
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    let succeeded = run.status.success();
+    assert!(succeeded, "Command failed: {}", String::from_utf8_lossy(&run.stderr));
+}
+
+/// Supplies a JSON config through `--config` (without changing the working
+/// directory) and expects `extract` to succeed. Returns early when the fixture
+/// is missing.
+#[test]
+fn test_explicit_config_path_json() {
+    build_binary();
+
+    let json_body = r#"{
+    "use_cache": false,
+    "enable_quality_processing": false
+}"#;
+    let holder = tempdir().unwrap();
+    let json_file = holder.path().join("custom_config.json");
+    fs::write(&json_file, json_body).unwrap();
+
+    let input = get_test_file("text/simple.txt");
+    if !PathBuf::from(&input).exists() {
+        tracing::debug!("Skipping test: {} not found", input);
+        return;
+    }
+
+    let json_arg = json_file.to_string_lossy();
+    let mut cli = Command::new(get_binary_path());
+    cli.args(["extract", "--config"]).arg(&*json_arg).arg(input.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    let stderr_text = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr_text);
+}
+
+/// A `--config` file with a `.txt` extension must make `extract` fail, and
+/// stderr must name at least one of `.toml`, `.yaml` or `.json`. Returns early
+/// when the fixture is missing.
+#[test]
+fn test_invalid_config_extension() {
+    build_binary();
+
+    let holder = tempdir().unwrap();
+    let bogus_config = holder.path().join("config.txt");
+    fs::write(&bogus_config, "invalid content").unwrap();
+
+    let input = get_test_file("text/simple.txt");
+    if !PathBuf::from(&input).exists() {
+        tracing::debug!("Skipping test: {} not found", input);
+        return;
+    }
+
+    let config_arg = bogus_config.to_string_lossy();
+    let mut cli = Command::new(get_binary_path());
+    cli.arg("extract").arg("--config").arg(&*config_arg).arg(input.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+
+    let stderr = String::from_utf8_lossy(&run.stderr);
+    let names_supported_extension = [".toml", ".yaml", ".json"]
+        .iter()
+        .any(|extension| stderr.contains(*extension));
+    assert!(
+        names_supported_extension,
+        "Error message should mention supported extensions: {}",
+        stderr
+    );
+}
+
+/// Syntactically broken TOML passed via `--config` must make `extract` fail.
+/// Returns early when the fixture is missing.
+#[test]
+fn test_malformed_toml_config() {
+    build_binary();
+
+    let holder = tempdir().unwrap();
+    let broken_config = holder.path().join("bad_config.toml");
+    fs::write(&broken_config, "use_cache = [[[[[").unwrap();
+
+    let input = get_test_file("text/simple.txt");
+    if !PathBuf::from(&input).exists() {
+        tracing::debug!("Skipping test: {} not found", input);
+        return;
+    }
+
+    let config_arg = broken_config.to_string_lossy();
+    let mut cli = Command::new(get_binary_path());
+    cli.arg("extract").arg("--config").arg(&*config_arg).arg(input.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+}
+
+/// Syntactically broken YAML passed via `--config` must make `extract` fail.
+/// Returns early when the fixture is missing.
+#[test]
+fn test_malformed_yaml_config() {
+    build_binary();
+
+    let holder = tempdir().unwrap();
+    let broken_config = holder.path().join("bad_config.yaml");
+    fs::write(&broken_config, "use_cache: [[[[[").unwrap();
+
+    let sample = get_test_file("text/simple.txt");
+    let sample_present = PathBuf::from(&sample).exists();
+    if !sample_present {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg: String = broken_config.to_string_lossy().into_owned();
+    let cli_args = ["extract", "--config", &config_arg, &sample];
+    let run = Command::new(get_binary_path())
+        .args(cli_args)
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+}
+
+/// Syntactically broken JSON passed via `--config` must make `extract` fail.
+/// Returns early when the fixture is missing.
+#[test]
+fn test_malformed_json_config() {
+    build_binary();
+
+    let broken_json = r#"{"use_cache": [[[[[}"#;
+    let holder = tempdir().unwrap();
+    let broken_config = holder.path().join("bad_config.json");
+    fs::write(&broken_config, broken_json).unwrap();
+
+    let input = get_test_file("text/simple.txt");
+    if !PathBuf::from(&input).exists() {
+        tracing::debug!("Skipping test: {} not found", input);
+        return;
+    }
+
+    let config_arg = broken_config.to_string_lossy();
+    let mut cli = Command::new(get_binary_path());
+    cli.args(["extract", "--config"]).arg(&*config_arg).arg(input.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+}
+
+/// Pointing `--config` at a path that was never created must make `extract`
+/// fail. Returns early when the fixture is missing.
+#[test]
+fn test_nonexistent_config_file() {
+    build_binary();
+
+    let holder = tempdir().unwrap();
+    // Deliberately never written to disk.
+    let missing_config = holder.path().join("nonexistent.toml");
+
+    let input = get_test_file("text/simple.txt");
+    if !PathBuf::from(&input).exists() {
+        tracing::debug!("Skipping test: {} not found", input);
+        return;
+    }
+
+    let config_arg = missing_config.to_string_lossy();
+    let mut cli = Command::new(get_binary_path());
+    cli.arg("extract").arg("--config").arg(&*config_arg).arg(input.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+}
+
+/// Runs `extract` from a freshly created, empty temporary directory and
+/// expects the command to succeed. Returns early when the fixture is missing.
+#[test]
+fn test_default_config_when_no_file_found() {
+    build_binary();
+
+    let empty_dir = tempdir().unwrap();
+
+    let sample = get_test_file("text/simple.txt");
+    if !PathBuf::from(&sample).exists() {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let mut cli = Command::new(get_binary_path());
+    cli.current_dir(empty_dir.path()).arg("extract").arg(sample.as_str());
+    let run = cli.output().expect("Failed to execute kreuzberg");
+
+    let stderr_text = String::from_utf8_lossy(&run.stderr);
+    assert!(run.status.success(), "Command failed: {}", stderr_text);
+}
+
+/// A TOML config that assigns a string to `use_cache` must make `extract`
+/// fail. Returns early when the fixture is missing.
+#[test]
+fn test_invalid_config_values() {
+    build_binary();
+
+    let holder = tempdir().unwrap();
+    let typed_wrong = holder.path().join("invalid.toml");
+    fs::write(&typed_wrong, "\nuse_cache = \"not_a_bool\"\n").unwrap();
+
+    let sample = get_test_file("text/simple.txt");
+    let sample_present = PathBuf::from(&sample).exists();
+    if !sample_present {
+        tracing::debug!("Skipping test: {} not found", sample);
+        return;
+    }
+
+    let config_arg: String = typed_wrong.to_string_lossy().into_owned();
+    let cli_args = ["extract", "--config", &config_arg, &sample];
+    let run = Command::new(get_binary_path())
+        .args(cli_args)
+        .output()
+        .expect("Failed to execute kreuzberg");
+
+    assert!(!run.status.success());
+}
--- a/crates/kreuzberg-cli/tests/config_env_overrides_test.rs
+++ b/crates/kreuzberg-cli/tests/config_env_overrides_test.rs
@@ -0,0 +1,46 @@
+//! Regression test for issue #773.
+//! Validates that environment variable overrides are correctly applied during configuration loading.
+
+use kreuzberg::{EmbeddingModelType, ExtractionConfig};
+
+/// Serializes the tests in this file that modify the process environment.
+///
+/// The test harness runs `#[test]` functions on parallel threads by default,
+/// and both tests below set a variable and then call `apply_env_overrides`.
+/// Holding this lock keeps one test's writes from overlapping the other's
+/// environment access.
+static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+/// Sets an environment variable on construction and removes it on drop, while
+/// holding [`ENV_LOCK`] for the whole lifetime of the guard.
+///
+/// Because cleanup happens in `Drop`, the variable is removed even when the
+/// code running under the guard panics.
+struct ScopedEnvVar {
+    key: &'static str,
+    _serialized: std::sync::MutexGuard<'static, ()>,
+}
+
+impl ScopedEnvVar {
+    /// Acquires [`ENV_LOCK`] and sets `key` to `value`.
+    fn set(key: &'static str, value: &str) -> Self {
+        // A panic in another test poisons the lock; the unit payload cannot be
+        // left inconsistent, so the poisoned guard is simply reused.
+        let serialized = ENV_LOCK.lock().unwrap_or_else(|poisoned| poisoned.into_inner());
+        // SAFETY: `ENV_LOCK` is held, so no other test in this binary touches
+        // the environment through this guard at the same time.
+        // NOTE(review): assumes nothing else in the process accesses the
+        // environment concurrently - confirm.
+        unsafe { std::env::set_var(key, value) };
+        Self {
+            key,
+            _serialized: serialized,
+        }
+    }
+}
+
+impl Drop for ScopedEnvVar {
+    fn drop(&mut self) {
+        // SAFETY: `_serialized` is still alive here; fields are dropped only
+        // after this method returns, so the lock is held during removal.
+        unsafe { std::env::remove_var(self.key) };
+    }
+}
+
+/// Regression test for issue #773: with `KREUZBERG_OCR_LANGUAGE=fra` set,
+/// `apply_env_overrides` must leave the config with an OCR section whose
+/// language is `fra`.
+#[test]
+fn test_regression_773_env_override_loading() {
+    let mut config = ExtractionConfig::default();
+
+    // Guard against a default that already equals the override value.
+    if let Some(ref ocr) = config.ocr {
+        assert_ne!(ocr.language, "fra");
+    }
+
+    {
+        let _env = ScopedEnvVar::set("KREUZBERG_OCR_LANGUAGE", "fra");
+        config.apply_env_overrides().expect("Failed to apply overrides");
+    }
+
+    let ocr = config
+        .ocr
+        .expect("OCR config should be Some when KREUZBERG_OCR_LANGUAGE is set");
+    assert_eq!(ocr.language, "fra");
+}
+
+/// Regression test for issue #773: with `KREUZBERG_VLM_EMBEDDING_MODEL` set,
+/// `apply_env_overrides` must produce a chunking config whose embedding model
+/// is the `Llm` variant carrying that model name and no API key.
+#[test]
+fn test_regression_773_vlm_embedding_env_override() {
+    let mut config = ExtractionConfig::default();
+
+    {
+        let _env = ScopedEnvVar::set("KREUZBERG_VLM_EMBEDDING_MODEL", "openai/text-embedding-3-small");
+        config
+            .apply_env_overrides()
+            .expect("Failed to apply environment overrides");
+    }
+
+    let chunking = config
+        .chunking
+        .expect("Chunking should be enabled when VLM embedding is set");
+    let embedding = chunking.embedding.expect("Embedding should be configured");
+
+    match embedding.model {
+        EmbeddingModelType::Llm { llm } => {
+            assert_eq!(llm.model, "openai/text-embedding-3-small");
+            assert!(llm.api_key.is_none());
+        }
+        _ => panic!("Expected Llm embedding model type"),
+    }
+}
--- a/crates/kreuzberg-cli/tests/config_tests.rs
+++ b/crates/kreuzberg-cli/tests/config_tests.rs
@@ -0,0 +1,344 @@
+//! CLI configuration tests for output-format and config-merge behaviour.
+//!
+//! These tests exercise the `kreuzberg::core::config` types directly (they do
+//! not spawn the CLI binary) and cover:
+//! 1. The output format values behind the --output-format flag
+//! 2. CLI-style overrides taking effect over config file settings
+//! 3. Config merge precedence (CLI args > config file > defaults), simulated via field assignment
+//! 4. Parsing of configuration supplied as inline JSON
+//! 5. Alias handling for deprecated flags (NOTE: listed as a goal; no test in this file covers it yet)
+
+#![allow(clippy::bool_assert_comparison)]
+#![allow(clippy::field_reassign_with_default)]
+
+use std::path::PathBuf;
+use tempfile::TempDir;
+
+/// Write `content` to a file called `name` inside `dir` and return the path of
+/// the new file.
+///
+/// # Panics
+///
+/// Panics with "Failed to write config file" if the write fails.
+#[allow(dead_code)]
+fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
+    let target = dir.path().join(name);
+    match std::fs::write(&target, content) {
+        Ok(()) => target,
+        Err(error) => panic!("Failed to write config file: {:?}", error),
+    }
+}
+
+/// A default `ExtractionConfig` must report `OutputFormat::Plain`.
+///
+/// Despite the test name, no `--output-format` flag is parsed here; only the
+/// library default is inspected.
+#[test]
+fn test_output_format_flag_plain() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let defaults = ExtractionConfig::default();
+    let expected = OutputFormat::Plain;
+    assert_eq!(defaults.output_format, expected, "Default output format should be Plain");
+}
+
+/// The `Debug` rendering of `OutputFormat::Markdown` must be `Markdown`.
+///
+/// Despite the test name, no `--output-format` flag is parsed here.
+#[test]
+fn test_output_format_flag_markdown() {
+    use kreuzberg::core::config::OutputFormat;
+
+    let rendered = format!("{:?}", OutputFormat::Markdown);
+    assert_eq!(
+        rendered, "Markdown",
+        "Markdown format should have correct debug representation"
+    );
+}
+
+/// The `Debug` rendering of `OutputFormat::Html` must be `Html`.
+///
+/// Despite the test name, no `--output-format` flag is parsed here.
+#[test]
+fn test_output_format_flag_html() {
+    use kreuzberg::core::config::OutputFormat;
+
+    let rendered = format!("{:?}", OutputFormat::Html);
+    assert_eq!(
+        rendered, "Html",
+        "Html format should have correct debug representation"
+    );
+}
+
+/// A config whose `output_format` is `Markdown` keeps that value and
+/// serializes the field as the lowercase string `markdown`.
+#[test]
+fn test_extraction_config_with_output_format() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        ..ExtractionConfig::default()
+    };
+    assert_eq!(
+        config.output_format,
+        OutputFormat::Markdown,
+        "output_format should be Markdown after assignment"
+    );
+
+    let as_json = serde_json::to_value(&config).expect("Failed to serialize");
+    let wire_format = &as_json["output_format"];
+    assert_eq!(
+        *wire_format, "markdown",
+        "Serialized output_format should be 'markdown' (lowercase)"
+    );
+}
+
+/// A JSON object naming several top-level settings deserializes into an
+/// `ExtractionConfig` carrying exactly those values.
+#[test]
+fn test_config_json_parsing_complete() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let document = serde_json::json!({
+        "use_cache": true,
+        "enable_quality_processing": true,
+        "force_ocr": false,
+        "output_format": "markdown",
+        "result_format": "unified",
+        "max_concurrent_extractions": 4,
+    });
+
+    let parsed = serde_json::from_value::<ExtractionConfig>(document).expect("Failed to parse config JSON");
+
+    assert!(parsed.use_cache);
+    assert!(parsed.enable_quality_processing);
+    assert!(!parsed.force_ocr);
+    assert_eq!(parsed.output_format, OutputFormat::Markdown);
+    assert_eq!(parsed.max_concurrent_extractions, Some(4));
+}
+
+/// Simulates CLI overrides by assigning fields on a default config and checks
+/// that the assigned values are the ones read back.
+#[test]
+fn test_config_merge_precedence_cli_overrides_default() {
+    use kreuzberg::core::config::ExtractionConfig;
+
+    // Stand-in for values that would come from CLI arguments.
+    let overridden = ExtractionConfig {
+        use_cache: false,
+        force_ocr: true,
+        ..ExtractionConfig::default()
+    };
+
+    assert!(!overridden.use_cache, "CLI override should change use_cache to false");
+    assert!(overridden.force_ocr, "CLI override should change force_ocr to true");
+}
+
+/// Simulates a file-provided config followed by a CLI override of
+/// `use_cache`: the overridden field changes, `force_ocr` keeps its file value.
+#[test]
+fn test_config_merge_precedence_cli_overrides_file() {
+    use kreuzberg::core::config::ExtractionConfig;
+
+    // Stand-in for settings loaded from a config file.
+    let from_file = ExtractionConfig {
+        use_cache: true,
+        force_ocr: false,
+        ..ExtractionConfig::default()
+    };
+
+    // Stand-in for a CLI flag that only touches `use_cache`.
+    let merged = ExtractionConfig {
+        use_cache: false,
+        ..from_file.clone()
+    };
+
+    assert!(!merged.use_cache, "CLI should override file config for use_cache");
+    assert!(!merged.force_ocr, "CLI should not affect fields not overridden");
+}
+
+/// Values parsed from JSON for `use_cache` and `force_ocr` must differ from
+/// the corresponding fields of a default config.
+#[test]
+fn test_config_file_precedence_over_defaults() {
+    use kreuzberg::core::config::ExtractionConfig;
+
+    let file_document = serde_json::json!({
+        "use_cache": false,
+        "force_ocr": true,
+    });
+    let from_file = serde_json::from_value::<ExtractionConfig>(file_document).expect("Failed to parse");
+    let defaults = ExtractionConfig::default();
+
+    assert_ne!(
+        from_file.use_cache, defaults.use_cache,
+        "File config should override default for use_cache"
+    );
+    assert_ne!(
+        from_file.force_ocr, defaults.force_ocr,
+        "File config should override default for force_ocr"
+    );
+}
+
+/// `Plain`, `Markdown` and `Html` serialize to the JSON strings `plain`,
+/// `markdown` and `html` respectively.
+#[test]
+fn test_output_format_serialization() {
+    use kreuzberg::core::config::OutputFormat;
+
+    // (variant, expected JSON string, panic message if serialization fails)
+    let cases = [
+        (OutputFormat::Plain, "plain", "Failed to serialize Plain"),
+        (OutputFormat::Markdown, "markdown", "Failed to serialize Markdown"),
+        (OutputFormat::Html, "html", "Failed to serialize Html"),
+    ];
+
+    for (variant, wire_name, failure) in cases {
+        let encoded = serde_json::to_value(variant).expect(failure);
+        assert_eq!(encoded, wire_name);
+    }
+}
+
+/// The JSON strings `plain`, `markdown` and `html` deserialize to the
+/// matching `OutputFormat` variants.
+#[test]
+fn test_output_format_deserialization() {
+    use kreuzberg::core::config::OutputFormat;
+
+    // (JSON string, expected variant, panic message if deserialization fails)
+    let cases = [
+        ("plain", OutputFormat::Plain, "Failed to deserialize plain"),
+        ("markdown", OutputFormat::Markdown, "Failed to deserialize markdown"),
+        ("html", OutputFormat::Html, "Failed to deserialize html"),
+    ];
+
+    for (wire_name, expected, failure) in cases {
+        let encoded = serde_json::Value::String(wire_name.to_owned());
+        let decoded: OutputFormat = serde_json::from_value(encoded).expect(failure);
+        assert_eq!(decoded, expected);
+    }
+}
+
+/// Serializing a config to a JSON string and parsing it back must preserve
+/// `output_format`.
+#[test]
+fn test_extraction_config_roundtrip_with_output_format() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let mut before = ExtractionConfig::default();
+    before.output_format = OutputFormat::Markdown;
+
+    let encoded = serde_json::to_string(&before).expect("Failed to serialize");
+    let after = serde_json::from_str::<ExtractionConfig>(&encoded).expect("Failed to deserialize");
+
+    assert_eq!(
+        before.output_format, after.output_format,
+        "output_format should survive serialization roundtrip"
+    );
+}
+
+/// For each of `Plain`, `Markdown` and `Html`, a config carrying that format
+/// must come back unchanged from a `serde_json::Value` round trip.
+#[test]
+fn test_config_with_all_output_formats() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    for format in [OutputFormat::Plain, OutputFormat::Markdown, OutputFormat::Html] {
+        let mut config = ExtractionConfig::default();
+        config.output_format = format.clone();
+
+        let encoded = serde_json::to_value(&config).expect("Failed to serialize");
+        let decoded = serde_json::from_value::<ExtractionConfig>(encoded).expect("Failed to deserialize");
+
+        assert_eq!(
+            format, decoded.output_format,
+            "Format should be preserved for {:?}",
+            format
+        );
+    }
+}
+
+/// A JSON object containing only `output_format` parses successfully; the
+/// named field is applied and `use_cache` ends up `true`.
+#[test]
+fn test_config_partial_json_with_output_format() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let partial = serde_json::json!({
+        "output_format": "markdown",
+    });
+    let parsed = serde_json::from_value::<ExtractionConfig>(partial).expect("Failed to parse partial config");
+
+    assert_eq!(
+        parsed.output_format,
+        OutputFormat::Markdown,
+        "output_format should be set from partial config"
+    );
+
+    // Fields absent from the JSON fall back to their defaults.
+    assert!(parsed.use_cache, "use_cache should have default value");
+}
+
+/// The JSON form of a default config is an object that contains each of the
+/// keys listed below.
+#[test]
+fn test_config_complete_json_structure() {
+    use kreuzberg::core::config::ExtractionConfig;
+
+    let encoded = serde_json::to_value(ExtractionConfig::default()).expect("Failed to serialize");
+    let fields = encoded.as_object().expect("Should be object");
+
+    // (required key, assertion message when the key is absent)
+    let required = [
+        ("output_format", "Should have output_format"),
+        ("use_cache", "Should have use_cache"),
+        ("enable_quality_processing", "Should have enable_quality_processing"),
+        ("force_ocr", "Should have force_ocr"),
+        ("result_format", "Should have result_format"),
+    ];
+
+    for (key, message) in required {
+        assert!(fields.contains_key(key), "{}", message);
+    }
+}
+
+/// An unrecognized `output_format` string must deserialize successfully into
+/// `OutputFormat::Custom` holding that string, rather than being rejected.
+#[test]
+fn test_unknown_output_format_accepted_as_custom() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let document = serde_json::json!({
+        "output_format": "my_custom_renderer",
+    });
+
+    let parsed = match serde_json::from_value::<ExtractionConfig>(document) {
+        Ok(config) => config,
+        Err(error) => panic!(
+            "Unknown output_format should be accepted as Custom variant; got: {:?}",
+            Some(error)
+        ),
+    };
+
+    let expected = OutputFormat::Custom("my_custom_renderer".to_string());
+    assert_eq!(
+        parsed.output_format, expected,
+        "Unknown format string must deserialize as OutputFormat::Custom"
+    );
+}
+
+/// The lowercase string `plain` is accepted for `output_format` and maps to
+/// `OutputFormat::Plain`.
+///
+/// Only the lowercase spelling is exercised here; this test makes no claim
+/// about how other capitalizations are handled.
+#[test]
+fn test_config_case_sensitivity() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let document = serde_json::json!({"output_format": "plain"});
+    let outcome = serde_json::from_value::<ExtractionConfig>(document);
+
+    assert!(outcome.is_ok(), "lowercase 'plain' should be accepted");
+    let parsed = outcome.unwrap();
+    assert_eq!(parsed.output_format, OutputFormat::Plain);
+}
+
+/// The serialized form of a default config must contain an `output_format`
+/// entry.
+#[test]
+fn test_output_format_field_is_required_in_serialization() {
+    use kreuzberg::core::config::ExtractionConfig;
+
+    let defaults = ExtractionConfig::default();
+    let encoded = serde_json::to_value(&defaults).expect("Failed to serialize");
+
+    let has_output_format = encoded.get("output_format").is_some();
+    assert!(
+        has_output_format,
+        "output_format should always be present in serialization"
+    );
+}
+
+/// After changing only `output_format`, the serialized config shows the new
+/// `output_format` and still carries a string-valued `result_format`.
+#[test]
+fn test_result_format_and_output_format_independent() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    // Only output_format is changed; result_format keeps its default.
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        ..ExtractionConfig::default()
+    };
+
+    let encoded = serde_json::to_value(&config).expect("Failed to serialize");
+
+    assert_eq!(encoded["output_format"], "markdown");
+    let result_format_is_string = encoded["result_format"].is_string();
+    assert!(result_format_is_string, "result_format should also be present");
+}
+
+/// Cloning a config must yield a copy with the same `output_format`.
+#[test]
+fn test_extraction_config_clone_preserves_format() {
+    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+
+    let mut source = ExtractionConfig::default();
+    source.output_format = OutputFormat::Html;
+
+    let copy = source.clone();
+
+    assert_eq!(
+        source.output_format, copy.output_format,
+        "Cloned config should preserve output_format"
+    );
+}
--- a/crates/kreuzberg-cli/tests/contract_cli.rs
+++ b/crates/kreuzberg-cli/tests/contract_cli.rs
@@ -0,0 +1,355 @@
+//! CLI contract tests - verify CLI config parsing matches Rust core
+//!
+//! This test suite validates that the CLI's configuration parsing produces
+//! identical results to the Rust core library. It ensures that users get
+//! consistent behavior whether using the CLI, SDK, or MCP interfaces.
+
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::config::OutputFormat;
+use serde_json::json;
+
+/// Deserializing a config string directly and deserializing it through an
+/// intermediate `serde_json::Value` (simulating `--config-json`) must agree.
+#[test]
+fn test_cli_config_json_flag_basic_parsing() {
+    let raw = r#"{"use_cache": true, "output_format": "plain"}"#;
+
+    // One step: string -> config.
+    let direct: ExtractionConfig = serde_json::from_str(raw).expect("Failed to deserialize config string");
+
+    // Two steps: string -> JSON value -> config.
+    let as_value: serde_json::Value = serde_json::from_str(raw).expect("Failed to parse JSON string");
+    let via_value: ExtractionConfig =
+        serde_json::from_value(as_value).expect("Failed to deserialize from JSON value");
+
+    assert_eq!(direct.use_cache, via_value.use_cache, "use_cache should be identical");
+    assert_eq!(
+        direct.output_format, via_value.output_format,
+        "output_format should be identical"
+    );
+}
+
+/// Nested `chunking` and `ocr` objects must deserialize into their
+/// corresponding optional sub-configs with the supplied values.
+#[test]
+fn test_cli_nested_config_deserialization() {
+    let config_str = r#"{
+        "chunking": {
+            "max_characters": 1000,
+            "overlap": 200
+        },
+        "ocr": {
+            "backend": "tesseract"
+        }
+    }"#;
+
+    let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize nested config");
+
+    // `expect` both checks presence and unwraps, so a missing section fails
+    // with a message naming the section.
+    let chunking = config.chunking.expect("Chunking config should be present");
+    // Messages name the fields that are actually asserted.
+    assert_eq!(chunking.max_characters, 1000, "max_characters should be 1000");
+    assert_eq!(chunking.overlap, 200, "overlap should be 200");
+
+    let ocr = config.ocr.expect("OCR config should be present");
+    assert_eq!(ocr.backend, "tesseract", "backend should be tesseract");
+}
+
+/// A config containing only `force_ocr` enables it and leaves `use_cache`
+/// at its default of `true`.
+#[test]
+fn test_cli_force_ocr_flag_parsing() {
+    let parsed: ExtractionConfig =
+        serde_json::from_str(r#"{"force_ocr": true}"#).expect("Failed to deserialize force_ocr config");
+
+    // The explicitly supplied flag is honoured...
+    assert!(parsed.force_ocr, "force_ocr should be true");
+    // ...while an omitted field still carries its default.
+    assert!(parsed.use_cache, "use_cache should still be true by default");
+}
+
+/// `max_concurrent_extractions` given as a JSON number becomes `Some(n)`.
+#[test]
+fn test_cli_max_concurrent_extractions_parsing() {
+    let parsed = serde_json::from_str::<ExtractionConfig>(r#"{"max_concurrent_extractions": 8}"#)
+        .expect("Failed to deserialize concurrent extractions");
+
+    let limit = parsed.max_concurrent_extractions;
+    assert_eq!(limit, Some(8), "max_concurrent_extractions should be 8");
+}
+
+/// A config mixing top-level flags with nested `ocr` and `chunking` sections
+/// must deserialize with every supplied value intact.
+#[test]
+fn test_cli_complex_config_deserialization() {
+    let config_str = r#"{
+        "use_cache": false,
+        "enable_quality_processing": true,
+        "force_ocr": true,
+        "output_format": "markdown",
+        "result_format": "unified",
+        "max_concurrent_extractions": 16,
+        "ocr": {
+            "backend": "tesseract",
+            "language": "eng"
+        },
+        "chunking": {
+            "max_characters": 2000,
+            "overlap": 400,
+            "strategy": "sliding_window"
+        }
+    }"#;
+
+    let config: ExtractionConfig = serde_json::from_str(config_str).expect("Failed to deserialize complex config");
+
+    // Verify top-level fields, including the output format supplied above.
+    assert!(!config.use_cache);
+    assert!(config.enable_quality_processing);
+    assert!(config.force_ocr);
+    assert_eq!(config.output_format, OutputFormat::Markdown);
+    assert_eq!(config.max_concurrent_extractions, Some(16));
+    // NOTE(review): `result_format` is only checked implicitly (deserialization
+    // succeeded); its enum type is not visible here, so no value assertion is made.
+
+    // Verify nested configs; `expect` fails with a clear message if a section is missing.
+    let ocr = config.ocr.expect("OCR config should be present");
+    assert_eq!(ocr.backend, "tesseract");
+    assert_eq!(ocr.language, "eng");
+
+    let chunking = config.chunking.expect("Chunking config should be present");
+    assert_eq!(chunking.max_characters, 2000);
+    assert_eq!(chunking.overlap, 400);
+}
+
+/// An empty JSON object must deserialize to the documented default values.
+#[test]
+fn test_cli_empty_config_uses_defaults() {
+    let defaults = serde_json::from_str::<ExtractionConfig>(r#"{}"#).expect("Failed to deserialize empty config");
+
+    assert!(defaults.use_cache, "Default use_cache should be true");
+    assert!(
+        defaults.enable_quality_processing,
+        "Default enable_quality_processing should be true"
+    );
+    assert!(!defaults.force_ocr, "Default force_ocr should be false");
+
+    let limit = defaults.max_concurrent_extractions;
+    assert_eq!(limit, None, "Default max_concurrent_extractions should be None");
+}
+
+/// Values supplied in JSON must survive a deserialize -> serialize ->
+/// deserialize cycle.
+#[test]
+fn test_cli_roundtrip_preserves_all_fields() {
+    let input = r#"{
+        "use_cache": false,
+        "force_ocr": true,
+        "max_concurrent_extractions": 12
+    }"#;
+
+    // string -> config -> JSON value -> config
+    let first_pass: ExtractionConfig = serde_json::from_str(input).expect("Failed to deserialize");
+    let as_value = serde_json::to_value(&first_pass).expect("Failed to serialize");
+    let second_pass: ExtractionConfig =
+        serde_json::from_value(as_value).expect("Failed to deserialize roundtripped config");
+
+    assert!(!second_pass.use_cache);
+    assert!(second_pass.force_ocr);
+    assert_eq!(second_pass.max_concurrent_extractions, Some(12));
+}
+
+/// Each lowercase `output_format` string maps to its `OutputFormat` variant.
+#[test]
+fn test_cli_output_format_enum_parsing() {
+    let expectations = [
+        (r#"{"output_format": "plain"}"#, OutputFormat::Plain),
+        (r#"{"output_format": "markdown"}"#, OutputFormat::Markdown),
+        (r#"{"output_format": "html"}"#, OutputFormat::Html),
+    ];
+
+    for (raw, expected) in expectations {
+        let parsed = serde_json::from_str::<ExtractionConfig>(raw)
+            .unwrap_or_else(|_| panic!("Failed to deserialize {}", raw));
+
+        assert_eq!(
+            parsed.output_format, expected,
+            "output_format should match expected value"
+        );
+    }
+}
+
+/// Both known `result_format` strings must be accepted by the deserializer.
+#[test]
+fn test_cli_result_format_enum_parsing() {
+    let accepted = [
+        r#"{"result_format": "unified"}"#,
+        r#"{"result_format": "element_based"}"#,
+    ];
+
+    for raw in accepted {
+        let outcome: Result<ExtractionConfig, _> = serde_json::from_str(raw);
+        assert!(outcome.is_ok(), "Should deserialize result_format from {}", raw);
+    }
+}
+
+/// Simulates `--config-json-base64`: a JSON config is base64-encoded,
+/// decoded again, and must then deserialize to the original settings.
+#[test]
+fn test_cli_base64_encoded_config_simulation() {
+    // The `Engine` trait supplies `encode`/`decode` on the STANDARD engine.
+    use base64::Engine;
+    use base64::engine::general_purpose::STANDARD;
+
+    let payload = json!({
+        "force_ocr": true,
+        "output_format": "markdown"
+    })
+    .to_string();
+
+    // Encode, then decode the same way the CLI is expected to.
+    let encoded = STANDARD.encode(&payload);
+    let decoded_bytes = STANDARD.decode(&encoded).expect("Failed to decode base64");
+    let decoded = String::from_utf8(decoded_bytes).expect("Failed to convert bytes to string");
+
+    let parsed: ExtractionConfig = serde_json::from_str(&decoded).expect("Failed to deserialize base64-decoded config");
+
+    assert!(parsed.force_ocr);
+    assert_eq!(parsed.output_format, OutputFormat::Markdown);
+}
+
+/// Simulates a CLI-style merge: the default config is serialized, override
+/// keys are written on top, and the result is deserialized again.
+#[test]
+fn test_cli_partial_override_merging() {
+    let mut merged_json =
+        serde_json::to_value(ExtractionConfig::default()).expect("Failed to serialize base config");
+    let overrides = json!({"force_ocr": true, "use_cache": false});
+
+    // Both sides are JSON objects here; override entries replace same-named base entries.
+    if let (serde_json::Value::Object(target), serde_json::Value::Object(patch)) = (&mut merged_json, overrides) {
+        target.extend(patch);
+    }
+
+    let merged: ExtractionConfig = serde_json::from_value(merged_json).expect("Failed to deserialize merged config");
+
+    assert!(merged.force_ocr, "Override should apply force_ocr");
+    assert!(!merged.use_cache, "Override should apply use_cache");
+    assert!(
+        merged.enable_quality_processing,
+        "Unoverridden field should retain default"
+    );
+}
+
+/// Feeds syntactically valid JSON that carries an unknown key. If the config
+/// type accepts it, the known `force_ocr` value must still be applied.
+#[test]
+fn test_cli_invalid_json_error_handling() {
+    let with_unknown_field = r#"{"force_ocr": true, "invalid_field": "value"}"#;
+
+    // A `deny_unknown_fields` config would reject this input; that outcome is
+    // tolerated, so the test simply ends without further checks in that case.
+    let Ok(config) = serde_json::from_str::<ExtractionConfig>(with_unknown_field) else {
+        return;
+    };
+
+    assert!(config.force_ocr);
+}
+
+/// The same config must parse regardless of how the JSON is spaced.
+#[test]
+fn test_cli_whitespace_handling_in_json() {
+    let variants = [
+        r#"{"force_ocr":true}"#,     // compact
+        r#"{ "force_ocr" : true }"#, // padded with spaces
+        r#"{
+            "force_ocr": true
+        }"#, // multi-line with indentation
+    ];
+
+    for raw in variants {
+        let parsed = serde_json::from_str::<ExtractionConfig>(raw)
+            .unwrap_or_else(|_| panic!("Failed to parse: {}", raw));
+
+        assert!(parsed.force_ocr);
+    }
+}
+
+/// Small, large, and zero values of `max_concurrent_extractions` are all
+/// parsed verbatim into `Some(n)`.
+#[test]
+fn test_cli_numeric_boundary_values() {
+    let expectations = [
+        (r#"{"max_concurrent_extractions": 1}"#, Some(1)),
+        (r#"{"max_concurrent_extractions": 256}"#, Some(256)),
+        (r#"{"max_concurrent_extractions": 0}"#, Some(0)), // Edge case: 0 extractions
+    ];
+
+    for (raw, expected) in expectations {
+        let parsed = serde_json::from_str::<ExtractionConfig>(raw)
+            .unwrap_or_else(|_| panic!("Failed to parse: {}", raw));
+
+        assert_eq!(
+            parsed.max_concurrent_extractions, expected,
+            "Numeric values should be parsed correctly"
+        );
+    }
+}
+
+/// JSON `true` / `false` literals map directly onto the `use_cache` flag.
+#[test]
+fn test_cli_boolean_values_strict_parsing() {
+    let expectations = [(r#"{"use_cache": true}"#, true), (r#"{"use_cache": false}"#, false)];
+
+    for (raw, expected) in expectations {
+        let parsed = serde_json::from_str::<ExtractionConfig>(raw)
+            .unwrap_or_else(|_| panic!("Failed to parse: {}", raw));
+
+        assert_eq!(parsed.use_cache, expected);
+    }
+}
+
+/// A config built in code must come back unchanged after being serialized
+/// to JSON text and parsed again (simulating CLI parsing).
+#[test]
+fn test_cli_config_consistency_across_formats() {
+    let expected = ExtractionConfig {
+        use_cache: false,
+        enable_quality_processing: true,
+        force_ocr: true,
+        output_format: OutputFormat::Markdown,
+        max_concurrent_extractions: Some(4),
+        ..Default::default()
+    };
+
+    // config -> JSON value -> JSON text -> config
+    let wire_text = serde_json::to_value(&expected)
+        .expect("Failed to serialize")
+        .to_string();
+    let actual: ExtractionConfig = serde_json::from_str(&wire_text).expect("Failed to deserialize from string");
+
+    assert_eq!(actual.use_cache, expected.use_cache);
+    assert_eq!(actual.enable_quality_processing, expected.enable_quality_processing);
+    assert_eq!(actual.force_ocr, expected.force_ocr);
+    assert_eq!(actual.output_format, expected.output_format);
+    assert_eq!(actual.max_concurrent_extractions, expected.max_concurrent_extractions);
+}
+
+// Note: `base64::Engine` is imported locally inside
+// `test_cli_base64_encoded_config_simulation`, the only test that needs it;
+// no file-level import or re-export is required.
--- a/crates/kreuzberg-cli/tests/e2e_config_test.rs
+++ b/crates/kreuzberg-cli/tests/e2e_config_test.rs
@@ -0,0 +1,603 @@
+//! Comprehensive CLI end-to-end integration tests for configuration flags.
+//!
+//! This test suite validates the new configuration features including:
+//! - `--config-json` for inline JSON configuration
+//! - `--config-json-base64` for base64-encoded JSON configuration
+//! - `--output-format` flag with all variants (plain, markdown, djot, html)
+//! - Flag precedence (CLI args > JSON config > file > defaults)
+//! - Config merge scenarios and conflict detection
+//! - Error handling for invalid inputs
+//! - Real extraction with new formats
+
+#![allow(clippy::bool_assert_comparison)]
+
+use std::path::PathBuf;
+use std::process::Command;
+use tempfile::TempDir;
+
+/// Path to the debug build of the `kreuzberg` executable, expressed relative
+/// to this crate's manifest directory.
+fn get_binary_path() -> String {
+    format!("{}/../../target/debug/kreuzberg", env!("CARGO_MANIFEST_DIR"))
+}
+
+/// Locate `test_documents/`, which sits two directory levels above this
+/// crate's manifest directory.
+///
+/// Panics if the manifest directory has fewer than two ancestors.
+fn get_test_documents_dir() -> PathBuf {
+    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    // `ancestors()` yields the path itself first, so index 2 is the grandparent.
+    let workspace_root = crate_dir.ancestors().nth(2).unwrap();
+    workspace_root.join("test_documents")
+}
+
+/// Resolve `relative_path` against `test_documents/` and return it as a
+/// (lossily converted) string.
+fn get_test_file(relative_path: &str) -> String {
+    let full_path = get_test_documents_dir().join(relative_path);
+    full_path.to_string_lossy().into_owned()
+}
+
+/// Run `cargo build --bin kreuzberg` and panic unless it succeeds.
+/// Every test calls this itself before invoking the binary.
+fn build_binary() {
+    let build_succeeded = Command::new("cargo")
+        .args(["build", "--bin", "kreuzberg"])
+        .status()
+        .expect("Failed to build kreuzberg binary")
+        .success();
+
+    assert!(build_succeeded, "Failed to build kreuzberg binary");
+}
+
+/// Write `content` to a file called `name` inside `dir` and return the
+/// file's path. Panics if the file cannot be written.
+fn create_test_config(dir: &TempDir, name: &str, content: &str) -> PathBuf {
+    let target = dir.path().join(name);
+    std::fs::write(&target, content).expect("Failed to write config file");
+    target
+}
+
+/// Encode the UTF-8 bytes of `input` as base64 using the standard alphabet
+/// with `=` padding. An empty input yields an empty string.
+fn to_base64(input: &str) -> String {
+    const CHARSET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+    // Map the low six bits of `value` to the matching alphabet character.
+    let sextet = |value: u32| CHARSET[(value & 0x3F) as usize] as char;
+
+    let mut encoded = String::new();
+    for group in input.as_bytes().chunks(3) {
+        // Pack up to three bytes into 24 bits; bytes missing from a short
+        // final group count as zero.
+        let byte_at = |idx: usize| u32::from(group.get(idx).copied().unwrap_or(0));
+        let packed = (byte_at(0) << 16) | (byte_at(1) << 8) | byte_at(2);
+
+        // The first two characters always exist; the last two are replaced
+        // by padding when the group holds fewer than two / three bytes.
+        encoded.push(sextet(packed >> 18));
+        encoded.push(sextet(packed >> 12));
+        encoded.push(if group.len() > 1 { sextet(packed >> 6) } else { '=' });
+        encoded.push(if group.len() > 2 { sextet(packed) } else { '=' });
+    }
+
+    encoded
+}
+
+// ============================================================================
+// Test 1: --config-json inline flag with complex configuration
+// ============================================================================
+
+/// `extract --config-json <inline JSON>` must exit successfully and print
+/// something to stdout. Skipped when the fixture file is absent.
+#[test]
+fn test_cli_config_json_inline() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    let inline_config = r#"{"use_cache": false, "chunking": {"max_chars": 512}}"#;
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .args(["--config-json", inline_config])
+        .output()
+        .expect("Failed to execute extract command with --config-json");
+
+    assert!(
+        run.status.success(),
+        "Extract command with --config-json failed: {}",
+        String::from_utf8_lossy(&run.stderr)
+    );
+
+    // Non-empty bytes always convert to a non-empty (lossy) string.
+    assert!(!run.stdout.is_empty(), "Output should not be empty");
+}
+
+// ============================================================================
+// Test 2: --config-json-base64 flag for base64-encoded configuration
+// ============================================================================
+
+/// `extract --config-json-base64 <payload>` must exit successfully and print
+/// something to stdout. Skipped when the fixture file is absent.
+#[test]
+fn test_cli_config_json_base64() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    // The flag takes the JSON config in base64 form.
+    let encoded_config = to_base64(r#"{"use_cache": false}"#);
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .args(["--config-json-base64", encoded_config.as_str()])
+        .output()
+        .expect("Failed to execute extract command with --config-json-base64");
+
+    assert!(
+        run.status.success(),
+        "Extract command with --config-json-base64 failed: {}",
+        String::from_utf8_lossy(&run.stderr)
+    );
+
+    assert!(!run.stdout.is_empty(), "Output should not be empty");
+}
+
+// ============================================================================
+// Test 3: Flag precedence verification (CLI flags > JSON > file > defaults)
+// ============================================================================
+
+/// Supplies a TOML config file together with a conflicting `--config-json`
+/// value and checks that the command exits successfully.
+///
+/// NOTE(review): only the exit status is checked; the effective value of
+/// `use_cache` is not observed by this test.
+#[test]
+fn test_cli_flag_precedence() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    let scratch = TempDir::new().expect("Failed to create temp directory");
+
+    // File-level settings that the inline JSON below contradicts.
+    let toml_path = create_test_config(
+        &scratch,
+        "config.toml",
+        r#"
+use_cache = true
+
+[chunking]
+max_chars = 1024
+"#,
+    );
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .arg("--config")
+        .arg(toml_path.to_string_lossy().as_ref())
+        .args(["--config-json", r#"{"use_cache": false}"#])
+        .output()
+        .expect("Failed to execute command with precedence test");
+
+    assert!(
+        run.status.success(),
+        "Precedence test command failed: {}",
+        String::from_utf8_lossy(&run.stderr)
+    );
+}
+
+// ============================================================================
+// Test 4: --output-format flag with all variants (plain, markdown, djot, html)
+// ============================================================================
+
+/// Runs `extract --output-format <f>` for each supported format name and
+/// requires success plus non-empty stdout every time.
+#[test]
+fn test_cli_output_format_all_variants() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    for format in ["plain", "markdown", "djot", "html"] {
+        let run = Command::new(get_binary_path())
+            .arg("extract")
+            .arg(input_path.as_str())
+            .args(["--output-format", format])
+            .output()
+            .unwrap_or_else(|_| panic!("Failed to execute extract with --output-format {}", format));
+
+        assert!(
+            run.status.success(),
+            "Extract command with --output-format {} failed: {}",
+            format,
+            String::from_utf8_lossy(&run.stderr)
+        );
+
+        assert!(!run.stdout.is_empty(), "Output for format {} should not be empty", format);
+    }
+}
+
+// ============================================================================
+// Test 5: Output formats (text vs json) for extraction result
+// ============================================================================
+
+/// `--format text` must produce non-empty output, and `--format json` must
+/// produce a JSON envelope holding `result` (with `content` and `mime_type`)
+/// and `extraction_time_ms`.
+#[test]
+fn test_cli_result_format() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    // --- text rendering ---
+    let text_run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .args(["--format", "text"])
+        .output()
+        .expect("Failed to execute extract with --format text");
+
+    assert!(
+        text_run.status.success(),
+        "Text format output failed: {}",
+        String::from_utf8_lossy(&text_run.stderr)
+    );
+    assert!(!text_run.stdout.is_empty(), "Text output should not be empty");
+
+    // --- JSON rendering ---
+    let json_run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .args(["--format", "json"])
+        .output()
+        .expect("Failed to execute extract with --format json");
+
+    assert!(
+        json_run.status.success(),
+        "JSON format output failed: {}",
+        String::from_utf8_lossy(&json_run.stderr)
+    );
+
+    let json_text = String::from_utf8_lossy(&json_run.stdout);
+    let envelope: serde_json::Value = serde_json::from_str(&json_text)
+        .unwrap_or_else(|_| panic!("JSON output should be valid JSON, got: {}", json_text));
+
+    // Envelope-level keys first, then the keys nested under `result`.
+    assert!(
+        envelope.get("result").is_some(),
+        "JSON envelope should have 'result' field"
+    );
+    assert!(
+        envelope.get("extraction_time_ms").is_some(),
+        "JSON envelope should have 'extraction_time_ms' field"
+    );
+
+    let result = &envelope["result"];
+    assert!(result.get("content").is_some(), "result should have 'content' field");
+    assert!(
+        result.get("mime_type").is_some(),
+        "result should have 'mime_type' field"
+    );
+}
+
+// ============================================================================
+// Test 6: Deprecated --content-format flag warning
+// ============================================================================
+
+/// Invokes `extract` with the deprecated `--content-format` flag and accepts
+/// either a successful exit or any stdout output.
+#[test]
+fn test_cli_content_format_deprecated_warning() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .args(["--content-format", "plain"])
+        .output()
+        .expect("Failed to execute extract with --content-format");
+
+    // Whether a deprecation warning is printed is implementation-defined, so
+    // the check is deliberately loose: success, or at least some output.
+    let produced_output = !run.stdout.is_empty();
+    assert!(
+        run.status.success() || produced_output,
+        "Command should succeed or produce output"
+    );
+}
+
+// ============================================================================
+// Test 7: Config merge scenarios - multiple configuration sources
+// ============================================================================
+
+/// Combines a TOML config file with inline `--config-json` and requires the
+/// command to exit successfully.
+#[test]
+fn test_cli_config_merge_scenarios() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    let scratch = TempDir::new().expect("Failed to create temp directory");
+
+    // Base settings supplied through a file.
+    let base_path = create_test_config(
+        &scratch,
+        "base.toml",
+        r#"
+use_cache = true
+
+[chunking]
+max_chars = 1024
+"#,
+    );
+
+    // File config plus inline JSON that targets a key the file also sets.
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .arg("--config")
+        .arg(base_path.to_string_lossy().as_ref())
+        .args(["--config-json", r#"{"use_cache": false}"#])
+        .output()
+        .expect("Failed to merge configs");
+
+    assert!(
+        run.status.success(),
+        "Config merge failed: {}",
+        String::from_utf8_lossy(&run.stderr)
+    );
+}
+
+// ============================================================================
+// Test 8: Invalid JSON error handling
+// ============================================================================
+
+/// Malformed `--config-json` input must make the command fail and emit some
+/// feedback on stderr or stdout.
+#[test]
+fn test_cli_invalid_json_error() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    // Deliberately truncated JSON: the object and string are never closed.
+    let malformed = r#"{"invalid json without closing"#;
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .args(["--config-json", malformed])
+        .output()
+        .expect("Failed to execute command");
+
+    assert!(!run.status.success(), "Command should fail with invalid JSON");
+
+    // At least one of the two streams has to carry a message.
+    let silent = run.stderr.is_empty() && run.stdout.is_empty();
+    assert!(!silent, "Should provide feedback about invalid JSON");
+}
+
+// ============================================================================
+// Test 9: Config flag conflicts
+// ============================================================================
+
+/// Passes `--config`, `--config-json` and `--config-json-base64` together
+/// and checks that the process terminates normally.
+///
+/// Whether the CLI accepts this combination or rejects it as mutually
+/// exclusive is implementation-defined, so no particular exit status is
+/// required, only that the process was not killed by a signal.
+#[test]
+fn test_cli_conflicts() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !PathBuf::from(&test_file).exists() {
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    let temp_dir = TempDir::new().expect("Failed to create temp directory");
+    let config_content = "use_cache = true\n";
+    let config_path = create_test_config(&temp_dir, "config.toml", config_content);
+
+    // Using both --config-json and --config-json-base64 might conflict
+    let json_config = r#"{"use_cache": false}"#;
+    let base64_config = to_base64(json_config);
+
+    let output = Command::new(get_binary_path())
+        .args([
+            "extract",
+            test_file.as_str(),
+            "--config",
+            config_path.to_string_lossy().as_ref(),
+            "--config-json",
+            r#"{"chunking": {"max_chars": 512}}"#,
+            "--config-json-base64",
+            base64_config.as_str(),
+        ])
+        .output()
+        .expect("Failed to execute command with potential conflicts");
+
+    // `code()` is `None` only when the process was terminated by a signal
+    // (Unix), which is the "crash" case this test is meant to rule out.
+    assert!(
+        output.status.code().is_some(),
+        "Command should exit normally instead of being killed by a signal (stderr: {})",
+        String::from_utf8_lossy(&output.stderr)
+    );
+}
+
+// ============================================================================
+// Test 10: Real end-to-end extraction with new config formats
+// ============================================================================
+
+/// Full run combining `--format json`, `--output-format markdown` and an
+/// inline `--config-json`; the output must be a JSON envelope containing
+/// `result` (with `content` and `mime_type`) and `extraction_time_ms`.
+#[test]
+fn test_cli_real_extraction() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .args(["--format", "json"])
+        .args(["--output-format", "markdown"])
+        .args(["--config-json", r#"{"use_cache": false, "disable_ocr": true}"#])
+        .output()
+        .expect("Failed to execute full E2E extraction");
+
+    assert!(
+        run.status.success(),
+        "E2E extraction failed: {}",
+        String::from_utf8_lossy(&run.stderr)
+    );
+
+    let stdout_text = String::from_utf8_lossy(&run.stdout);
+    let envelope: serde_json::Value = serde_json::from_str(&stdout_text)
+        .unwrap_or_else(|_| panic!("E2E output should be valid JSON, got: {}", stdout_text));
+
+    // Envelope-level keys first, then the keys nested under `result`.
+    assert!(envelope.get("result").is_some(), "Missing 'result' envelope field");
+    assert!(
+        envelope.get("extraction_time_ms").is_some(),
+        "Missing 'extraction_time_ms' field"
+    );
+
+    let result = &envelope["result"];
+    assert!(result.get("content").is_some(), "Missing content field in result");
+    assert!(result.get("mime_type").is_some(), "Missing mime_type field in result");
+}
+
+// ============================================================================
+// Additional Edge Cases and Robustness Tests
+// ============================================================================
+
+/// An empty JSON object passed to `--config-json` must be accepted.
+#[test]
+fn test_cli_empty_config_json() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    let exit_status = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .args(["--config-json", "{}"])
+        .output()
+        .expect("Failed to execute with empty JSON config")
+        .status;
+
+    assert!(exit_status.success(), "Command with empty JSON config should succeed");
+}
+
+/// Passes an upper-case value to `--output-format` and checks that the
+/// process terminates normally.
+///
+/// Whether upper-case names are accepted is implementation-defined, so both
+/// success and a regular error exit are allowed; only termination by a
+/// signal is treated as a failure.
+#[test]
+fn test_cli_multiple_output_format_variants() {
+    build_binary();
+
+    let test_file = get_test_file("text/simple.txt");
+    if !PathBuf::from(&test_file).exists() {
+        eprintln!("Skipping test: {} not found", test_file);
+        return;
+    }
+
+    // Test case-insensitive format argument
+    let output = Command::new(get_binary_path())
+        .args([
+            "extract",
+            test_file.as_str(),
+            "--output-format",
+            "MARKDOWN", // uppercase should work or fail predictably
+        ])
+        .output()
+        .expect("Failed to execute");
+
+    // `code()` is `None` only when the process was terminated by a signal
+    // (Unix); any regular exit code counts as "failing gracefully".
+    assert!(
+        output.status.code().is_some(),
+        "Command should exit normally instead of being killed by a signal (stderr: {})",
+        String::from_utf8_lossy(&output.stderr)
+    );
+}
+
+/// Passes a multi-line, nested JSON document to `--config-json`; the command
+/// must either succeed or report something on stderr.
+#[test]
+fn test_cli_config_json_with_nested_objects() {
+    build_binary();
+
+    let input_path = get_test_file("text/simple.txt");
+    if !std::path::Path::new(&input_path).exists() {
+        eprintln!("Skipping test: {} not found", input_path);
+        return;
+    }
+
+    let nested_config = r#"
+{
+    "use_cache": false,
+    "chunking": {"max_chars": 512},
+    "language_detection": {
+        "enabled": true,
+        "confidence_threshold": 0.8
+    }
+}
+"#;
+
+    let run = Command::new(get_binary_path())
+        .arg("extract")
+        .arg(input_path.as_str())
+        .args(["--config-json", nested_config])
+        .output()
+        .expect("Failed to execute with nested JSON config");
+
+    let reported_error = !run.stderr.is_empty();
+    assert!(
+        run.status.success() || reported_error,
+        "Complex config should either work or provide error"
+    );
+}
--- a/crates/kreuzberg-cli/tests/extract_envelope.rs
+++ b/crates/kreuzberg-cli/tests/extract_envelope.rs
@@ -0,0 +1,237 @@
+//! Integration tests for the JSON timing envelope added to `kreuzberg extract` and
+//! `kreuzberg batch`.
+//!
+//! Verifies:
+//!  - `extract --format json` emits `{ result, extraction_time_ms }` shape
+//!  - `batch --format json` emits `{ results, total_ms, per_file_ms }` shape
+//!  - `result.metadata.ocr_used` exists as a bool field
+//!  - `--pdf-backend xyz` exits non-zero and mentions "pdf-oxide"
+
+use std::path::{Path, PathBuf};
+use std::process::Command;
+
+/// Locates the debug build of the `kreuzberg` executable.
+///
+/// The path is derived from this crate's manifest directory by walking up two
+/// levels and descending into `target/debug`. Nothing here checks that the
+/// file actually exists.
+fn kreuzberg_bin() -> PathBuf {
+    let crate_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let crates_dir = crate_dir.parent().expect("crates/kreuzberg-cli parent");
+    let workspace_root = crates_dir.parent().expect("crates parent");
+
+    let mut binary = workspace_root.to_path_buf();
+    binary.extend(["target", "debug", "kreuzberg"]);
+    binary
+}
+
+/// Path of the reference PDF (`test_documents/pdf/pdfa_001.pdf`) used by these tests.
+///
+/// Resolved two directory levels above this crate's manifest directory; the
+/// file is not checked for existence here.
+fn pdf_fixture() -> PathBuf {
+    let crate_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let crates_dir = crate_dir.parent().expect("crates/kreuzberg-cli parent");
+    let workspace_root = crates_dir.parent().expect("crates parent");
+
+    let mut fixture = workspace_root.to_path_buf();
+    fixture.extend(["test_documents", "pdf", "pdfa_001.pdf"]);
+    fixture
+}
+
+/// Path of the plain-text fixture (`test_documents/text/fake_text.txt`) used by the batch test.
+///
+/// Resolved two directory levels above this crate's manifest directory; the
+/// file is not checked for existence here.
+fn txt_fixture() -> PathBuf {
+    let crate_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let crates_dir = crate_dir.parent().expect("crates/kreuzberg-cli parent");
+    let workspace_root = crates_dir.parent().expect("crates parent");
+
+    let mut fixture = workspace_root.to_path_buf();
+    fixture.extend(["test_documents", "text", "fake_text.txt"]);
+    fixture
+}
+
+/// Builds the `kreuzberg` binary, running `cargo build` at most once per test process.
+///
+/// Every test calls this first. The `Once` makes repeated and concurrent calls
+/// wait for the first build and then return immediately, instead of each test
+/// spawning its own `cargo` invocation.
+///
+/// # Panics
+///
+/// Panics if `cargo` cannot be invoked or the build fails. A failed first
+/// attempt poisons the `Once`, so later callers panic as well instead of
+/// proceeding without a successful build.
+fn build_binary() {
+    static BUILD: std::sync::Once = std::sync::Once::new();
+
+    BUILD.call_once(|| {
+        let status = Command::new("cargo")
+            .args(["build", "--bin", "kreuzberg"])
+            .status()
+            .expect("cargo build invocation failed");
+        assert!(status.success(), "cargo build failed — binary unavailable");
+    });
+}
+
+/// Skip-guard for fixture-dependent tests.
+///
+/// Returns `true` only when `path` resolves to an existing regular file;
+/// missing paths and directories yield `false`.
+fn fixture_exists(path: &Path) -> bool {
+    // `is_file` already reports `false` for paths that do not exist.
+    path.is_file()
+}
+
+// ── extract --format json envelope ──────────────────────────────────────────
+
+/// Checks the JSON envelope printed by `extract --format json`.
+///
+/// Expects a `result` key, a strictly positive numeric `extraction_time_ms`,
+/// and a boolean `result.metadata.ocr_used` that is `false` for the PDF
+/// fixture. Skipped when the fixture is missing.
+#[test]
+fn test_extract_json_has_result_and_timing() {
+    build_binary();
+
+    let pdf = pdf_fixture();
+    if !fixture_exists(&pdf) {
+        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
+        return;
+    }
+
+    let pdf_arg = pdf.to_string_lossy();
+    let output = Command::new(kreuzberg_bin())
+        .arg("extract")
+        .arg(&*pdf_arg)
+        .args(["--format", "json"])
+        .output()
+        .expect("failed to run kreuzberg extract");
+
+    assert!(
+        output.status.success(),
+        "extract exited non-zero: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let envelope: serde_json::Value = serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");
+
+    // Top-level keys of the envelope.
+    assert!(envelope.get("result").is_some(), "missing 'result' key in envelope");
+    let elapsed_ms = envelope
+        .get("extraction_time_ms")
+        .and_then(serde_json::Value::as_f64)
+        .expect("'extraction_time_ms' must be a number");
+    assert!(
+        elapsed_ms > 0.0,
+        "extraction_time_ms must be positive, got {}",
+        elapsed_ms
+    );
+
+    // The OCR flag has to be present and typed as a boolean.
+    let ocr_field = envelope["result"]["metadata"]
+        .get("ocr_used")
+        .expect("'result.metadata.ocr_used' must be present");
+    let ocr_used = ocr_field
+        .as_bool()
+        .expect("'result.metadata.ocr_used' must be a boolean");
+
+    // No --force-ocr is passed, so a native-text PDF should not have gone through OCR.
+    assert!(!ocr_used, "expected ocr_used=false for native PDF extraction");
+}
+
+// ── batch --format json envelope ─────────────────────────────────────────────
+
+/// Checks the JSON envelope printed by `batch --format json` for two input files.
+///
+/// Expects `results` and `per_file_ms` arrays with one entry per input, a
+/// strictly positive `total_ms`, strictly positive per-file timings, and a
+/// boolean `metadata.ocr_used` on every result. Skipped when either fixture
+/// is missing.
+#[test]
+fn test_batch_json_has_results_and_timing() {
+    build_binary();
+
+    let pdf = pdf_fixture();
+    let txt = txt_fixture();
+    let have_fixtures = fixture_exists(&pdf) && fixture_exists(&txt);
+    if !have_fixtures {
+        eprintln!("SKIP: one or more batch fixtures not found");
+        return;
+    }
+
+    let pdf_arg = pdf.to_string_lossy();
+    let txt_arg = txt.to_string_lossy();
+    let output = Command::new(kreuzberg_bin())
+        .arg("batch")
+        .arg(&*pdf_arg)
+        .arg(&*txt_arg)
+        .args(["--format", "json"])
+        .output()
+        .expect("failed to run kreuzberg batch");
+
+    assert!(
+        output.status.success(),
+        "batch exited non-zero: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let envelope: serde_json::Value = serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");
+
+    // One result per input file.
+    let results = envelope
+        .get("results")
+        .and_then(serde_json::Value::as_array)
+        .expect("'results' must be an array");
+    assert_eq!(results.len(), 2, "expected 2 results for 2 input files");
+
+    // Overall wall-clock figure.
+    let total_ms = envelope
+        .get("total_ms")
+        .and_then(serde_json::Value::as_f64)
+        .expect("'total_ms' must be a number");
+    assert!(total_ms > 0.0, "total_ms must be positive, got {}", total_ms);
+
+    // One timing per input file, each strictly positive.
+    let per_file_ms = envelope
+        .get("per_file_ms")
+        .and_then(serde_json::Value::as_array)
+        .expect("'per_file_ms' must be an array");
+    assert_eq!(per_file_ms.len(), 2, "per_file_ms must have one entry per file");
+
+    per_file_ms.iter().enumerate().for_each(|(i, timing)| {
+        let ms = timing.as_f64().expect("per_file_ms entry must be a number");
+        assert!(ms > 0.0, "per_file_ms[{}] must be positive, got {}", i, ms);
+    });
+
+    // Every result carries a boolean OCR flag in its metadata.
+    results.iter().enumerate().for_each(|(i, result)| {
+        let flag = result["metadata"].get("ocr_used");
+        assert!(
+            matches!(flag, Some(serde_json::Value::Bool(_))),
+            "results[{}].metadata.ocr_used must be a bool",
+            i
+        );
+    });
+}
+
+// ── --pdf-backend validation ─────────────────────────────────────────────────
+
+/// Verifies that an unknown `--pdf-backend` value is rejected.
+///
+/// The command must exit unsuccessfully and its stderr must mention
+/// `pdf-oxide`. Skipped when the PDF fixture is missing.
+#[test]
+fn test_pdf_backend_invalid_value_exits_nonzero() {
+    build_binary();
+
+    let pdf = pdf_fixture();
+    if !fixture_exists(&pdf) {
+        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
+        return;
+    }
+
+    let pdf_arg = pdf.to_string_lossy();
+    let output = Command::new(kreuzberg_bin())
+        .arg("extract")
+        .arg(&*pdf_arg)
+        .args(["--pdf-backend", "xyz"])
+        .output()
+        .expect("failed to run kreuzberg extract");
+
+    let succeeded = output.status.success();
+    assert!(!succeeded, "expected non-zero exit for unknown --pdf-backend");
+
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    let mentions_backend = stderr.contains("pdf-oxide");
+    assert!(
+        mentions_backend,
+        "error message should mention 'pdf-oxide', got: {}",
+        stderr
+    );
+}
+
+/// Verifies that `--pdf-backend pdf-oxide` is accepted.
+///
+/// The command must succeed and print a JSON envelope containing both the
+/// `result` and `extraction_time_ms` keys. Skipped when the PDF fixture is
+/// missing.
+#[test]
+fn test_pdf_backend_valid_value_succeeds() {
+    build_binary();
+
+    let pdf = pdf_fixture();
+    if !fixture_exists(&pdf) {
+        eprintln!("SKIP: PDF fixture not found at {}", pdf.display());
+        return;
+    }
+
+    let pdf_arg = pdf.to_string_lossy();
+    let output = Command::new(kreuzberg_bin())
+        .arg("extract")
+        .arg(&*pdf_arg)
+        .args(["--pdf-backend", "pdf-oxide"])
+        .args(["--format", "json"])
+        .output()
+        .expect("failed to run kreuzberg extract");
+
+    assert!(
+        output.status.success(),
+        "--pdf-backend pdf-oxide should succeed: {}",
+        String::from_utf8_lossy(&output.stderr)
+    );
+
+    let envelope: serde_json::Value = serde_json::from_slice(&output.stdout).expect("stdout is not valid JSON");
+
+    // Only key presence is checked here; value types are covered by other tests in this file.
+    let required_keys = [
+        ("result", "missing 'result' key"),
+        ("extraction_time_ms", "missing 'extraction_time_ms'"),
+    ];
+    for (key, complaint) in required_keys {
+        assert!(envelope.get(key).is_some(), "{}", complaint);
+    }
+}
--- a/crates/kreuzberg-cli/tests/server_test.rs
+++ b/crates/kreuzberg-cli/tests/server_test.rs
@@ -0,0 +1,153 @@
+//! Integration tests for server commands (serve and mcp).
+
+#[cfg(not(coverage))]
+use std::process::{Command, Stdio};
+#[cfg(not(coverage))]
+use std::thread;
+#[cfg(not(coverage))]
+use std::time::Duration;
+
+/// Starts `kreuzberg serve` on 127.0.0.1:18000 and probes `/health` and `/info`.
+///
+/// Ignored by default. The binary is built with `--features all`, the server
+/// is given three seconds to come up, and then `/health` must report
+/// `status == "healthy"` with a string `version`, and `/info` must report a
+/// truthy `rust_backend`.
+///
+/// The server process is owned by a drop guard, so it is killed and reaped
+/// even when an assertion fails partway through; otherwise it would keep the
+/// port bound after the test.
+#[cfg(not(coverage))]
+#[test]
+#[ignore]
+fn test_serve_command_starts() {
+    /// Kills and reaps the wrapped child process when dropped.
+    struct ServerGuard(std::process::Child);
+
+    impl Drop for ServerGuard {
+        fn drop(&mut self) {
+            // Best effort: errors are ignored because the process may already
+            // have been stopped explicitly, and panicking in `drop` during an
+            // unwind would abort the test binary.
+            let _ = self.0.kill();
+            let _ = self.0.wait();
+        }
+    }
+
+    let status = Command::new("cargo")
+        .args(["build", "--bin", "kreuzberg", "--features", "all"])
+        .status()
+        .expect("Failed to build binary");
+
+    assert!(status.success(), "Failed to build kreuzberg binary");
+
+    let mut server = ServerGuard(
+        Command::new("./target/debug/kreuzberg")
+            .args(["serve", "-H", "127.0.0.1", "-p", "18000"])
+            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
+            .spawn()
+            .expect("Failed to start server"),
+    );
+
+    // Fixed grace period for the server to bind and start answering requests.
+    thread::sleep(Duration::from_secs(3));
+
+    let mut health_response = ureq::get("http://127.0.0.1:18000/health")
+        .call()
+        .expect("Failed to call health endpoint");
+
+    assert_eq!(health_response.status(), 200);
+
+    let health_json: serde_json::Value = health_response
+        .body_mut()
+        .read_json()
+        .expect("Failed to parse health response");
+
+    assert_eq!(health_json["status"], "healthy");
+    assert!(health_json["version"].is_string());
+
+    let mut info_response = ureq::get("http://127.0.0.1:18000/info")
+        .call()
+        .expect("Failed to call info endpoint");
+
+    assert_eq!(info_response.status(), 200);
+
+    let info_json: serde_json::Value = info_response
+        .body_mut()
+        .read_json()
+        .expect("Failed to parse info response");
+
+    assert!(info_json["rust_backend"].as_bool().unwrap_or(false));
+
+    // Happy path: stop the server explicitly so shutdown failures are reported.
+    server.0.kill().expect("Failed to kill server");
+    server.0.wait().expect("Failed to wait for server");
+}
+
+/// Starts `kreuzberg serve` on 127.0.0.1:18001 with a TOML config file and
+/// checks that `/health` answers.
+///
+/// Ignored by default. The config is written to `test_config.toml` in the
+/// current working directory. Both the config file and the server process are
+/// owned by drop guards, so neither is left behind when spawning fails or the
+/// health assertion panics.
+#[cfg(not(coverage))]
+#[test]
+#[ignore]
+fn test_serve_command_with_config() {
+    use std::fs;
+
+    const CONFIG_PATH: &str = "test_config.toml";
+
+    /// Removes the named file when dropped; a missing file is not an error.
+    struct RemoveFileOnDrop(&'static str);
+
+    impl Drop for RemoveFileOnDrop {
+        fn drop(&mut self) {
+            std::fs::remove_file(self.0).ok();
+        }
+    }
+
+    /// Kills and reaps the wrapped child process when dropped.
+    struct ServerGuard(std::process::Child);
+
+    impl Drop for ServerGuard {
+        fn drop(&mut self) {
+            // Best effort: the process may already have been stopped explicitly.
+            let _ = self.0.kill();
+            let _ = self.0.wait();
+        }
+    }
+
+    let config_content = r#"
+use_cache = true
+enable_quality_processing = true
+
+[ocr]
+backend = "tesseract"
+language = "eng"
+"#;
+
+    fs::write(CONFIG_PATH, config_content).expect("Failed to write test config");
+    // Declared before the server guard so the server is stopped first and the
+    // file removed afterwards (locals drop in reverse declaration order).
+    let _config_cleanup = RemoveFileOnDrop(CONFIG_PATH);
+
+    let mut server = ServerGuard(
+        Command::new("./target/debug/kreuzberg")
+            .args(["serve", "-H", "127.0.0.1", "-p", "18001", "-c", CONFIG_PATH])
+            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
+            .spawn()
+            .expect("Failed to start server"),
+    );
+
+    // Fixed grace period for the server to bind and start answering requests.
+    thread::sleep(Duration::from_secs(3));
+
+    let health_response = ureq::get("http://127.0.0.1:18001/health").call();
+
+    assert!(health_response.is_ok(), "Server should be running with custom config");
+
+    // Happy path: stop the server explicitly so shutdown failures are reported.
+    server.0.kill().expect("Failed to kill server");
+    server.0.wait().expect("Failed to wait for server");
+}
+
+/// Checks that `kreuzberg serve --help` succeeds and lists the expected options.
+///
+/// The binary is built with `--features all` first. Its location is derived
+/// from `CARGO_TARGET_TMPDIR` by keeping everything before the first
+/// occurrence of `target` and appending `target/debug/kreuzberg`.
+#[cfg(not(coverage))]
+#[test]
+fn test_serve_command_help() {
+    let build = Command::new("cargo")
+        .arg("build")
+        .args(["--bin", "kreuzberg"])
+        .args(["--features", "all"])
+        .status()
+        .expect("Failed to build binary");
+
+    assert!(build.success(), "Failed to build kreuzberg binary");
+
+    let binary_path = match env!("CARGO_TARGET_TMPDIR").split("target").next() {
+        Some(prefix) => format!("{}target/debug/kreuzberg", prefix),
+        None => "../target/debug/kreuzberg".to_string(),
+    };
+
+    let output = Command::new(&binary_path)
+        .arg("serve")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute command");
+
+    assert!(output.status.success());
+
+    // Help text is expected on stdout: the command summary plus each option.
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(stdout.contains("Start the API server"));
+    assert!(stdout.contains("--host"));
+    assert!(stdout.contains("--port"));
+    assert!(stdout.contains("--config"));
+}
+
+/// Checks that `kreuzberg mcp --help` succeeds and shows the expected text.
+///
+/// The binary is built with `--features all` first. Its location is derived
+/// from `CARGO_TARGET_TMPDIR` by keeping everything before the first
+/// occurrence of `target` and appending `target/debug/kreuzberg`.
+#[cfg(not(coverage))]
+#[test]
+fn test_mcp_command_help() {
+    let build = Command::new("cargo")
+        .arg("build")
+        .args(["--bin", "kreuzberg"])
+        .args(["--features", "all"])
+        .status()
+        .expect("Failed to build binary");
+
+    assert!(build.success(), "Failed to build kreuzberg binary");
+
+    let binary_path = match env!("CARGO_TARGET_TMPDIR").split("target").next() {
+        Some(prefix) => format!("{}target/debug/kreuzberg", prefix),
+        None => "../target/debug/kreuzberg".to_string(),
+    };
+
+    let output = Command::new(&binary_path)
+        .arg("mcp")
+        .arg("--help")
+        .output()
+        .expect("Failed to execute command");
+
+    assert!(output.status.success());
+
+    // Help text is expected on stdout: the command summary plus the config option.
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    assert!(stdout.contains("Start the MCP (Model Context Protocol) server"));
+    assert!(stdout.contains("--config"));
+}