// crates/kreuzberg/tests/ocr_configuration.rs

//! OCR configuration integration tests.
//!
//! This module extensively tests Tesseract OCR configuration propagation
//! to ensure all settings are correctly passed through to the OCR engine.
//!
//! Test philosophy:
//! - Verify all TesseractConfig fields are propagated correctly
//! - Test different language settings with appropriate test files
//! - Test PSM (page segmentation mode) variations
//! - Test force_ocr mode
//! - Verify configuration changes actually affect output
//! - Test table detection with various settings

#![cfg(feature = "ocr")]

mod helpers;

use helpers::*;
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
use kreuzberg::extract_file_sync;
use kreuzberg::types::TesseractConfig;

/// Extracts `images/test_hello_world.png` with the `tesseract` backend and language `eng`.
///
/// Returns early when `skip_if_missing` reports true for the fixture. Otherwise the
/// extraction must succeed, the MIME type is checked against `image/png`, and both
/// `chunks` and `detected_languages` must be `None`.
#[test]
fn test_ocr_language_english() {
    let fixture = "images/test_hello_world.png";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract with English OCR");

    assert_mime_type(&extracted, "image/png");

    // No chunking or language-detection settings were supplied, so both stay unset.
    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/test_hello_world.png` with the `tesseract` backend and language `deu`.
///
/// Returns early when `skip_if_missing` reports true for the fixture. An extraction
/// error is tolerated: it is logged at debug level and the test ends without failing.
/// On success the MIME type is checked against `image/png`, and both `chunks` and
/// `detected_languages` must be `None`.
#[test]
fn test_ocr_language_german() {
    let fixture = "images/test_hello_world.png";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "deu".to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    // A failure here is not treated as a test failure; it is only logged.
    let extracted = match extract_file_sync(&source, None, &config) {
        Ok(value) => value,
        Err(err) => {
            tracing::debug!("German OCR failed (language pack may not be installed): {}", err);
            return;
        }
    };

    assert_mime_type(&extracted, "image/png");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/english_and_korean.png` with the combined language string `eng+kor`.
///
/// Returns early when `skip_if_missing` reports true for the fixture. An extraction
/// error is tolerated: it is logged at debug level and the test ends without failing.
/// On success the MIME type is checked against `image/png`, and both `chunks` and
/// `detected_languages` must be `None`.
#[test]
fn test_ocr_language_multiple() {
    let fixture = "images/english_and_korean.png";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng+kor".to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    // A failure here is not treated as a test failure; it is only logged.
    let extracted = match extract_file_sync(&source, None, &config) {
        Ok(value) => value,
        Err(err) => {
            tracing::debug!("Multi-language OCR failed (language pack may not be installed): {}", err);
            return;
        }
    };

    assert_mime_type(&extracted, "image/png");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/ocr_image.jpg` with `TesseractConfig::psm` set to 3 (auto).
///
/// Returns early when `skip_if_missing` reports true for the fixture. Otherwise the
/// extraction must succeed, the MIME type is checked against `image/jpeg`, and both
/// `chunks` and `detected_languages` must be `None`.
#[test]
fn test_ocr_psm_auto() {
    let fixture = "images/ocr_image.jpg";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        psm: 3,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract with PSM 3 (auto)");

    assert_mime_type(&extracted, "image/jpeg");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/ocr_image.jpg` with `TesseractConfig::psm` set to 6 (single block).
///
/// Returns early when `skip_if_missing` reports true for the fixture. Otherwise the
/// extraction must succeed, the MIME type is checked against `image/jpeg`, and both
/// `chunks` and `detected_languages` must be `None`.
#[test]
fn test_ocr_psm_single_block() {
    let fixture = "images/ocr_image.jpg";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        psm: 6,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract with PSM 6 (single block)");

    assert_mime_type(&extracted, "image/jpeg");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/test_hello_world.png` with `TesseractConfig::psm` set to 7 (single line).
///
/// Returns early when `skip_if_missing` reports true for the fixture. Otherwise the
/// extraction must succeed, the MIME type is checked against `image/png`, and both
/// `chunks` and `detected_languages` must be `None`.
#[test]
fn test_ocr_psm_single_line() {
    let fixture = "images/test_hello_world.png";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        psm: 7,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract with PSM 7 (single line)");

    assert_mime_type(&extracted, "image/png");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `pdf/fake_memo.pdf` with `force_ocr: true` and the `tesseract` backend.
///
/// Only compiled when the `pdf` feature is enabled. Returns early when
/// `skip_if_missing` reports true for the fixture. Otherwise the extraction must
/// succeed, the MIME type is checked against `application/pdf`, the content is checked
/// with `assert_non_empty_content`, `chunks` and `detected_languages` must be `None`,
/// and `metadata.format` must be populated.
#[test]
#[cfg(feature = "pdf")]
fn test_force_ocr_on_text_pdf() {
    if skip_if_missing("pdf/fake_memo.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdf/fake_memo.pdf");
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: true,
        ..Default::default()
    };

    let result = extract_file_sync(&file_path, None, &config).expect("Should extract with force_ocr enabled");

    assert_mime_type(&result, "application/pdf");
    assert_non_empty_content(&result);

    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(result.detected_languages.is_none(), "Language detection not enabled");

    // The whole test is already gated on the `pdf` feature, so this assertion needs
    // no `cfg` attribute of its own.
    assert!(result.metadata.format.is_some(), "PDF should have metadata");
}

/// Extracts `pdf/fake_memo.pdf` with `force_ocr: false` and the `tesseract` backend.
///
/// Only compiled when the `pdf` feature is enabled. Returns early when
/// `skip_if_missing` reports true for the fixture. Otherwise the extraction must
/// succeed, the MIME type is checked against `application/pdf`, the content is checked
/// with `assert_non_empty_content`, `chunks` and `detected_languages` must be `None`,
/// and `metadata.format` must be populated.
#[test]
#[cfg(feature = "pdf")]
fn test_force_ocr_disabled() {
    if skip_if_missing("pdf/fake_memo.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdf/fake_memo.pdf");
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        force_ocr: false,
        ..Default::default()
    };

    let result = extract_file_sync(&file_path, None, &config).expect("Should extract without forcing OCR");

    assert_mime_type(&result, "application/pdf");
    assert_non_empty_content(&result);

    assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(result.detected_languages.is_none(), "Language detection not enabled");

    // The whole test is already gated on the `pdf` feature, so this assertion needs
    // no `cfg` attribute of its own.
    assert!(result.metadata.format.is_some(), "PDF should have metadata");
}

/// Extracts `images/simple_table.png` with table detection switched on.
///
/// The Tesseract settings used are `enable_table_detection: true`,
/// `table_min_confidence: 0.5`, `table_column_threshold: 10` and
/// `table_row_threshold_ratio: 0.5`. Returns early when `skip_if_missing` reports true
/// for the fixture. Otherwise the extraction must succeed, the MIME type is checked
/// against `image/png`, and both `chunks` and `detected_languages` must be `None`.
#[test]
fn test_table_detection_enabled() {
    let fixture = "images/simple_table.png";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        enable_table_detection: true,
        table_min_confidence: 0.5,
        table_column_threshold: 10,
        table_row_threshold_ratio: 0.5,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract with table detection enabled");

    assert_mime_type(&extracted, "image/png");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/simple_table.png` with `enable_table_detection: false`.
///
/// Returns early when `skip_if_missing` reports true for the fixture. Otherwise the
/// extraction must succeed, the MIME type is checked against `image/png`, and both
/// `chunks` and `detected_languages` must be `None`.
#[test]
fn test_table_detection_disabled() {
    let fixture = "images/simple_table.png";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        enable_table_detection: false,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract with table detection disabled");

    assert_mime_type(&extracted, "image/png");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/ocr_image.jpg` with `language_model_ngram_on: true`.
///
/// Returns early when `skip_if_missing` reports true for the fixture. Otherwise the
/// extraction must succeed, the MIME type is checked against `image/jpeg`, and both
/// `chunks` and `detected_languages` must be `None`.
#[test]
fn test_language_model_ngram_configuration() {
    let fixture = "images/ocr_image.jpg";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        language_model_ngram_on: true,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract with ngram language model enabled");

    assert_mime_type(&extracted, "image/jpeg");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/ocr_image.jpg` with `tessedit_enable_dict_correction: true`.
///
/// Returns early when `skip_if_missing` reports true for the fixture. Otherwise the
/// extraction must succeed, the MIME type is checked against `image/jpeg`, and both
/// `chunks` and `detected_languages` must be `None`.
#[test]
fn test_dictionary_correction_enabled() {
    let fixture = "images/ocr_image.jpg";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        tessedit_enable_dict_correction: true,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract with dictionary correction enabled");

    assert_mime_type(&extracted, "image/jpeg");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/test_hello_world.png` with `tessedit_char_whitelist` set to the
/// ASCII upper- and lower-case letters plus the space character.
///
/// Returns early when `skip_if_missing` reports true for the fixture. Otherwise the
/// extraction must succeed, the MIME type is checked against `image/png`, and both
/// `chunks` and `detected_languages` must be `None`.
#[test]
fn test_character_whitelist() {
    let fixture = "images/test_hello_world.png";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        tessedit_char_whitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ".to_string(),
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract with character whitelist");

    assert_mime_type(&extracted, "image/png");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/ocr_image.jpg` twice with `use_cache: true` set on both the
/// `TesseractConfig` and the `ExtractionConfig`.
///
/// Returns early when `skip_if_missing` reports true for the fixture. Both extractions
/// must succeed; each result has its MIME type checked against `image/jpeg` and must
/// have `chunks` and `detected_languages` equal to `None`. The two results are not
/// compared with each other.
#[test]
fn test_ocr_cache_enabled() {
    let fixture = "images/ocr_image.jpg";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        use_cache: true,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        use_cache: true,
        ..Default::default()
    };

    let first = extract_file_sync(&source, None, &config)
        .expect("First extraction should succeed");
    let second = extract_file_sync(&source, None, &config)
        .expect("Second extraction should succeed (cached)");

    // MIME types of both runs are checked before any of the field assertions.
    for run in [&first, &second] {
        assert_mime_type(run, "image/jpeg");
    }

    for run in [&first, &second] {
        assert!(run.chunks.is_none(), "Chunks should be None without chunking config");
        assert!(run.detected_languages.is_none(), "Language detection not enabled");
    }
}

/// Extracts `images/ocr_image.jpg` with `use_cache: false` set on both the
/// `TesseractConfig` and the `ExtractionConfig`.
///
/// Returns early when `skip_if_missing` reports true for the fixture. Otherwise the
/// extraction must succeed, the MIME type is checked against `image/jpeg`, and both
/// `chunks` and `detected_languages` must be `None`.
#[test]
fn test_ocr_cache_disabled() {
    let fixture = "images/ocr_image.jpg";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        use_cache: false,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        use_cache: false,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract without caching");

    assert_mime_type(&extracted, "image/jpeg");

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}

/// Extracts `images/layout_parser_ocr.jpg` with several Tesseract settings combined.
///
/// The settings used are `psm: 3`, `enable_table_detection: true`,
/// `table_min_confidence: 0.7`, `language_model_ngram_on: true`,
/// `tessedit_enable_dict_correction: true` and `use_cache: true`, with `use_cache: true`
/// on the `ExtractionConfig` as well. Returns early when `skip_if_missing` reports true
/// for the fixture. Otherwise the extraction must succeed, the MIME type is checked
/// against `image/jpeg`, the content is checked with `assert_non_empty_content`, and
/// both `chunks` and `detected_languages` must be `None`.
#[test]
fn test_complex_configuration_combination() {
    let fixture = "images/layout_parser_ocr.jpg";
    if skip_if_missing(fixture) {
        return;
    }

    let source = get_test_file_path(fixture);
    let tesseract = TesseractConfig {
        psm: 3,
        enable_table_detection: true,
        table_min_confidence: 0.7,
        language_model_ngram_on: true,
        tessedit_enable_dict_correction: true,
        use_cache: true,
        ..Default::default()
    };
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: Some(tesseract),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        force_ocr: false,
        use_cache: true,
        ..Default::default()
    };

    let extracted = extract_file_sync(&source, None, &config)
        .expect("Should extract with complex configuration");

    assert_mime_type(&extracted, "image/jpeg");
    assert_non_empty_content(&extracted);

    assert!(extracted.chunks.is_none(), "Chunks should be None without chunking config");
    assert!(extracted.detected_languages.is_none(), "Language detection not enabled");
}
Nomad changes 2026-06-01 23:40:55 +02:00			`//! OCR configuration integration tests.`
			`//!`
			`//! This module extensively tests Tesseract OCR configuration propagation`
			`//! to ensure all settings are correctly passed through to the OCR engine.`
			`//!`
			`//! Test philosophy:`
			`//! - Verify all TesseractConfig fields are propagated correctly`
			`//! - Test different language settings with appropriate test files`
			`//! - Test PSM (page segmentation mode) variations`
			`//! - Test force_ocr mode`
			`//! - Verify configuration changes actually affect output`
			`//! - Test table detection with various settings`

			`#![cfg(feature = "ocr")]`

			`mod helpers;`

			`use helpers::*;`
			`use kreuzberg::core::config::{ExtractionConfig, OcrConfig};`
			`use kreuzberg::extract_file_sync;`
			`use kreuzberg::types::TesseractConfig;`

			`#[test]`
			`fn test_ocr_language_english() {`
			`if skip_if_missing("images/test_hello_world.png") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/test_hello_world.png");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract with English OCR");`

			`assert_mime_type(&result, "image/png");`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`fn test_ocr_language_german() {`
			`if skip_if_missing("images/test_hello_world.png") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/test_hello_world.png");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "deu".to_string(),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config);`

			`match result {`
			`Ok(extraction_result) => {`
			`assert_mime_type(&extraction_result, "image/png");`

			`assert!(`
			`extraction_result.chunks.is_none(),`
			`"Chunks should be None without chunking config"`
			`);`
			`assert!(`
			`extraction_result.detected_languages.is_none(),`
			`"Language detection not enabled"`
			`);`
			`}`
			`Err(e) => {`
			`tracing::debug!("German OCR failed (language pack may not be installed): {}", e);`
			`}`
			`}`
			`}`

			`#[test]`
			`fn test_ocr_language_multiple() {`
			`if skip_if_missing("images/english_and_korean.png") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/english_and_korean.png");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng+kor".to_string(),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config);`

			`match result {`
			`Ok(extraction_result) => {`
			`assert_mime_type(&extraction_result, "image/png");`

			`assert!(`
			`extraction_result.chunks.is_none(),`
			`"Chunks should be None without chunking config"`
			`);`
			`assert!(`
			`extraction_result.detected_languages.is_none(),`
			`"Language detection not enabled"`
			`);`
			`}`
			`Err(e) => {`
			`tracing::debug!("Multi-language OCR failed (language pack may not be installed): {}", e);`
			`}`
			`}`
			`}`

			`#[test]`
			`fn test_ocr_psm_auto() {`
			`if skip_if_missing("images/ocr_image.jpg") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/ocr_image.jpg");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`psm: 3,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract with PSM 3 (auto)");`

			`assert_mime_type(&result, "image/jpeg");`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`fn test_ocr_psm_single_block() {`
			`if skip_if_missing("images/ocr_image.jpg") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/ocr_image.jpg");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`psm: 6,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract with PSM 6 (single block)");`

			`assert_mime_type(&result, "image/jpeg");`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`fn test_ocr_psm_single_line() {`
			`if skip_if_missing("images/test_hello_world.png") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/test_hello_world.png");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`psm: 7,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract with PSM 7 (single line)");`

			`assert_mime_type(&result, "image/png");`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`#[cfg(feature = "pdf")]`
			`fn test_force_ocr_on_text_pdf() {`
			`if skip_if_missing("pdf/fake_memo.pdf") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("pdf/fake_memo.pdf");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`..Default::default()`
			`}),`
			`force_ocr: true,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract with force_ocr enabled");`

			`assert_mime_type(&result, "application/pdf");`
			`assert_non_empty_content(&result);`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`

			`#[cfg(feature = "pdf")]`
			`assert!(result.metadata.format.is_some(), "PDF should have metadata");`
			`}`

			`#[test]`
			`#[cfg(feature = "pdf")]`
			`fn test_force_ocr_disabled() {`
			`if skip_if_missing("pdf/fake_memo.pdf") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("pdf/fake_memo.pdf");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract without forcing OCR");`

			`assert_mime_type(&result, "application/pdf");`
			`assert_non_empty_content(&result);`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`

			`#[cfg(feature = "pdf")]`
			`assert!(result.metadata.format.is_some(), "PDF should have metadata");`
			`}`

			`#[test]`
			`fn test_table_detection_enabled() {`
			`if skip_if_missing("images/simple_table.png") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/simple_table.png");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`enable_table_detection: true,`
			`table_min_confidence: 0.5,`
			`table_column_threshold: 10,`
			`table_row_threshold_ratio: 0.5,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract with table detection enabled");`

			`assert_mime_type(&result, "image/png");`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`fn test_table_detection_disabled() {`
			`if skip_if_missing("images/simple_table.png") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/simple_table.png");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`enable_table_detection: false,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract with table detection disabled");`

			`assert_mime_type(&result, "image/png");`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`fn test_language_model_ngram_configuration() {`
			`if skip_if_missing("images/ocr_image.jpg") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/ocr_image.jpg");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`language_model_ngram_on: true,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result =`
			`extract_file_sync(&file_path, None, &config).expect("Should extract with ngram language model enabled");`

			`assert_mime_type(&result, "image/jpeg");`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`fn test_dictionary_correction_enabled() {`
			`if skip_if_missing("images/ocr_image.jpg") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/ocr_image.jpg");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`tessedit_enable_dict_correction: true,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result =`
			`extract_file_sync(&file_path, None, &config).expect("Should extract with dictionary correction enabled");`

			`assert_mime_type(&result, "image/jpeg");`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`fn test_character_whitelist() {`
			`if skip_if_missing("images/test_hello_world.png") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/test_hello_world.png");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`tessedit_char_whitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ".to_string(),`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract with character whitelist");`

			`assert_mime_type(&result, "image/png");`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`fn test_ocr_cache_enabled() {`
			`if skip_if_missing("images/ocr_image.jpg") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/ocr_image.jpg");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`use_cache: true,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`use_cache: true,`
			`..Default::default()`
			`};`

			`let result1 = extract_file_sync(&file_path, None, &config).expect("First extraction should succeed");`
			`let result2 = extract_file_sync(&file_path, None, &config).expect("Second extraction should succeed (cached)");`

			`assert_mime_type(&result1, "image/jpeg");`
			`assert_mime_type(&result2, "image/jpeg");`

			`assert!(`
			`result1.chunks.is_none(),`
			`"Chunks should be None without chunking config"`
			`);`
			`assert!(result1.detected_languages.is_none(), "Language detection not enabled");`
			`assert!(`
			`result2.chunks.is_none(),`
			`"Chunks should be None without chunking config"`
			`);`
			`assert!(result2.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`fn test_ocr_cache_disabled() {`
			`if skip_if_missing("images/ocr_image.jpg") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/ocr_image.jpg");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`use_cache: false,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`use_cache: false,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract without caching");`

			`assert_mime_type(&result, "image/jpeg");`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`

			`#[test]`
			`fn test_complex_configuration_combination() {`
			`if skip_if_missing("images/layout_parser_ocr.jpg") {`
			`return;`
			`}`

			`let file_path = get_test_file_path("images/layout_parser_ocr.jpg");`
			`let config = ExtractionConfig {`
			`ocr: Some(OcrConfig {`
			`backend: "tesseract".to_string(),`
			`language: "eng".to_string(),`
			`tesseract_config: Some(TesseractConfig {`
			`psm: 3,`
			`enable_table_detection: true,`
			`table_min_confidence: 0.7,`
			`language_model_ngram_on: true,`
			`tessedit_enable_dict_correction: true,`
			`use_cache: true,`
			`..Default::default()`
			`}),`
			`..Default::default()`
			`}),`
			`force_ocr: false,`
			`use_cache: true,`
			`..Default::default()`
			`};`

			`let result = extract_file_sync(&file_path, None, &config).expect("Should extract with complex configuration");`

			`assert_mime_type(&result, "image/jpeg");`
			`assert_non_empty_content(&result);`

			`assert!(result.chunks.is_none(), "Chunks should be None without chunking config");`
			`assert!(result.detected_languages.is_none(), "Language detection not enabled");`
			`}`