//! Regression tests for issue 1004: nested list content duplication. //! //! Two bugs tracked together: //! //! 1. html-to-markdown-rs emitted malformed/duplicated Markdown for nested //! `ul > li > ul > li > ol` HTML structures. //! Fixed in html-to-markdown-rs 3.5.0 (kreuzberg-dev/html-to-markdown#385). //! //! 2. The Markdown chunker panicked on the malformed Markdown from (1). //! The underflow originated in `sep.start - offset` in //! `text_splitter/splitter.rs`. //! //! Fix contract: //! - `chunk_text` with `ChunkerType::Markdown` MUST NOT PANIC on any input. //! - HTML extraction of deeply nested mixed lists MUST NOT duplicate content. #![cfg(feature = "chunking")] use kreuzberg::chunking::{ChunkerType, ChunkingConfig, chunk_text}; /// Malformed Markdown produced by html-to-markdown on a `ul > li > ul > li > ol` /// HTML snippet before the fix. Passing this to the chunker triggered an /// integer underflow → panic. const MALFORMED_NESTED_LIST_MD: &str = "\ Title * 1. Item 1 2. Item 2 3. Item 3 4. Item 4 5. Item 5 6. Item 6 7. Item 7 8. Item 8 9. Item 9 10. Item 10 11. Item 11 1. Item 1 2. Item 2 3. Item 3 Item 1 Item 2 Item 3 "; fn markdown_chunk_config() -> ChunkingConfig { ChunkingConfig { max_characters: 200, overlap: 0, chunker_type: ChunkerType::Markdown, ..Default::default() } } /// The chunker must return Ok or Err — never panic — on any input string. #[test] fn markdown_chunker_never_panics_on_malformed_nested_list() { let result = chunk_text(MALFORMED_NESTED_LIST_MD, &markdown_chunk_config(), None); // We don't assert the content — just that no panic occurred. // Both Ok and Err are acceptable; a panic is not. let _ = result; } /// Chunker must not panic on completely empty input. #[test] fn markdown_chunker_empty_input_returns_empty_chunks() { let result = chunk_text("", &markdown_chunk_config(), None); assert!(result.is_ok()); assert!(result.unwrap().chunks.is_empty()); } /// Chunker must not panic on single-character input; Ok or Err is acceptable. #[test] fn markdown_chunker_single_char_input() { let result = chunk_text("x", &markdown_chunk_config(), None); let _ = result; } /// Chunker must not panic on input consisting only of newlines; Ok or Err is acceptable. #[test] fn markdown_chunker_only_newlines() { let result = chunk_text("\n\n\n\n", &markdown_chunk_config(), None); let _ = result; } /// Chunker must not panic on a valid deeply-nested Markdown list (the /// well-formed counterpart of the malformed input above). #[test] fn markdown_chunker_valid_nested_list() { let valid_nested = "\ - outer 1 - mid 1 1. inner 1 2. inner 2 - mid 2 - outer 2 "; let result = chunk_text(valid_nested, &markdown_chunk_config(), None); assert!(result.is_ok(), "valid nested list must chunk without error"); } #[cfg(feature = "html")] mod html_extraction { use kreuzberg::chunking::{ChunkerType, ChunkingConfig}; use kreuzberg::core::config::{ExtractionConfig, OutputFormat}; use kreuzberg::core::extractor::extract_bytes_sync; fn config_with_markdown_chunking() -> ExtractionConfig { ExtractionConfig { output_format: OutputFormat::Markdown, chunking: Some(ChunkingConfig { max_characters: 300, chunker_type: ChunkerType::Markdown, ..Default::default() }), ..Default::default() } } /// Extracting the nested-list HTML must not panic and must not duplicate /// list item content in the extraction result. #[test] fn html_nested_list_extraction_no_panic() { let html = b"