// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Threading.Tasks;
using Xunit;
using Kreuzberg;
using static Kreuzberg.KreuzbergLib;

namespace Kreuzberg
{
    /// <summary>
    /// E2e tests for category: contract.
    /// </summary>
    /// <remarks>
    /// Each test extracts a fixture document through <c>KreuzbergLib</c> and checks the MIME type
    /// and/or the extracted content. Fixture paths are relative, so they resolve against the
    /// process working directory.
    /// NOTE(review): several fixtures describe batch/bytes APIs, but the generated bodies for
    /// the <c>Test_Api*Async</c> tests all call <c>ExtractFileAsync</c> — confirm this is intended.
    /// </remarks>
    public class ContractTests
    {
        // Serializer settings emitted by the generator; no test in this class reads them.
        private static readonly JsonSerializerOptions ConfigOptions = new()
        {
            Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
            DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault
        };

        /// <summary>
        /// Asserts that <paramref name="text"/> contains at least one of
        /// <paramref name="candidates"/>, compared ordinally (case-sensitive).
        /// </summary>
        /// <param name="text">The string to search.</param>
        /// <param name="candidates">Substrings of which at least one must be present.</param>
        private static void AssertContainsAny(string text, params string[] candidates)
        {
            Assert.True(
                candidates.Any(candidate => text.Contains(candidate, StringComparison.Ordinal)),
                "expected to contain at least one of the specified values");
        }

        [Fact]
        public async Task Test_ApiBatchBytesAsync()
        {
            // Tests async batch bytes extraction API (batch_extract_bytes)
            var result = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", null, null);

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            AssertContainsAny(result.Content, "May 5, 2023", "Mallori");
        }

        [Fact]
        public async Task Test_ApiBatchBytesWithConfigsAsync()
        {
            // Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)
            var result = await KreuzbergLib.ExtractFileAsync(
                "pdf/fake_memo.pdf",
                null,
                new ExtractionConfig { OutputFormat = OutputFormat.Markdown });

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            // skipped: field 'metadata.output_format' not available on result type
        }

        [Fact]
        public async Task Test_ApiBatchFileAsync()
        {
            // Tests async batch file extraction API (batch_extract_file)
            var result = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", null, null);

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            AssertContainsAny(result.Content, "May 5, 2023", "Mallori");
        }

        [Fact]
        public async Task Test_ApiBatchFileWithConfigsAsync()
        {
            // Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)
            var result = await KreuzbergLib.ExtractFileAsync(
                "pdf/fake_memo.pdf",
                null,
                new ExtractionConfig { OutputFormat = OutputFormat.Markdown });

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            // skipped: field 'metadata.output_format' not available on result type
        }

        [Fact]
        public async Task Test_ApiExtractBytesAsync()
        {
            // Tests async bytes extraction API (extract_bytes)
            var result = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", null, null);

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            AssertContainsAny(result.Content, "May 5, 2023", "Mallori");
        }

        [Fact]
        public async Task Test_ApiExtractFileAsync()
        {
            // Tests async file extraction API (extract_file)
            var result = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", null, null);

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            AssertContainsAny(result.Content, "May 5, 2023", "Mallori");
        }

        [Fact]
        public void Test_ConfigChunkingPrependHeadingContext()
        {
            // Tests markdown chunker prepends heading hierarchy to chunk content
            var result = KreuzbergLib.ExtractFileSync(
                "markdown/extraction_test.md",
                null,
                ExtractionConfig.FromJson("{\"chunking\":{\"chunker_type\":\"markdown\",\"max_chars\":300,\"max_overlap\":50,\"prepend_heading_context\":true}}"));

            Assert.True(result.Content.Length >= 10, "expected length >= 10");

            // skipped: field 'chunks' not available on result type
            // The skip marker above was previously fused onto the next assertion with no line
            // break, which turned that assertion into comment text so it never ran.
            // NOTE(review): presumably the generator template omits a newline after this skip
            // marker — fix it there as well, since this file is regenerated.
            var chunks = result.Chunks ?? new();
            Assert.True(chunks.All(c => !string.IsNullOrEmpty(c.Content)));
            Assert.True(chunks.All(c => c.Metadata?.HeadingContext != null));
            // All() is vacuously true for an empty list; this last check also fails when no
            // chunk was produced at all.
            Assert.True(chunks.FirstOrDefault()?.Metadata?.HeadingContext != null);
        }

        [Fact]
        public void Test_ConfigDocumentStructureWithHeadings()
        {
            // Tests document structure with DOCX heading-driven nesting
            var result = KreuzbergLib.ExtractFileSync(
                "docx/fake.docx",
                null,
                ExtractionConfig.FromJson("{\"include_document_structure\":true}"));

            Assert.Equal("application/vnd.openxmlformats-officedocument.wordprocessingml.document", result.MimeType!.Trim());
            // skipped: field 'document' not available on result type
            // skipped: field 'document.nodes' not available on result type
        }

        [Fact]
        public void Test_ConfigElementTypes()
        {
            // Tests element-based result format with element type assertions on DOCX
            var result = KreuzbergLib.ExtractFileSync(
                "docx/unit_test_headers.docx",
                null,
                ExtractionConfig.FromJson("{\"result_format\":\"element_based\"}"));

            AssertContainsAny(
                result.MimeType!,
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
            // skipped: field 'elements' not available on result type
        }

        [Fact]
        public void Test_ConfigExtractionTimeout()
        {
            // Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions
            var result = KreuzbergLib.ExtractFileSync(
                "pdf/fake_memo.pdf",
                null,
                ExtractionConfig.FromJson("{\"extraction_timeout_secs\":300}"));

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
        }

        [Fact]
        public void Test_ConfigKeywords()
        {
            // Tests keyword extraction via YAKE algorithm
            var result = KreuzbergLib.ExtractFileSync(
                "pdf/fake_memo.pdf",
                null,
                ExtractionConfig.FromJson("{\"keywords\":{\"algorithm\":\"yake\",\"max_keywords\":10}}"));

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            // skipped: field 'keywords' not available on C# ExtractionResult
            // skipped: field 'keywords' not available on C# ExtractionResult
        }

        [Fact]
        public void Test_ConfigPages()
        {
            // Tests page extraction and page marker configuration
            var result = KreuzbergLib.ExtractFileSync(
                "pdf/fake_memo.pdf",
                null,
                ExtractionConfig.FromJson("{\"pages\":{\"extract_pages\":true,\"insert_page_markers\":true}}"));

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            AssertContainsAny(result.Content, "PAGE");
        }

        [Fact]
        public void Test_ConfigQualityEnabled()
        {
            // Tests quality scoring produces a score value in [0.0, 1.0]
            var result = KreuzbergLib.ExtractFileSync(
                "pdf/fake_memo.pdf",
                null,
                ExtractionConfig.FromJson("{\"enable_quality_processing\":true}"));

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            // skipped: field 'quality_score' not available on result type
            // skipped: field 'quality_score' not available on result type
            // skipped: field 'quality_score' not available on result type
        }

        [Fact]
        public void Test_ConfigSecurityLimits()
        {
            // Tests archive extraction with custom security limits
            var result = KreuzbergLib.ExtractFileSync(
                "archives/documents.zip",
                null,
                ExtractionConfig.FromJson("{\"security_limits\":{\"max_archive_size\":104857600,\"max_compression_ratio\":50,\"max_files_in_archive\":100}}"));

            AssertContainsAny(result.MimeType!, "application/zip", "application/x-zip-compressed");
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
        }

        [Fact]
        public void Test_ConfigTreeSitter()
        {
            // Tests tree-sitter configuration round-trip
            var result = KreuzbergLib.ExtractFileSync(
                "code/hello.py",
                null,
                ExtractionConfig.FromJson("{\"tree_sitter\":{\"groups\":[\"web\"],\"languages\":[\"python\",\"rust\"],\"process\":{\"comments\":false,\"diagnostics\":false,\"docstrings\":false,\"exports\":true,\"imports\":true,\"structure\":true,\"symbols\":false}}}"));

            Assert.Equal("text/x-source-code", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 5, "expected length >= 5");
        }

        [Fact]
        public void Test_OutputFormatBytesMarkdown()
        {
            // Tests markdown output format via bytes extraction API
            var result = KreuzbergLib.ExtractBytesSync(
                System.IO.File.ReadAllBytes("pdf/fake_memo.pdf"),
                "application/pdf",
                new ExtractionConfig { OutputFormat = OutputFormat.Markdown });

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            // skipped: field 'metadata.output_format' not available on result type
        }

        [Fact]
        public void Test_OutputFormatMarkdown()
        {
            // Tests Markdown output format
            var result = KreuzbergLib.ExtractFileSync(
                "pdf/fake_memo.pdf",
                null,
                ExtractionConfig.FromJson("{\"output_format\":\"markdown\"}"));

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            // skipped: field 'metadata.output_format' not available on result type
        }
    }
}