// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Threading.Tasks;
using Xunit;
using Kreuzberg;
using static Kreuzberg.KreuzbergLib;

namespace Kreuzberg
{
    /// <summary>
    /// E2e tests for category: smoke.
    /// </summary>
    public class SmokeTests
    {
        // Serializer options emitted by the generator: enums are written as snake_case
        // strings and default-valued members are omitted. No test in this class
        // references the field; it is kept so the generated surface stays unchanged.
        private static readonly JsonSerializerOptions ConfigOptions = new()
        {
            Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
            DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault
        };

        /// <summary>
        /// Asserts that <paramref name="content"/> contains at least one of
        /// <paramref name="candidates"/>, using a case-sensitive ordinal match.
        /// </summary>
        /// <param name="content">Extracted text to search.</param>
        /// <param name="candidates">Substrings of which at least one must be present.</param>
        private static void AssertContainsAny(string content, params string[] candidates)
        {
            Assert.True(
                candidates.Any(candidate => content.Contains(candidate, StringComparison.Ordinal)),
                "expected to contain at least one of the specified values");
        }

        /// <summary>
        /// OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array
        /// bridge parameter and Promise await in the generated OcrBackend bridge.
        /// </summary>
        [Fact]
        public async Task Test_OcrImagePng()
        {
            // Read the fixture asynchronously so the async test never blocks on file I/O.
            var imageBytes = await System.IO.File.ReadAllBytesAsync("images/test_hello_world.png");

            var result = await KreuzbergLib.ExtractBytesAsync(imageBytes, "image/png", ExtractionConfig.FromJson("{}"));

            Assert.Equal("image/png", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 1, "expected length >= 1");
            var content = result.Content.ToString();
            AssertContainsAny(content, "Hello", "World", "hello", "world");
        }

        /// <summary>
        /// Smoke test: DOCX with formatted text.
        /// </summary>
        [Fact]
        public async Task Test_SmokeDocxBasic()
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "docx/fake.docx",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                new ExtractionConfig());

            Assert.Equal("application/vnd.openxmlformats-officedocument.wordprocessingml.document", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 20, "expected length >= 20");
            var content = result.Content.ToString();
            AssertContainsAny(content, "Lorem", "ipsum", "document", "text");
        }

        /// <summary>
        /// Smoke test: HTML table extraction.
        /// </summary>
        [Fact]
        public async Task Test_SmokeHtmlBasic()
        {
            var result = await KreuzbergLib.ExtractFileAsync("html/simple_table.html", "text/html", new ExtractionConfig());

            Assert.Equal("text/html", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 10, "expected length >= 10");
            var content = result.Content.ToString();
            AssertContainsAny(content, "Sample Data Table", "Laptop", "Electronics", "Product");
        }

        /// <summary>
        /// Smoke test: PNG image (without OCR, metadata only).
        /// </summary>
        [Fact]
        public async Task Test_SmokeImagePng()
        {
            // No MIME type is passed (null); the assertion checks the one reported on the result.
            var result = await KreuzbergLib.ExtractFileAsync("images/sample.png", null, new ExtractionConfig { DisableOcr = true });

            Assert.Equal("image/png", result.MimeType!.Trim());
        }

        /// <summary>
        /// Smoke test: JSON file extraction.
        /// </summary>
        [Fact]
        public async Task Test_SmokeJsonBasic()
        {
            var result = await KreuzbergLib.ExtractFileAsync("json/simple.json", "application/json", new ExtractionConfig());

            Assert.Equal("application/json", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 5, "expected length >= 5");
        }

        /// <summary>
        /// Smoke test: PDF with simple text extraction.
        /// </summary>
        [Fact]
        public async Task Test_SmokePdfBasic()
        {
            var result = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", "application/pdf", new ExtractionConfig());

            Assert.Equal("application/pdf", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 50, "expected length >= 50");
            var content = result.Content.ToString();
            AssertContainsAny(content, "May 5, 2023", "To Whom it May Concern");
        }

        /// <summary>
        /// Smoke test: Plain text file.
        /// </summary>
        [Fact]
        public async Task Test_SmokeTxtBasic()
        {
            var result = await KreuzbergLib.ExtractFileAsync("text/report.txt", "text/plain", new ExtractionConfig());

            Assert.Equal("text/plain", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 5, "expected length >= 5");
        }

        /// <summary>
        /// Smoke test: XLSX with basic spreadsheet data including tables.
        /// </summary>
        [Fact]
        public async Task Test_SmokeXlsxBasic()
        {
            var result = await KreuzbergLib.ExtractFileAsync(
                "xlsx/stanley_cups.xlsx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                new ExtractionConfig());

            Assert.Equal("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", result.MimeType!.Trim());
            Assert.True(result.Content.Length >= 100, "expected length >= 100");

            // Case-insensitive ordinal matching instead of ToLower(): the current-culture
            // lowercasing would make the outcome depend on the machine's locale
            // (e.g. "PHI" lowercases to "phı" under a Turkish culture).
            var content = result.Content.ToString();
            Assert.Contains("team", content, StringComparison.OrdinalIgnoreCase);
            Assert.Contains("location", content, StringComparison.OrdinalIgnoreCase);
            Assert.Contains("stanley cups", content, StringComparison.OrdinalIgnoreCase);
            Assert.Contains("blues", content, StringComparison.OrdinalIgnoreCase);
            Assert.Contains("flyers", content, StringComparison.OrdinalIgnoreCase);
            Assert.Contains("maple leafs", content, StringComparison.OrdinalIgnoreCase);
            Assert.Contains("stl", content, StringComparison.OrdinalIgnoreCase);
            Assert.Contains("phi", content, StringComparison.OrdinalIgnoreCase);
            Assert.Contains("tor", content, StringComparison.OrdinalIgnoreCase);
            // skipped: field 'tables' not available on result type
            // skipped: field 'metadata.format.excel.sheet_count' not available on result type
            // skipped: field 'metadata.format.excel.sheet_names' not available on result type
        }
    }
}