121 lines
6.1 KiB
C#
121 lines
6.1 KiB
C#
|
|
// This file is auto-generated by alef — DO NOT EDIT.
|
||
|
|
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||
|
|
// To regenerate: alef generate
|
||
|
|
// To verify freshness: alef verify --exit-code
|
||
|
|
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||
|
|
|
||
|
|
using System;
|
||
|
|
using System.Collections.Generic;
|
||
|
|
using System.Linq;
|
||
|
|
using System.Net.Http;
|
||
|
|
using System.Text;
|
||
|
|
using System.Text.Json;
|
||
|
|
using System.Text.Json.Serialization;
|
||
|
|
using System.Threading.Tasks;
|
||
|
|
using Xunit;
|
||
|
|
using Kreuzberg;
|
||
|
|
using static Kreuzberg.KreuzbergLib;
|
||
|
|
|
||
|
|
namespace Kreuzberg
|
||
|
|
{
|
||
|
|
/// <summary>E2e tests for category: smoke.</summary>
/// <remarks>
/// Each test extracts one fixture document through <c>KreuzbergLib</c> and checks the
/// reported MIME type, a minimum content length, and (where specified) expected text.
/// Fixture paths are relative, so they resolve against the process working directory.
/// </remarks>
public class SmokeTests
{
    // Serializer options using snake_case enum names and omitting default values.
    // NOTE(review): not referenced by any test in this class; kept because the file is generated.
    private static readonly JsonSerializerOptions ConfigOptions = new() { Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) }, DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault };

    /// <summary>
    /// Asserts that <paramref name="content"/> contains at least one of
    /// <paramref name="candidates"/>, using ordinal (case-sensitive) matching.
    /// </summary>
    /// <param name="content">Extracted text to search.</param>
    /// <param name="candidates">Substrings of which at least one must be present.</param>
    private static void AssertContainsAny(string content, params string[] candidates)
    {
        Assert.True(
            candidates.Any(candidate => content.Contains(candidate, StringComparison.Ordinal)),
            "expected to contain at least one of the specified values");
    }

    /// <summary>Extracts a PNG from an in-memory byte array and checks the recognized text.</summary>
    [Fact]
    public async Task Test_OcrImagePng()
    {
        // OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.
        // Read the fixture asynchronously so the async test never blocks on file I/O.
        var imageBytes = await System.IO.File.ReadAllBytesAsync("images/test_hello_world.png");

        var result = await KreuzbergLib.ExtractBytesAsync(imageBytes, "image/png", ExtractionConfig.FromJson("{}"));

        // `?.` turns a null MIME type into a readable assertion failure instead of a NullReferenceException.
        Assert.Equal("image/png", result.MimeType?.Trim());
        Assert.True(result.Content.Length >= 1, "expected length >= 1");
        AssertContainsAny(result.Content.ToString(), "Hello", "World", "hello", "world");
    }

    /// <summary>Extracts a DOCX fixture and checks MIME type, length, and sample text.</summary>
    [Fact]
    public async Task Test_SmokeDocxBasic()
    {
        // Smoke test: DOCX with formatted text
        var result = await KreuzbergLib.ExtractFileAsync("docx/fake.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", new ExtractionConfig());

        Assert.Equal("application/vnd.openxmlformats-officedocument.wordprocessingml.document", result.MimeType?.Trim());
        Assert.True(result.Content.Length >= 20, "expected length >= 20");
        AssertContainsAny(result.Content.ToString(), "Lorem", "ipsum", "document", "text");
    }

    /// <summary>Extracts an HTML table fixture and checks MIME type, length, and sample text.</summary>
    [Fact]
    public async Task Test_SmokeHtmlBasic()
    {
        // Smoke test: HTML table extraction
        var result = await KreuzbergLib.ExtractFileAsync("html/simple_table.html", "text/html", new ExtractionConfig());

        Assert.Equal("text/html", result.MimeType?.Trim());
        Assert.True(result.Content.Length >= 10, "expected length >= 10");
        AssertContainsAny(result.Content.ToString(), "Sample Data Table", "Laptop", "Electronics", "Product");
    }

    /// <summary>Extracts a PNG with OCR disabled and checks only the reported MIME type.</summary>
    [Fact]
    public async Task Test_SmokeImagePng()
    {
        // Smoke test: PNG image (without OCR, metadata only)
        // The MIME type argument is null here; the assertion below expects "image/png" to be reported anyway.
        var result = await KreuzbergLib.ExtractFileAsync("images/sample.png", null, new ExtractionConfig { DisableOcr = true });

        Assert.Equal("image/png", result.MimeType?.Trim());
    }

    /// <summary>Extracts a JSON fixture and checks MIME type and minimum length.</summary>
    [Fact]
    public async Task Test_SmokeJsonBasic()
    {
        // Smoke test: JSON file extraction
        var result = await KreuzbergLib.ExtractFileAsync("json/simple.json", "application/json", new ExtractionConfig());

        Assert.Equal("application/json", result.MimeType?.Trim());
        Assert.True(result.Content.Length >= 5, "expected length >= 5");
    }

    /// <summary>Extracts a PDF fixture and checks MIME type, length, and sample text.</summary>
    [Fact]
    public async Task Test_SmokePdfBasic()
    {
        // Smoke test: PDF with simple text extraction
        var result = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", "application/pdf", new ExtractionConfig());

        Assert.Equal("application/pdf", result.MimeType?.Trim());
        Assert.True(result.Content.Length >= 50, "expected length >= 50");
        AssertContainsAny(result.Content.ToString(), "May 5, 2023", "To Whom it May Concern");
    }

    /// <summary>Extracts a plain-text fixture and checks MIME type and minimum length.</summary>
    [Fact]
    public async Task Test_SmokeTxtBasic()
    {
        // Smoke test: Plain text file
        var result = await KreuzbergLib.ExtractFileAsync("text/report.txt", "text/plain", new ExtractionConfig());

        Assert.Equal("text/plain", result.MimeType?.Trim());
        Assert.True(result.Content.Length >= 5, "expected length >= 5");
    }

    /// <summary>Extracts an XLSX fixture and checks MIME type, length, and required terms.</summary>
    [Fact]
    public async Task Test_SmokeXlsxBasic()
    {
        // Smoke test: XLSX with basic spreadsheet data including tables
        var result = await KreuzbergLib.ExtractFileAsync("xlsx/stanley_cups.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", new ExtractionConfig());

        Assert.Equal("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", result.MimeType?.Trim());
        Assert.True(result.Content.Length >= 100, "expected length >= 100");

        // Compare case-insensitively with ordinal rules rather than ToLower(): culture-sensitive
        // lowercasing (e.g. Turkish dotless i) would make terms such as "phi" fail to match.
        var content = result.Content.ToString();
        string[] requiredTerms =
        {
            "team",
            "location",
            "stanley cups",
            "blues",
            "flyers",
            "maple leafs",
            "stl",
            "phi",
            "tor",
        };
        foreach (var term in requiredTerms)
        {
            Assert.Contains(term, content, StringComparison.OrdinalIgnoreCase);
        }

        // skipped: field 'tables' not available on result type
        // skipped: field 'metadata.format.excel.sheet_count' not available on result type
        // skipped: field 'metadata.format.excel.sheet_names' not available on result type
    }
}
|
||
|
|
}
|