204 lines
11 KiB
C#
Generated
204 lines
11 KiB
C#
Generated
// This file is auto-generated by alef — DO NOT EDIT.
|
|
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
|
// To regenerate: alef generate
|
|
// To verify freshness: alef verify --exit-code
|
|
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Net.Http;
|
|
using System.Text;
|
|
using System.Text.Json;
|
|
using System.Text.Json.Serialization;
|
|
using System.Threading.Tasks;
|
|
using Xunit;
|
|
using Kreuzberg;
|
|
using static Kreuzberg.KreuzbergLib;
|
|
|
|
namespace Kreuzberg
|
|
{
|
|
/// <summary>E2e tests for category: contract.</summary>
|
|
public class ContractTests
|
|
{
|
|
/// <summary>
/// Shared JSON serializer options: enums are converted as snake_case-lower strings and
/// members holding their default value are omitted when writing.
/// </summary>
/// <remarks>Not referenced by any test visible in this class.</remarks>
private static readonly JsonSerializerOptions ConfigOptions = new()
{
    DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
    Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
};
|
|
|
|
/// <summary>Contract case for the async batch bytes extraction API (batch_extract_bytes).</summary>
[Fact]
public async Task Test_ApiBatchBytesAsync()
{
    // NOTE(review): this rendering calls ExtractFileAsync with a file path rather than a
    // batch/bytes entry point — confirm that is the intended generator mapping.
    var extraction = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", null, null);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    // Either marker string is enough to accept the extracted text.
    var text = extraction.Content.ToString();
    var hasExpectedText = text.Contains("May 5, 2023") || text.Contains("Mallori");
    Assert.True(hasExpectedText, "expected to contain at least one of the specified values");
}
|
|
|
|
/// <summary>
/// Contract case for async batch bytes extraction with per-file configs
/// (batch_extract_bytes with file_configs parameter).
/// </summary>
[Fact]
public async Task Test_ApiBatchBytesWithConfigsAsync()
{
    // NOTE(review): this rendering calls ExtractFileAsync with a single config rather than a
    // batch/bytes entry point — confirm that is the intended generator mapping.
    var markdownConfig = new ExtractionConfig { OutputFormat = OutputFormat.Markdown };
    var extraction = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", null, markdownConfig);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    // skipped: field 'metadata.output_format' not available on result type
}
|
|
|
|
/// <summary>Contract case for the async batch file extraction API (batch_extract_file).</summary>
[Fact]
public async Task Test_ApiBatchFileAsync()
{
    // NOTE(review): this rendering calls the single-file ExtractFileAsync rather than a
    // batch entry point — confirm that is the intended generator mapping.
    var extraction = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", null, null);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    // Either marker string is enough to accept the extracted text.
    var text = extraction.Content.ToString();
    var hasExpectedText = text.Contains("May 5, 2023") || text.Contains("Mallori");
    Assert.True(hasExpectedText, "expected to contain at least one of the specified values");
}
|
|
|
|
/// <summary>
/// Contract case for async batch file extraction with per-file configs
/// (batch_extract_files with file_configs parameter).
/// </summary>
[Fact]
public async Task Test_ApiBatchFileWithConfigsAsync()
{
    // NOTE(review): this rendering calls the single-file ExtractFileAsync with one config
    // rather than a batch entry point — confirm that is the intended generator mapping.
    var markdownConfig = new ExtractionConfig { OutputFormat = OutputFormat.Markdown };
    var extraction = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", null, markdownConfig);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    // skipped: field 'metadata.output_format' not available on result type
}
|
|
|
|
/// <summary>Contract case for the async bytes extraction API (extract_bytes).</summary>
[Fact]
public async Task Test_ApiExtractBytesAsync()
{
    // NOTE(review): this rendering calls ExtractFileAsync with a file path rather than a
    // bytes entry point — confirm that is the intended generator mapping.
    var extraction = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", null, null);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    // Either marker string is enough to accept the extracted text.
    var text = extraction.Content.ToString();
    var hasExpectedText = text.Contains("May 5, 2023") || text.Contains("Mallori");
    Assert.True(hasExpectedText, "expected to contain at least one of the specified values");
}
|
|
|
|
/// <summary>Contract case for the async file extraction API (extract_file).</summary>
[Fact]
public async Task Test_ApiExtractFileAsync()
{
    var extraction = await KreuzbergLib.ExtractFileAsync("pdf/fake_memo.pdf", null, null);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    // Either marker string is enough to accept the extracted text.
    var text = extraction.Content.ToString();
    var hasExpectedText = text.Contains("May 5, 2023") || text.Contains("Mallori");
    Assert.True(hasExpectedText, "expected to contain at least one of the specified values");
}
|
|
|
|
/// <summary>
/// Contract case: the markdown chunker prepends the heading hierarchy to chunk content.
/// Extracts markdown/extraction_test.md with prepend_heading_context enabled and checks
/// that every chunk has non-empty content and a heading context.
/// </summary>
[Fact]
public void Test_ConfigChunkingPrependHeadingContext()
{
    var result = KreuzbergLib.ExtractFileSync("markdown/extraction_test.md", null, ExtractionConfig.FromJson("{\"chunking\":{\"chunker_type\":\"markdown\",\"max_chars\":300,\"max_overlap\":50,\"prepend_heading_context\":true}}"));

    Assert.True(result.Content.Length >= 10, "expected length >= 10");

    // skipped: field 'chunks' not available on result type
    // NOTE(review): the skip marker above contradicts the assertions below, which do read
    // result.Chunks — confirm which generator assertion was actually skipped.

    // This assertion was previously fused onto the end of the skip comment, so it was
    // commented out and never ran. It now sits on its own line.
    Assert.True((result.Chunks ?? new()).All(c => !string.IsNullOrEmpty(c.Content)));

    Assert.True((result.Chunks ?? new()).All(c => c.Metadata?.HeadingContext != null));

    // Fails when there are no chunks at all, so the All(...) checks above cannot be the
    // only thing passing on an empty chunk list.
    Assert.True((result.Chunks ?? new()).FirstOrDefault()?.Metadata?.HeadingContext != null);
}
|
|
|
|
/// <summary>
/// Contract case for document structure with DOCX heading-driven nesting.
/// Only the MIME type is asserted here; the structure assertions are skipped.
/// </summary>
[Fact]
public void Test_ConfigDocumentStructureWithHeadings()
{
    const string configJson = "{\"include_document_structure\":true}";
    var config = ExtractionConfig.FromJson(configJson);

    var extraction = KreuzbergLib.ExtractFileSync("docx/fake.docx", null, config);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/vnd.openxmlformats-officedocument.wordprocessingml.document", mimeType);

    // skipped: field 'document' not available on result type
    // skipped: field 'document.nodes' not available on result type
}
|
|
|
|
/// <summary>
/// Contract case for the element-based result format on DOCX.
/// Only the MIME type is asserted here; the element assertions are skipped.
/// </summary>
[Fact]
public void Test_ConfigElementTypes()
{
    const string configJson = "{\"result_format\":\"element_based\"}";
    var config = ExtractionConfig.FromJson(configJson);

    var extraction = KreuzbergLib.ExtractFileSync("docx/unit_test_headers.docx", null, config);

    // Substring match (not equality) on the reported MIME type.
    var mimeType = extraction.MimeType.ToString();
    var isDocx = mimeType.Contains("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    Assert.True(isDocx, "expected to contain at least one of the specified values");

    // skipped: field 'elements' not available on result type
}
|
|
|
|
/// <summary>
/// Contract case: the extraction_timeout_secs config field is accepted and a fast
/// extraction still returns the expected MIME type and content.
/// </summary>
[Fact]
public void Test_ConfigExtractionTimeout()
{
    const string configJson = "{\"extraction_timeout_secs\":300}";
    var config = ExtractionConfig.FromJson(configJson);

    var extraction = KreuzbergLib.ExtractFileSync("pdf/fake_memo.pdf", null, config);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");
}
|
|
|
|
/// <summary>
/// Contract case for keyword extraction via the YAKE algorithm.
/// Only MIME type and content length are asserted; the keyword assertions are skipped.
/// </summary>
[Fact]
public void Test_ConfigKeywords()
{
    const string configJson = "{\"keywords\":{\"algorithm\":\"yake\",\"max_keywords\":10}}";
    var config = ExtractionConfig.FromJson(configJson);

    var extraction = KreuzbergLib.ExtractFileSync("pdf/fake_memo.pdf", null, config);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    // skipped: field 'keywords' not available on C# ExtractionResult
    // skipped: field 'keywords' not available on C# ExtractionResult
}
|
|
|
|
/// <summary>
/// Contract case for page extraction and page marker configuration: with
/// insert_page_markers enabled the extracted content must contain "PAGE".
/// </summary>
[Fact]
public void Test_ConfigPages()
{
    const string configJson = "{\"pages\":{\"extract_pages\":true,\"insert_page_markers\":true}}";
    var config = ExtractionConfig.FromJson(configJson);

    var extraction = KreuzbergLib.ExtractFileSync("pdf/fake_memo.pdf", null, config);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    var text = extraction.Content.ToString();
    var hasPageMarker = text.Contains("PAGE");
    Assert.True(hasPageMarker, "expected to contain at least one of the specified values");
}
|
|
|
|
/// <summary>
/// Contract case for quality scoring (score expected in [0.0, 1.0]).
/// Only MIME type and content length are asserted; the quality_score assertions are skipped.
/// </summary>
[Fact]
public void Test_ConfigQualityEnabled()
{
    const string configJson = "{\"enable_quality_processing\":true}";
    var config = ExtractionConfig.FromJson(configJson);

    var extraction = KreuzbergLib.ExtractFileSync("pdf/fake_memo.pdf", null, config);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    // skipped: field 'quality_score' not available on result type
    // skipped: field 'quality_score' not available on result type
    // skipped: field 'quality_score' not available on result type
}
|
|
|
|
/// <summary>Contract case for archive extraction with custom security limits.</summary>
[Fact]
public void Test_ConfigSecurityLimits()
{
    const string configJson = "{\"security_limits\":{\"max_archive_size\":104857600,\"max_compression_ratio\":50,\"max_files_in_archive\":100}}";
    var config = ExtractionConfig.FromJson(configJson);

    var extraction = KreuzbergLib.ExtractFileSync("archives/documents.zip", null, config);

    // Either ZIP MIME spelling is accepted.
    var mimeType = extraction.MimeType.ToString();
    var isZip = mimeType.Contains("application/zip") || mimeType.Contains("application/x-zip-compressed");
    Assert.True(isZip, "expected to contain at least one of the specified values");

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");
}
|
|
|
|
/// <summary>
/// Contract case for the tree-sitter configuration round-trip: extracts code/hello.py with
/// a tree_sitter config and checks the MIME type and a minimum content length.
/// </summary>
[Fact]
public void Test_ConfigTreeSitter()
{
    const string configJson = "{\"tree_sitter\":{\"groups\":[\"web\"],\"languages\":[\"python\",\"rust\"],\"process\":{\"comments\":false,\"diagnostics\":false,\"docstrings\":false,\"exports\":true,\"imports\":true,\"structure\":true,\"symbols\":false}}}";
    var config = ExtractionConfig.FromJson(configJson);

    var extraction = KreuzbergLib.ExtractFileSync("code/hello.py", null, config);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("text/x-source-code", mimeType);

    Assert.True(extraction.Content.Length >= 5, "expected length >= 5");
}
|
|
|
|
/// <summary>
/// Contract case for the markdown output format via the bytes extraction API: reads
/// pdf/fake_memo.pdf into memory and passes the bytes to ExtractBytesSync.
/// </summary>
[Fact]
public void Test_OutputFormatBytesMarkdown()
{
    var pdfBytes = System.IO.File.ReadAllBytes("pdf/fake_memo.pdf");
    var markdownConfig = new ExtractionConfig { OutputFormat = OutputFormat.Markdown };

    var extraction = KreuzbergLib.ExtractBytesSync(pdfBytes, "application/pdf", markdownConfig);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    // skipped: field 'metadata.output_format' not available on result type
}
|
|
|
|
/// <summary>
/// Contract case for the Markdown output format.
/// Only MIME type and content length are asserted; the output_format metadata assertion is skipped.
/// </summary>
[Fact]
public void Test_OutputFormatMarkdown()
{
    const string configJson = "{\"output_format\":\"markdown\"}";
    var config = ExtractionConfig.FromJson(configJson);

    var extraction = KreuzbergLib.ExtractFileSync("pdf/fake_memo.pdf", null, config);

    var mimeType = extraction.MimeType!.Trim();
    Assert.Equal("application/pdf", mimeType);

    Assert.True(extraction.Content.Length >= 10, "expected length >= 10");

    // skipped: field 'metadata.output_format' not available on result type
}
|
|
|
|
|
|
}
|
|
}
|