This commit is contained in:
338
e2e/go/contract_test.go
generated
Normal file
338
e2e/go/contract_test.go
generated
Normal file
@@ -0,0 +1,338 @@
|
||||
// This file is auto-generated by alef — DO NOT EDIT.
|
||||
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
// To regenerate: alef generate
|
||||
// To verify freshness: alef verify --exit-code
|
||||
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
|
||||
// E2e tests for category: contract
|
||||
package e2e_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
kreuzberg "github.com/kreuzberg-dev/kreuzberg/v5"
|
||||
)
|
||||
|
||||
func Test_ApiBatchBytesAsync(t *testing.T) {
|
||||
// Tests async batch bytes extraction API (batch_extract_bytes)
|
||||
result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
{
|
||||
found := false
|
||||
if strings.Contains(string(result.Content), `May 5, 2023`) { found = true }
|
||||
if strings.Contains(string(result.Content), `Mallori`) { found = true }
|
||||
if !found {
|
||||
t.Errorf("expected to contain at least one of the specified values")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func Test_ApiBatchBytesWithConfigsAsync(t *testing.T) {
|
||||
// Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"output_format":"markdown"}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
// skipped: field 'metadata.output_format' not available on result type
|
||||
}
|
||||
|
||||
func Test_ApiBatchFileAsync(t *testing.T) {
|
||||
// Tests async batch file extraction API (batch_extract_file)
|
||||
result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
{
|
||||
found := false
|
||||
if strings.Contains(string(result.Content), `May 5, 2023`) { found = true }
|
||||
if strings.Contains(string(result.Content), `Mallori`) { found = true }
|
||||
if !found {
|
||||
t.Errorf("expected to contain at least one of the specified values")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func Test_ApiBatchFileWithConfigsAsync(t *testing.T) {
|
||||
// Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"output_format":"markdown"}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
// skipped: field 'metadata.output_format' not available on result type
|
||||
}
|
||||
|
||||
func Test_ApiExtractBytesAsync(t *testing.T) {
|
||||
// Tests async bytes extraction API (extract_bytes)
|
||||
result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
{
|
||||
found := false
|
||||
if strings.Contains(string(result.Content), `May 5, 2023`) { found = true }
|
||||
if strings.Contains(string(result.Content), `Mallori`) { found = true }
|
||||
if !found {
|
||||
t.Errorf("expected to contain at least one of the specified values")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func Test_ApiExtractFileAsync(t *testing.T) {
|
||||
// Tests async file extraction API (extract_file)
|
||||
result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, kreuzberg.ExtractionConfig{})
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
{
|
||||
found := false
|
||||
if strings.Contains(string(result.Content), `May 5, 2023`) { found = true }
|
||||
if strings.Contains(string(result.Content), `Mallori`) { found = true }
|
||||
if !found {
|
||||
t.Errorf("expected to contain at least one of the specified values")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func Test_ConfigChunkingPrependHeadingContext(t *testing.T) {
|
||||
// Tests markdown chunker prepends heading hierarchy to chunk content
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"chunking":{"chunker_type":"markdown","max_chars":300,"max_overlap":50,"prepend_heading_context":true}}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFileSync(`markdown/extraction_test.md`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
// skipped: field 'chunks' not available on result type
|
||||
assert.True(t, func() bool { chunks := result.Chunks; if chunks == nil { return false }; for _, c := range chunks { if c.Content == "" { return false } }; return true }(), "expected true")
|
||||
assert.True(t, func() bool { chunks := result.Chunks; if chunks == nil { return false }; for _, c := range chunks { if c.Metadata.HeadingContext == nil { return false } }; return true }(), "expected true")
|
||||
assert.True(t, func() bool { chunks := result.Chunks; if chunks == nil || len(chunks) == 0 { return false }; return chunks[0].Metadata.HeadingContext != nil }(), "expected true")
|
||||
}
|
||||
|
||||
func Test_ConfigDocumentStructureWithHeadings(t *testing.T) {
|
||||
// Tests document structure with DOCX heading-driven nesting
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"include_document_structure":true}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFileSync(`docx/fake.docx`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/vnd.openxmlformats-officedocument.wordprocessingml.document` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
// skipped: field 'document' not available on result type
|
||||
// skipped: field 'document.nodes' not available on result type
|
||||
}
|
||||
|
||||
func Test_ConfigElementTypes(t *testing.T) {
|
||||
// Tests element-based result format with element type assertions on DOCX
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"result_format":"element_based"}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFileSync(`docx/unit_test_headers.docx`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
{
|
||||
found := false
|
||||
if strings.Contains(string(result.MimeType), `application/vnd.openxmlformats-officedocument.wordprocessingml.document`) { found = true }
|
||||
if !found {
|
||||
t.Errorf("expected to contain at least one of the specified values")
|
||||
}
|
||||
}
|
||||
// skipped: field 'elements' not available on result type
|
||||
}
|
||||
|
||||
func Test_ConfigExtractionTimeout(t *testing.T) {
|
||||
// Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"extraction_timeout_secs":300}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFileSync(`pdf/fake_memo.pdf`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
}
|
||||
|
||||
func Test_ConfigKeywords(t *testing.T) {
|
||||
// Tests keyword extraction via YAKE algorithm
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"keywords":{"algorithm":"yake","max_keywords":10}}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFileSync(`pdf/fake_memo.pdf`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
// skipped: field 'keywords' not available on Go ExtractionResult
|
||||
// skipped: field 'keywords' not available on Go ExtractionResult
|
||||
}
|
||||
|
||||
func Test_ConfigPages(t *testing.T) {
|
||||
// Tests page extraction and page marker configuration
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"pages":{"extract_pages":true,"insert_page_markers":true}}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFileSync(`pdf/fake_memo.pdf`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
{
|
||||
found := false
|
||||
if strings.Contains(string(result.Content), `PAGE`) { found = true }
|
||||
if !found {
|
||||
t.Errorf("expected to contain at least one of the specified values")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func Test_ConfigQualityEnabled(t *testing.T) {
|
||||
// Tests quality scoring produces a score value in [0.0, 1.0]
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"enable_quality_processing":true}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFileSync(`pdf/fake_memo.pdf`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
// skipped: field 'quality_score' not available on result type
|
||||
// skipped: field 'quality_score' not available on result type
|
||||
// skipped: field 'quality_score' not available on result type
|
||||
}
|
||||
|
||||
func Test_ConfigSecurityLimits(t *testing.T) {
|
||||
// Tests archive extraction with custom security limits
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"security_limits":{"max_archive_size":104857600,"max_compression_ratio":50,"max_files_in_archive":100}}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFileSync(`archives/documents.zip`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
{
|
||||
found := false
|
||||
if strings.Contains(string(result.MimeType), `application/zip`) { found = true }
|
||||
if strings.Contains(string(result.MimeType), `application/x-zip-compressed`) { found = true }
|
||||
if !found {
|
||||
t.Errorf("expected to contain at least one of the specified values")
|
||||
}
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
}
|
||||
|
||||
func Test_ConfigTreeSitter(t *testing.T) {
|
||||
// Tests tree-sitter configuration round-trip
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"tree_sitter":{"groups":["web"],"languages":["python","rust"],"process":{"comments":false,"diagnostics":false,"docstrings":false,"exports":true,"imports":true,"structure":true,"symbols":false}}}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFileSync(`code/hello.py`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `text/x-source-code` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 5, "expected length >= 5")
|
||||
}
|
||||
|
||||
func Test_OutputFormatBytesMarkdown(t *testing.T) {
|
||||
// Tests markdown output format via bytes extraction API
|
||||
contentBytes, contentBytesErr := os.ReadFile(`pdf/fake_memo.pdf`)
|
||||
if contentBytesErr != nil {
|
||||
t.Fatalf("read fixture pdf/fake_memo.pdf: %v", contentBytesErr)
|
||||
}
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"output_format":"markdown"}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractBytesSync(contentBytes, `application/pdf`, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
// skipped: field 'metadata.output_format' not available on result type
|
||||
}
|
||||
|
||||
func Test_OutputFormatMarkdown(t *testing.T) {
|
||||
// Tests Markdown output format
|
||||
var config kreuzberg.ExtractionConfig
|
||||
if err := json.Unmarshal([]byte(`{"output_format":"markdown"}`), &config); err != nil {
|
||||
t.Fatalf("config parse failed: %v", err)
|
||||
}
|
||||
result, err := kreuzberg.ExtractFileSync(`pdf/fake_memo.pdf`, nil, config)
|
||||
if err != nil {
|
||||
t.Fatalf("call failed: %v", err)
|
||||
}
|
||||
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
||||
t.Errorf("equals mismatch: got %v", result.MimeType)
|
||||
}
|
||||
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
||||
// skipped: field 'metadata.output_format' not available on result type
|
||||
}
|
||||
Reference in New Issue
Block a user