// This file is auto-generated by alef — DO NOT EDIT. // alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 // To regenerate: alef generate // To verify freshness: alef verify --exit-code // Issues & docs: https://github.com/kreuzberg-dev/alef // E2e tests for category: contract package e2e_test import ( "encoding/json" "os" "strings" "testing" "github.com/stretchr/testify/assert" kreuzberg "github.com/kreuzberg-dev/kreuzberg/v5" ) func Test_ApiBatchBytesAsync(t *testing.T) { // Tests async batch bytes extraction API (batch_extract_bytes) result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") { found := false if strings.Contains(string(result.Content), `May 5, 2023`) { found = true } if strings.Contains(string(result.Content), `Mallori`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } } func Test_ApiBatchBytesWithConfigsAsync(t *testing.T) { // Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter) var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"output_format":"markdown"}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") // skipped: field 'metadata.output_format' not available on result type } func Test_ApiBatchFileAsync(t *testing.T) { // Tests async batch file extraction API (batch_extract_file) result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") { found := false if strings.Contains(string(result.Content), `May 5, 2023`) { found = true } if strings.Contains(string(result.Content), `Mallori`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } } func Test_ApiBatchFileWithConfigsAsync(t *testing.T) { // Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter) var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"output_format":"markdown"}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") // skipped: field 'metadata.output_format' not available on result type } func Test_ApiExtractBytesAsync(t *testing.T) { // Tests async bytes extraction API (extract_bytes) result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") { found := false if strings.Contains(string(result.Content), `May 5, 2023`) { found = true } if strings.Contains(string(result.Content), `Mallori`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } } func Test_ApiExtractFileAsync(t *testing.T) { // Tests async file extraction API (extract_file) result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, nil, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") { found := false if strings.Contains(string(result.Content), `May 5, 2023`) { found = true } if strings.Contains(string(result.Content), `Mallori`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } } func Test_ConfigChunkingPrependHeadingContext(t *testing.T) { // Tests markdown chunker prepends heading hierarchy to chunk content var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"chunking":{"chunker_type":"markdown","max_chars":300,"max_overlap":50,"prepend_heading_context":true}}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFileSync(`markdown/extraction_test.md`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") // skipped: field 'chunks' not available on result type assert.True(t, func() bool { chunks := result.Chunks; if chunks == nil { return false }; for _, c := range chunks { if c.Content == "" { return false } }; return true }(), "expected true") assert.True(t, func() bool { chunks := result.Chunks; if chunks == nil { return false }; for _, c := range chunks { if c.Metadata.HeadingContext == nil { return false } }; return true }(), "expected true") assert.True(t, func() bool { chunks := result.Chunks; if chunks == nil || len(chunks) == 0 { return false }; return chunks[0].Metadata.HeadingContext != nil }(), "expected true") } func Test_ConfigDocumentStructureWithHeadings(t *testing.T) { // Tests document structure with DOCX heading-driven nesting var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"include_document_structure":true}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFileSync(`docx/fake.docx`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/vnd.openxmlformats-officedocument.wordprocessingml.document` { t.Errorf("equals mismatch: got %v", result.MimeType) } // skipped: field 'document' not available on result type // skipped: field 'document.nodes' not available on result type } func Test_ConfigElementTypes(t *testing.T) { // Tests element-based result format with element type assertions on DOCX var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"result_format":"element_based"}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFileSync(`docx/unit_test_headers.docx`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } { found := false if strings.Contains(string(result.MimeType), `application/vnd.openxmlformats-officedocument.wordprocessingml.document`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } // skipped: field 'elements' not available on result type } func Test_ConfigExtractionTimeout(t *testing.T) { // Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"extraction_timeout_secs":300}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFileSync(`pdf/fake_memo.pdf`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") } func Test_ConfigKeywords(t *testing.T) { // Tests keyword extraction via YAKE algorithm var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"keywords":{"algorithm":"yake","max_keywords":10}}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFileSync(`pdf/fake_memo.pdf`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") // skipped: field 'keywords' not available on Go ExtractionResult // skipped: field 'keywords' not available on Go ExtractionResult } func Test_ConfigPages(t *testing.T) { // Tests page extraction and page marker configuration var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"pages":{"extract_pages":true,"insert_page_markers":true}}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFileSync(`pdf/fake_memo.pdf`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") { found := false if strings.Contains(string(result.Content), `PAGE`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } } func Test_ConfigQualityEnabled(t *testing.T) { // Tests quality scoring produces a score value in [0.0, 1.0] var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"enable_quality_processing":true}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFileSync(`pdf/fake_memo.pdf`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") // skipped: field 'quality_score' not available on result type // skipped: field 'quality_score' not available on result type // skipped: field 'quality_score' not available on result type } func Test_ConfigSecurityLimits(t *testing.T) { // Tests archive extraction with custom security limits var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"security_limits":{"max_archive_size":104857600,"max_compression_ratio":50,"max_files_in_archive":100}}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFileSync(`archives/documents.zip`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } { found := false if strings.Contains(string(result.MimeType), `application/zip`) { found = true } if strings.Contains(string(result.MimeType), `application/x-zip-compressed`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") } func Test_ConfigTreeSitter(t *testing.T) { // Tests tree-sitter configuration round-trip var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"tree_sitter":{"groups":["web"],"languages":["python","rust"],"process":{"comments":false,"diagnostics":false,"docstrings":false,"exports":true,"imports":true,"structure":true,"symbols":false}}}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFileSync(`code/hello.py`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `text/x-source-code` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 5, "expected length >= 5") } func Test_OutputFormatBytesMarkdown(t *testing.T) { // Tests markdown output format via bytes extraction API contentBytes, contentBytesErr := os.ReadFile(`pdf/fake_memo.pdf`) if contentBytesErr != nil { t.Fatalf("read fixture pdf/fake_memo.pdf: %v", contentBytesErr) } var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"output_format":"markdown"}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractBytesSync(contentBytes, `application/pdf`, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") // skipped: field 'metadata.output_format' not available on result type } func Test_OutputFormatMarkdown(t *testing.T) { // Tests Markdown output format var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"output_format":"markdown"}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFileSync(`pdf/fake_memo.pdf`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") // skipped: field 'metadata.output_format' not available on result type }