// This file is auto-generated by alef — DO NOT EDIT. // alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75 // To regenerate: alef generate // To verify freshness: alef verify --exit-code // Issues & docs: https://github.com/kreuzberg-dev/alef // E2e tests for category: smoke package e2e_test import ( "encoding/json" "os" "strings" "testing" "github.com/stretchr/testify/assert" kreuzberg "github.com/kreuzberg-dev/kreuzberg/v5" ) func Test_OcrImagePng(t *testing.T) { // OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge. contentBytes, contentBytesErr := os.ReadFile(`images/test_hello_world.png`) if contentBytesErr != nil { t.Fatalf("read fixture images/test_hello_world.png: %v", contentBytesErr) } result, err := kreuzberg.ExtractBytes(contentBytes, `image/png`, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `image/png` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 1, "expected length >= 1") { found := false if strings.Contains(string(result.Content), `Hello`) { found = true } if strings.Contains(string(result.Content), `World`) { found = true } if strings.Contains(string(result.Content), `hello`) { found = true } if strings.Contains(string(result.Content), `world`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } } func Test_SmokeDocxBasic(t *testing.T) { // Smoke test: DOCX with formatted text mime_typeVal := `application/vnd.openxmlformats-officedocument.wordprocessingml.document` result, err := kreuzberg.ExtractFile(`docx/fake.docx`, &mime_typeVal, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/vnd.openxmlformats-officedocument.wordprocessingml.document` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 20, "expected length >= 20") { found := false if strings.Contains(string(result.Content), `Lorem`) { found = true } if strings.Contains(string(result.Content), `ipsum`) { found = true } if strings.Contains(string(result.Content), `document`) { found = true } if strings.Contains(string(result.Content), `text`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } } func Test_SmokeHtmlBasic(t *testing.T) { // Smoke test: HTML table extraction mime_typeVal := `text/html` result, err := kreuzberg.ExtractFile(`html/simple_table.html`, &mime_typeVal, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `text/html` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10") { found := false if strings.Contains(string(result.Content), `Sample Data Table`) { found = true } if strings.Contains(string(result.Content), `Laptop`) { found = true } if strings.Contains(string(result.Content), `Electronics`) { found = true } if strings.Contains(string(result.Content), `Product`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } } func Test_SmokeImagePng(t *testing.T) { // Smoke test: PNG image (without OCR, metadata only) var config kreuzberg.ExtractionConfig if err := json.Unmarshal([]byte(`{"disable_ocr":true}`), &config); err != nil { t.Fatalf("config parse failed: %v", err) } result, err := kreuzberg.ExtractFile(`images/sample.png`, nil, config) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `image/png` { t.Errorf("equals mismatch: got %v", result.MimeType) } } func Test_SmokeJsonBasic(t *testing.T) { // Smoke test: JSON file extraction mime_typeVal := `application/json` result, err := kreuzberg.ExtractFile(`json/simple.json`, &mime_typeVal, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/json` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 5, "expected length >= 5") } func Test_SmokePdfBasic(t *testing.T) { // Smoke test: PDF with simple text extraction mime_typeVal := `application/pdf` result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, &mime_typeVal, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/pdf` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 50, "expected length >= 50") { found := false if strings.Contains(string(result.Content), `May 5, 2023`) { found = true } if strings.Contains(string(result.Content), `To Whom it May Concern`) { found = true } if !found { t.Errorf("expected to contain at least one of the specified values") } } } func Test_SmokeTxtBasic(t *testing.T) { // Smoke test: Plain text file mime_typeVal := `text/plain` result, err := kreuzberg.ExtractFile(`text/report.txt`, &mime_typeVal, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `text/plain` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 5, "expected length >= 5") } func Test_SmokeXlsxBasic(t *testing.T) { // Smoke test: XLSX with basic spreadsheet data including tables mime_typeVal := `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` result, err := kreuzberg.ExtractFile(`xlsx/stanley_cups.xlsx`, &mime_typeVal, kreuzberg.ExtractionConfig{}) if err != nil { t.Fatalf("call failed: %v", err) } if strings.TrimSpace(string(result.MimeType)) != `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` { t.Errorf("equals mismatch: got %v", result.MimeType) } assert.GreaterOrEqual(t, len(result.Content), 100, "expected length >= 100") if !strings.Contains(string(result.Content), `Team`) { t.Errorf("expected to contain %s", `Team`) } if !strings.Contains(string(result.Content), `Location`) { t.Errorf("expected to contain %s", `Location`) } if !strings.Contains(string(result.Content), `Stanley Cups`) { t.Errorf("expected to contain %s", `Stanley Cups`) } if !strings.Contains(string(result.Content), `Blues`) { t.Errorf("expected to contain %s", `Blues`) } if !strings.Contains(string(result.Content), `Flyers`) { t.Errorf("expected to contain %s", `Flyers`) } if !strings.Contains(string(result.Content), `Maple Leafs`) { t.Errorf("expected to contain %s", `Maple Leafs`) } if !strings.Contains(string(result.Content), `STL`) { t.Errorf("expected to contain %s", `STL`) } if !strings.Contains(string(result.Content), `PHI`) { t.Errorf("expected to contain %s", `PHI`) } if !strings.Contains(string(result.Content), `TOR`) { t.Errorf("expected to contain %s", `TOR`) } // skipped: field 'tables' not available on result type // skipped: field 'metadata.format.excel.sheet_count' not available on result type // skipped: field 'metadata.format.excel.sheet_names' not available on result type }