// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef

// E2e tests for category: smoke
package e2e_test
|
|
|
|
import (
|
|
"encoding/json"
|
|
"os"
|
|
"strings"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
|
|
kreuzberg "github.com/kreuzberg-dev/kreuzberg/v5"
|
|
)
|
|
|
|
func Test_OcrImagePng(t *testing.T) {
|
|
// OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.
|
|
contentBytes, contentBytesErr := os.ReadFile(`images/test_hello_world.png`)
|
|
if contentBytesErr != nil {
|
|
t.Fatalf("read fixture images/test_hello_world.png: %v", contentBytesErr)
|
|
}
|
|
result, err := kreuzberg.ExtractBytes(contentBytes, `image/png`, kreuzberg.ExtractionConfig{})
|
|
if err != nil {
|
|
t.Fatalf("call failed: %v", err)
|
|
}
|
|
if strings.TrimSpace(string(result.MimeType)) != `image/png` {
|
|
t.Errorf("equals mismatch: got %v", result.MimeType)
|
|
}
|
|
assert.GreaterOrEqual(t, len(result.Content), 1, "expected length >= 1")
|
|
{
|
|
found := false
|
|
if strings.Contains(string(result.Content), `Hello`) { found = true }
|
|
if strings.Contains(string(result.Content), `World`) { found = true }
|
|
if strings.Contains(string(result.Content), `hello`) { found = true }
|
|
if strings.Contains(string(result.Content), `world`) { found = true }
|
|
if !found {
|
|
t.Errorf("expected to contain at least one of the specified values")
|
|
}
|
|
}
|
|
}
|
|
|
|
func Test_SmokeDocxBasic(t *testing.T) {
|
|
// Smoke test: DOCX with formatted text
|
|
mime_typeVal := `application/vnd.openxmlformats-officedocument.wordprocessingml.document`
|
|
result, err := kreuzberg.ExtractFile(`docx/fake.docx`, &mime_typeVal, kreuzberg.ExtractionConfig{})
|
|
if err != nil {
|
|
t.Fatalf("call failed: %v", err)
|
|
}
|
|
if strings.TrimSpace(string(result.MimeType)) != `application/vnd.openxmlformats-officedocument.wordprocessingml.document` {
|
|
t.Errorf("equals mismatch: got %v", result.MimeType)
|
|
}
|
|
assert.GreaterOrEqual(t, len(result.Content), 20, "expected length >= 20")
|
|
{
|
|
found := false
|
|
if strings.Contains(string(result.Content), `Lorem`) { found = true }
|
|
if strings.Contains(string(result.Content), `ipsum`) { found = true }
|
|
if strings.Contains(string(result.Content), `document`) { found = true }
|
|
if strings.Contains(string(result.Content), `text`) { found = true }
|
|
if !found {
|
|
t.Errorf("expected to contain at least one of the specified values")
|
|
}
|
|
}
|
|
}
|
|
|
|
func Test_SmokeHtmlBasic(t *testing.T) {
|
|
// Smoke test: HTML table extraction
|
|
mime_typeVal := `text/html`
|
|
result, err := kreuzberg.ExtractFile(`html/simple_table.html`, &mime_typeVal, kreuzberg.ExtractionConfig{})
|
|
if err != nil {
|
|
t.Fatalf("call failed: %v", err)
|
|
}
|
|
if strings.TrimSpace(string(result.MimeType)) != `text/html` {
|
|
t.Errorf("equals mismatch: got %v", result.MimeType)
|
|
}
|
|
assert.GreaterOrEqual(t, len(result.Content), 10, "expected length >= 10")
|
|
{
|
|
found := false
|
|
if strings.Contains(string(result.Content), `Sample Data Table`) { found = true }
|
|
if strings.Contains(string(result.Content), `Laptop`) { found = true }
|
|
if strings.Contains(string(result.Content), `Electronics`) { found = true }
|
|
if strings.Contains(string(result.Content), `Product`) { found = true }
|
|
if !found {
|
|
t.Errorf("expected to contain at least one of the specified values")
|
|
}
|
|
}
|
|
}
|
|
|
|
func Test_SmokeImagePng(t *testing.T) {
|
|
// Smoke test: PNG image (without OCR, metadata only)
|
|
var config kreuzberg.ExtractionConfig
|
|
if err := json.Unmarshal([]byte(`{"disable_ocr":true}`), &config); err != nil {
|
|
t.Fatalf("config parse failed: %v", err)
|
|
}
|
|
result, err := kreuzberg.ExtractFile(`images/sample.png`, nil, config)
|
|
if err != nil {
|
|
t.Fatalf("call failed: %v", err)
|
|
}
|
|
if strings.TrimSpace(string(result.MimeType)) != `image/png` {
|
|
t.Errorf("equals mismatch: got %v", result.MimeType)
|
|
}
|
|
}
|
|
|
|
func Test_SmokeJsonBasic(t *testing.T) {
|
|
// Smoke test: JSON file extraction
|
|
mime_typeVal := `application/json`
|
|
result, err := kreuzberg.ExtractFile(`json/simple.json`, &mime_typeVal, kreuzberg.ExtractionConfig{})
|
|
if err != nil {
|
|
t.Fatalf("call failed: %v", err)
|
|
}
|
|
if strings.TrimSpace(string(result.MimeType)) != `application/json` {
|
|
t.Errorf("equals mismatch: got %v", result.MimeType)
|
|
}
|
|
assert.GreaterOrEqual(t, len(result.Content), 5, "expected length >= 5")
|
|
}
|
|
|
|
func Test_SmokePdfBasic(t *testing.T) {
|
|
// Smoke test: PDF with simple text extraction
|
|
mime_typeVal := `application/pdf`
|
|
result, err := kreuzberg.ExtractFile(`pdf/fake_memo.pdf`, &mime_typeVal, kreuzberg.ExtractionConfig{})
|
|
if err != nil {
|
|
t.Fatalf("call failed: %v", err)
|
|
}
|
|
if strings.TrimSpace(string(result.MimeType)) != `application/pdf` {
|
|
t.Errorf("equals mismatch: got %v", result.MimeType)
|
|
}
|
|
assert.GreaterOrEqual(t, len(result.Content), 50, "expected length >= 50")
|
|
{
|
|
found := false
|
|
if strings.Contains(string(result.Content), `May 5, 2023`) { found = true }
|
|
if strings.Contains(string(result.Content), `To Whom it May Concern`) { found = true }
|
|
if !found {
|
|
t.Errorf("expected to contain at least one of the specified values")
|
|
}
|
|
}
|
|
}
|
|
|
|
func Test_SmokeTxtBasic(t *testing.T) {
|
|
// Smoke test: Plain text file
|
|
mime_typeVal := `text/plain`
|
|
result, err := kreuzberg.ExtractFile(`text/report.txt`, &mime_typeVal, kreuzberg.ExtractionConfig{})
|
|
if err != nil {
|
|
t.Fatalf("call failed: %v", err)
|
|
}
|
|
if strings.TrimSpace(string(result.MimeType)) != `text/plain` {
|
|
t.Errorf("equals mismatch: got %v", result.MimeType)
|
|
}
|
|
assert.GreaterOrEqual(t, len(result.Content), 5, "expected length >= 5")
|
|
}
|
|
|
|
func Test_SmokeXlsxBasic(t *testing.T) {
|
|
// Smoke test: XLSX with basic spreadsheet data including tables
|
|
mime_typeVal := `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet`
|
|
result, err := kreuzberg.ExtractFile(`xlsx/stanley_cups.xlsx`, &mime_typeVal, kreuzberg.ExtractionConfig{})
|
|
if err != nil {
|
|
t.Fatalf("call failed: %v", err)
|
|
}
|
|
if strings.TrimSpace(string(result.MimeType)) != `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` {
|
|
t.Errorf("equals mismatch: got %v", result.MimeType)
|
|
}
|
|
assert.GreaterOrEqual(t, len(result.Content), 100, "expected length >= 100")
|
|
if !strings.Contains(string(result.Content), `Team`) {
|
|
t.Errorf("expected to contain %s", `Team`)
|
|
}
|
|
if !strings.Contains(string(result.Content), `Location`) {
|
|
t.Errorf("expected to contain %s", `Location`)
|
|
}
|
|
if !strings.Contains(string(result.Content), `Stanley Cups`) {
|
|
t.Errorf("expected to contain %s", `Stanley Cups`)
|
|
}
|
|
if !strings.Contains(string(result.Content), `Blues`) {
|
|
t.Errorf("expected to contain %s", `Blues`)
|
|
}
|
|
if !strings.Contains(string(result.Content), `Flyers`) {
|
|
t.Errorf("expected to contain %s", `Flyers`)
|
|
}
|
|
if !strings.Contains(string(result.Content), `Maple Leafs`) {
|
|
t.Errorf("expected to contain %s", `Maple Leafs`)
|
|
}
|
|
if !strings.Contains(string(result.Content), `STL`) {
|
|
t.Errorf("expected to contain %s", `STL`)
|
|
}
|
|
if !strings.Contains(string(result.Content), `PHI`) {
|
|
t.Errorf("expected to contain %s", `PHI`)
|
|
}
|
|
if !strings.Contains(string(result.Content), `TOR`) {
|
|
t.Errorf("expected to contain %s", `TOR`)
|
|
}
|
|
// skipped: field 'tables' not available on result type
|
|
// skipped: field 'metadata.format.excel.sheet_count' not available on result type
|
|
// skipped: field 'metadata.format.excel.sheet_names' not available on result type
|
|
}
|