// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef

import { describe, expect, it } from 'vitest';import { extractFile, extractBytes, type ExtractionConfig } from '@kreuzberg/node';
|
|
/**
 * Converts an arbitrary value to text for use in assertions.
 *
 * @param value - Any value, including `null` or `undefined`.
 * @returns An empty string when `value` is `null` or `undefined`;
 *          otherwise the result of `String(value)`.
 */
function _alefE2eText(value: unknown): string {
  // `== null` deliberately matches both null and undefined.
  return value == null ? "" : String(value);
}
|
|
|
|
/**
 * Collects the text forms of an item so a caller can search across all of them.
 *
 * @param item - A primitive, a nullish value, or an object.
 * @returns For nullish or non-object input, a single-element array holding the
 *          item's text. For objects, an eight-element array in this order:
 *          the stringified item itself, then its `kind`, `name`, `source`,
 *          `alias`, `text` and `signature` properties, and finally the
 *          space-joined text of `items` (empty string when `items` is not an
 *          array). Missing properties contribute empty strings.
 */
function _alefE2eItemTexts(item: unknown): string[] {
  if (item == null || typeof item !== "object") {
    return [_alefE2eText(item)];
  }

  const record = item as Record<string, unknown>;

  // Property order is part of the output contract; keep it stable.
  const fieldNames = ["kind", "name", "source", "alias", "text", "signature"];
  const fieldTexts = fieldNames.map((field) => _alefE2eText(record[field]));

  const itemsText = Array.isArray(record.items)
    ? record.items.map((entry) => _alefE2eText(entry)).join(" ")
    : "";

  return [_alefE2eText(item), ...fieldTexts, itemsText];
}
|
|
|
|
/**
 * Produces a display string for a format-metadata value.
 *
 * FormatMetadata is a tagged union, e.g.
 * `{ format_type: 'image', image: { format: 'PNG', ... }, ... }`.
 *
 * @param fm - The metadata value; may be nullish, a primitive, or an object.
 * @returns
 *  - `""` when `fm` is `null` or `undefined`;
 *  - `String(fm)` when `fm` is a primitive;
 *  - the image's `format` string when `format_type` is `"image"` and
 *    `image.format` is a string;
 *  - otherwise the `format_type` string, or `""` when it is not a string.
 */
function _alefE2eFormatMetadataDisplay(fm: unknown): string {
  if (fm == null) return "";
  if (typeof fm !== "object") return String(fm);

  const record = fm as Record<string, unknown>;
  const formatType = record.format_type;

  // Extract the display string based on the variant type.
  if (formatType === "image") {
    const imageData = record.image;
    // `typeof null` is "object", so null must be excluded explicitly;
    // otherwise reading `.format` below would throw a TypeError.
    if (imageData != null && typeof imageData === "object") {
      const format = (imageData as Record<string, unknown>).format;
      if (typeof format === "string") return format;
    }
  }

  // Fallback: return the format_type variant name.
  return typeof formatType === "string" ? formatType : "";
}
|
|
|
|
|
|
// Smoke tests: one extraction per supported fixture format, asserting on the
// reported MIME type and, where applicable, on the extracted content.
describe('smoke', () => {
  // Per-test timeout in milliseconds, passed as the third argument to `it`.
  const timeoutMs = 30000;

  it('ocr_image_png: OCR: PNG image extraction with OCR enabled. In WASM this exercises the Uint8Array bridge parameter and Promise await in the generated OcrBackend bridge.', async () => {
    // The fixture is read into memory so the bytes-based entry point is exercised.
    const fs = await import('node:fs/promises');
    const pngBytes = await fs.readFile('images/test_hello_world.png');
    const result = await extractBytes(pngBytes, "image/png", undefined);

    expect(result.mimeType.trim()).toBe("image/png");
    expect(result.content.length).toBeGreaterThanOrEqual(1);
    expect(["Hello", "World", "hello", "world"].some((v) => result.content.includes(v))).toBe(true);
  }, timeoutMs);

  it('smoke_docx_basic: Smoke test: DOCX with formatted text', async () => {
    const result = await extractFile(
      "docx/fake.docx",
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
      undefined,
    );

    expect(result.mimeType.trim()).toBe("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    expect(result.content.length).toBeGreaterThanOrEqual(20);
    expect(["Lorem", "ipsum", "document", "text"].some((v) => result.content.includes(v))).toBe(true);
  }, timeoutMs);

  it('smoke_html_basic: Smoke test: HTML table extraction', async () => {
    const result = await extractFile("html/simple_table.html", "text/html", undefined);

    expect(result.mimeType.trim()).toBe("text/html");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    expect(["Sample Data Table", "Laptop", "Electronics", "Product"].some((v) => result.content.includes(v))).toBe(true);
  }, timeoutMs);

  it('smoke_image_png: Smoke test: PNG image (without OCR, metadata only)', async () => {
    // No MIME type is supplied here; only the reported MIME type is asserted.
    const result = await extractFile("images/sample.png", undefined, { disableOcr: true } as ExtractionConfig);

    expect(result.mimeType.trim()).toBe("image/png");
  }, timeoutMs);

  it('smoke_json_basic: Smoke test: JSON file extraction', async () => {
    const result = await extractFile("json/simple.json", "application/json", undefined);

    expect(result.mimeType.trim()).toBe("application/json");
    expect(result.content.length).toBeGreaterThanOrEqual(5);
  }, timeoutMs);

  it('smoke_pdf_basic: Smoke test: PDF with simple text extraction', async () => {
    const result = await extractFile("pdf/fake_memo.pdf", "application/pdf", undefined);

    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(50);
    expect(["May 5, 2023", "To Whom it May Concern"].some((v) => result.content.includes(v))).toBe(true);
  }, timeoutMs);

  it('smoke_txt_basic: Smoke test: Plain text file', async () => {
    const result = await extractFile("text/report.txt", "text/plain", undefined);

    expect(result.mimeType.trim()).toBe("text/plain");
    expect(result.content.length).toBeGreaterThanOrEqual(5);
  }, timeoutMs);

  it('smoke_xlsx_basic: Smoke test: XLSX with basic spreadsheet data including tables', async () => {
    const result = await extractFile(
      "xlsx/stanley_cups.xlsx",
      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
      undefined,
    );

    expect(result.mimeType.trim()).toBe("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    expect(result.content.length).toBeGreaterThanOrEqual(100);
    expect(result.content).toContain("Team");
    expect(result.content).toContain("Location");
    expect(result.content).toContain("Stanley Cups");
    expect(result.content).toContain("Blues");
    expect(result.content).toContain("Flyers");
    expect(result.content).toContain("Maple Leafs");
    expect(result.content).toContain("STL");
    expect(result.content).toContain("PHI");
    expect(result.content).toContain("TOR");
    // skipped: field 'tables' not available on result type
    // skipped: field 'metadata.format.excel.sheet_count' not available on result type
    // skipped: field 'metadata.format.excel.sheet_names' not available on result type
  }, timeoutMs);
});
|