124 lines
9.7 KiB
TypeScript
Generated
124 lines
9.7 KiB
TypeScript
Generated
// This file is auto-generated by alef — DO NOT EDIT.
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
// To regenerate: alef generate
// To verify freshness: alef verify --exit-code
// Issues & docs: https://github.com/kreuzberg-dev/alef

import { describe, expect, it } from 'vitest';import { extractFile, extractFileSync, extractBytesSync, type ExtractionConfig } from '@kreuzberg/node';
|
|
/**
 * Convert an arbitrary value to text for use in string-based assertions.
 *
 * @param value - Any value, including `null` / `undefined`.
 * @returns The empty string for `null` and `undefined`; otherwise `String(value)`.
 *   Note that plain objects therefore render as `"[object Object]"`.
 */
function _alefE2eText(value: unknown): string {
  // `== null` deliberately matches both null and undefined.
  return value == null ? "" : String(value);
}
|
|
|
|
/**
 * Collect the candidate text representations of a result item.
 *
 * @param item - A primitive, `null`/`undefined`, or an object-like record.
 * @returns For non-objects (and `null`/`undefined`), a single-element array with
 *   the item's text. For objects, an eight-element array holding, in order: the
 *   stringified item itself, then its `kind`, `name`, `source`, `alias`, `text`
 *   and `signature` properties as text, and finally the elements of `items`
 *   (when it is an array) joined with single spaces. Missing properties
 *   contribute an empty string.
 */
function _alefE2eItemTexts(item: unknown): string[] {
  if (item == null || typeof item !== "object") {
    return [_alefE2eText(item)];
  }

  const record = item as Record<string, unknown>;

  // Wrap the helper in an arrow so `map` does not forward its extra
  // (index, array) arguments to it.
  const itemsText = Array.isArray(record.items)
    ? record.items.map((entry: unknown) => _alefE2eText(entry)).join(" ")
    : "";

  return [
    _alefE2eText(item),
    _alefE2eText(record.kind),
    _alefE2eText(record.name),
    _alefE2eText(record.source),
    _alefE2eText(record.alias),
    _alefE2eText(record.text),
    _alefE2eText(record.signature),
    itemsText,
  ];
}
|
|
|
|
/**
 * Produce a short display string for a format-metadata value.
 *
 * @param fm - The format metadata; may be `null`/`undefined`, a primitive, or
 *   an object carrying a `format_type` tag.
 * @returns
 *   - `""` when `fm` is `null` or `undefined`;
 *   - `String(fm)` when `fm` is not an object;
 *   - `fm.image.format` when `format_type` is `"image"` and that field is a string;
 *   - otherwise `format_type` when it is a string, else `""`.
 */
function _alefE2eFormatMetadataDisplay(fm: unknown): string {
  if (fm == null) return "";
  if (typeof fm !== "object") return String(fm);

  const record = fm as Record<string, unknown>;
  const formatType = record.format_type;

  // FormatMetadata is a tagged union: { format_type: 'image', image: { format: 'PNG', ... }, ... }
  // Extract the display string based on the variant type.
  if (formatType === "image") {
    const imageData = record.image;
    // `typeof null === "object"`, so null has to be excluded explicitly;
    // otherwise reading `.format` would throw a TypeError.
    if (imageData != null && typeof imageData === "object") {
      const format = (imageData as Record<string, unknown>).format;
      if (typeof format === "string") return format;
    }
  }

  // Fallback: return the format_type variant name.
  return typeof formatType === "string" ? formatType : "";
}
|
|
|
|
|
|
/**
 * Contract e2e suite: each case runs one extraction against a fixture file and
 * asserts on the reported MIME type and/or the extracted content.
 *
 * Lines marked `// skipped:` are assertions the generator did not emit for this
 * binding; they are kept as markers only.
 *
 * NOTE(review): the `api_batch_*` and `api_extract_bytes_async` cases all call
 * `extractFile`, although their names refer to batch/bytes APIs — confirm with
 * the generator whether that substitution is intended.
 */
describe('contract', () => {
  // Per-test timeout (milliseconds) passed as the third argument to `it`.
  const timeoutMs = 30000;

  it('api_batch_bytes_async: Tests async batch bytes extraction API (batch_extract_bytes)', async () => {
    const result = await extractFile("pdf/fake_memo.pdf", undefined, undefined);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    expect(["May 5, 2023", "Mallori"].some((v) => result.content.includes(v))).toBe(true);
  }, timeoutMs);

  it('api_batch_bytes_with_configs_async: Tests async batch bytes extraction with per-file configs (batch_extract_bytes with file_configs parameter)', async () => {
    const result = await extractFile("pdf/fake_memo.pdf", undefined, { outputFormat: "markdown" } as ExtractionConfig);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    // skipped: field 'metadata.output_format' not available on result type
  }, timeoutMs);

  it('api_batch_file_async: Tests async batch file extraction API (batch_extract_file)', async () => {
    const result = await extractFile("pdf/fake_memo.pdf", undefined, undefined);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    expect(["May 5, 2023", "Mallori"].some((v) => result.content.includes(v))).toBe(true);
  }, timeoutMs);

  it('api_batch_file_with_configs_async: Tests async batch file extraction with per-file configs (batch_extract_files with file_configs parameter)', async () => {
    const result = await extractFile("pdf/fake_memo.pdf", undefined, { outputFormat: "markdown" } as ExtractionConfig);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    // skipped: field 'metadata.output_format' not available on result type
  }, timeoutMs);

  it('api_extract_bytes_async: Tests async bytes extraction API (extract_bytes)', async () => {
    const result = await extractFile("pdf/fake_memo.pdf", undefined, undefined);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    expect(["May 5, 2023", "Mallori"].some((v) => result.content.includes(v))).toBe(true);
  }, timeoutMs);

  it('api_extract_file_async: Tests async file extraction API (extract_file)', async () => {
    const result = await extractFile("pdf/fake_memo.pdf", undefined, undefined);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    expect(["May 5, 2023", "Mallori"].some((v) => result.content.includes(v))).toBe(true);
  }, timeoutMs);

  it('config_chunking_prepend_heading_context: Tests markdown chunker prepends heading hierarchy to chunk content', () => {
    const result = extractFileSync("markdown/extraction_test.md", undefined, { chunking: { chunkerType: "markdown", maxChars: 300, maxOverlap: 50, prependHeadingContext: true } } as ExtractionConfig);
    expect(result.content.length).toBeGreaterThanOrEqual(10);

    // NOTE(review): the generator emitted a "skipped: field 'chunks' not
    // available on result type" marker here, yet the assertions below read
    // `result.chunks` — confirm which of the two is stale.
    const chunks = result.chunks ?? [];
    // Every chunk must carry non-empty content and a heading context.
    expect(chunks.every((c: { content?: string }) => !!c.content)).toBe(true);
    expect(chunks.every((c: { metadata?: { headingContext?: string } }) => c.metadata?.headingContext != null)).toBe(true);
    // This check fails on an empty chunk list, so at least one chunk is required.
    expect(chunks.at(0)?.metadata?.headingContext != null).toBe(true);
  }, timeoutMs);

  it('config_document_structure_with_headings: Tests document structure with DOCX heading-driven nesting', () => {
    const result = extractFileSync("docx/fake.docx", undefined, { includeDocumentStructure: true } as ExtractionConfig);
    expect(result.mimeType.trim()).toBe("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    // skipped: field 'document' not available on result type
    // skipped: field 'document.nodes' not available on result type
  }, timeoutMs);

  it('config_element_types: Tests element-based result format with element type assertions on DOCX', () => {
    const result = extractFileSync("docx/unit_test_headers.docx", undefined, { resultFormat: "element_based" } as ExtractionConfig);
    expect(["application/vnd.openxmlformats-officedocument.wordprocessingml.document"].some((v) => result.mimeType.includes(v))).toBe(true);
    // skipped: field 'elements' not available on result type
  }, timeoutMs);

  it('config_extraction_timeout: Tests that extraction_timeout_secs config field is accepted and does not affect fast extractions', () => {
    const result = extractFileSync("pdf/fake_memo.pdf", undefined, { extractionTimeoutSecs: 300 } as ExtractionConfig);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
  }, timeoutMs);

  it('config_keywords: Tests keyword extraction via YAKE algorithm', () => {
    const result = extractFileSync("pdf/fake_memo.pdf", undefined, { keywords: { algorithm: "yake", maxKeywords: 10 } } as ExtractionConfig);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    // skipped: field 'keywords' not available on Node JsExtractionResult
    // skipped: field 'keywords' not available on Node JsExtractionResult
  }, timeoutMs);

  it('config_pages: Tests page extraction and page marker configuration', () => {
    const result = extractFileSync("pdf/fake_memo.pdf", undefined, { pages: { extractPages: true, insertPageMarkers: true } } as ExtractionConfig);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    expect(["PAGE"].some((v) => result.content.includes(v))).toBe(true);
  }, timeoutMs);

  it('config_quality_enabled: Tests quality scoring produces a score value in [0.0, 1.0]', () => {
    const result = extractFileSync("pdf/fake_memo.pdf", undefined, { enableQualityProcessing: true } as ExtractionConfig);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    // skipped: field 'quality_score' not available on result type
    // skipped: field 'quality_score' not available on result type
    // skipped: field 'quality_score' not available on result type
  }, timeoutMs);

  it('config_security_limits: Tests archive extraction with custom security limits', () => {
    const result = extractFileSync("archives/documents.zip", undefined, { securityLimits: { maxArchiveSize: 104857600, maxCompressionRatio: 50, maxFilesInArchive: 100 } } as ExtractionConfig);
    expect(["application/zip", "application/x-zip-compressed"].some((v) => result.mimeType.includes(v))).toBe(true);
    expect(result.content.length).toBeGreaterThanOrEqual(10);
  }, timeoutMs);

  it('config_tree_sitter: Tests tree-sitter configuration round-trip', () => {
    const result = extractFileSync("code/hello.py", undefined, { treeSitter: { groups: ["web"], languages: ["python", "rust"], process: { comments: false, diagnostics: false, docstrings: false, exports: true, imports: true, structure: true, symbols: false } } } as ExtractionConfig);
    expect(result.mimeType.trim()).toBe("text/x-source-code");
    expect(result.content.length).toBeGreaterThanOrEqual(5);
  }, timeoutMs);

  it('output_format_bytes_markdown: Tests markdown output format via bytes extraction API', async () => {
    // Read the fixture into memory so it can be passed to the bytes-based API.
    const { readFile } = await import('node:fs/promises');
    const pdfBytes = await readFile('pdf/fake_memo.pdf');
    const result = extractBytesSync(pdfBytes, "application/pdf", { outputFormat: "markdown" } as ExtractionConfig);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    // skipped: field 'metadata.output_format' not available on result type
  }, timeoutMs);

  it('output_format_markdown: Tests Markdown output format', () => {
    const result = extractFileSync("pdf/fake_memo.pdf", undefined, { outputFormat: "markdown" } as ExtractionConfig);
    expect(result.mimeType.trim()).toBe("application/pdf");
    expect(result.content.length).toBeGreaterThanOrEqual(10);
    // skipped: field 'metadata.output_format' not available on result type
  }, timeoutMs);
});
|