Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
```typescript title="WASM"
import init, { extractFile } from "kreuzberg-wasm";
// `init()` is awaited before any extraction call is made.
await init();

// Options object handed to `extractFile` as its third argument.
const chunkingConfig = {
  chunking: { maxChars: 1500, chunkOverlap: 200 },
};

const extraction = await extractFile("document.pdf", undefined, chunkingConfig);

// `chunks` is read defensively; a missing list is reported as 0.
const chunkCount = extraction.chunks?.length ?? 0;
console.log(`Chunks created: ${chunkCount}`);
```

View File

@@ -0,0 +1,23 @@
```typescript title="WASM"
import init, { extractFile } from "kreuzberg-wasm";
// `init()` is awaited before any extraction call is made.
await init();

// Options object handed to `extractFile` as its third argument.
const chunkingConfig = {
  chunking: { maxChars: 500, chunkOverlap: 50 },
};

const extraction = await extractFile("research_paper.pdf", undefined, chunkingConfig);

// Nothing is printed when the result carries no chunk list.
if (extraction.chunks) {
  for (const { metadata, content } of extraction.chunks) {
    // `chunkIndex` is printed 1-based next to the total chunk count.
    console.log(`Chunk ${metadata.chunkIndex + 1}/${metadata.totalChunks}`);
    console.log(`Position: ${metadata.byteStart}-${metadata.byteEnd}`);
    // Preview only: the first 100 characters followed by an ellipsis.
    console.log(`Content: ${content.slice(0, 100)}...`);
  }
}
```

View File

@@ -0,0 +1,52 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
/**
 * Runtime type guard for an extraction configuration object.
 *
 * Only the sections and fields listed below are inspected; any other keys are
 * ignored. A section or field that is `undefined` or `null` counts as "not
 * provided" and is accepted.
 *
 * - `ocr`: `language` and `backend` must be strings.
 * - `chunking`: `maxChars` and `chunkOverlap` must be numbers.
 * - `images`: `extractImages` must be a boolean, `targetDpi` a number.
 *
 * @param config - Arbitrary value to validate.
 * @returns `true` when `config` is an object whose inspected fields have the
 *   expected primitive types, `false` otherwise.
 */
function validateExtractionConfig(config: unknown): config is ExtractionConfig {
  if (config === null || typeof config !== "object") {
    return false;
  }
  const cfg = config as Record<string, unknown>;
  return (
    hasValidSection(cfg.ocr, { language: "string", backend: "string" }) &&
    hasValidSection(cfg.chunking, { maxChars: "number", chunkOverlap: "number" }) &&
    hasValidSection(cfg.images, { extractImages: "boolean", targetDpi: "number" })
  );
}

/** Checks that one optional config section is an object whose listed fields have the expected `typeof`. */
function hasValidSection(
  section: unknown,
  expectedTypes: Record<string, "string" | "number" | "boolean">,
): boolean {
  // An omitted section is valid.
  if (section === undefined || section === null) {
    return true;
  }
  // A present section must be an object; previously e.g. `ocr: "tesseract"` was never inspected.
  if (typeof section !== "object") {
    return false;
  }
  const fields = section as Record<string, unknown>;
  return Object.entries(expectedTypes).every(([key, expected]) => {
    const value = fields[key];
    // Compare against undefined/null explicitly rather than by truthiness, so that
    // falsy values of the wrong type (`""`, `0`, `false`) are rejected while
    // legitimate falsy values (`maxChars: 0`, `extractImages: false`) still pass.
    return value === undefined || value === null || typeof value === expected;
  });
}
// Sample configuration fed to the validator.
const testConfig = {
  ocr: {
    backend: "tesseract-wasm",
    language: "eng",
  },
  chunking: {
    maxChars: 1000,
  },
};

// Print the validation outcome.
const isValidConfig = validateExtractionConfig(testConfig);
console.log(isValidConfig ? "Configuration is valid" : "Configuration is invalid");

View File

@@ -0,0 +1,44 @@
import { extractBytes, fileToUint8Array, initWasm } from "@kreuzberg/wasm";
/**
 * Reads a `File` into a `Uint8Array` and passes the bytes, together with the
 * file's declared MIME type, to `extractBytes`.
 *
 * `initWasm()` is awaited first, outside the error handler. Any failure after
 * that point (including the size check) is logged with the prefix
 * "Conversion failed:" and then rethrown unchanged.
 *
 * @param file - File to convert; rejected when larger than 512 MiB.
 * @returns The value resolved by `extractBytes`.
 * @throws Error when the file exceeds the size limit, or whatever the
 *   conversion/extraction helpers throw.
 */
async function convertAndExtract(file: File) {
  await initWasm();

  const maxFileBytes = 512 * 1024 * 1024;
  try {
    if (file.size > maxFileBytes) {
      throw new Error("File exceeds 512 MB limit");
    }

    const payload = await fileToUint8Array(file);
    console.log(`Converted ${file.name} (${payload.byteLength} bytes) to Uint8Array`);

    // Awaited inside the try so that a rejected extraction is logged as well.
    return await extractBytes(payload, file.type);
  } catch (failure) {
    console.error("Conversion failed:", failure);
    throw failure;
  }
}
/**
 * Serialises the `content`, `mimeType` and `metadata` properties of a result
 * to JSON and wraps the text in a `Blob` typed as "application/json".
 *
 * All other properties of `result` are left out of the serialised output.
 *
 * @param result - Object to read the three properties from.
 * @returns A JSON blob containing only those three properties.
 */
function createBlobFromResult(result: any): Blob {
  const { content, mimeType, metadata } = result;
  const serialized = JSON.stringify({ content, mimeType, metadata });
  return new Blob([serialized], { type: "application/json" });
}
/**
 * End-to-end demo: builds an in-memory "test.pdf" backed by a 100-byte
 * buffer, runs it through `convertAndExtract`, and logs the blob produced by
 * `createBlobFromResult`. Errors are logged and not rethrown.
 */
async function demonstrateConversion() {
  const placeholderBuffer = new ArrayBuffer(100);
  const sampleFile = new File([placeholderBuffer], "test.pdf", { type: "application/pdf" });

  try {
    const extraction = await convertAndExtract(sampleFile);
    console.log("Result blob:", createBlobFromResult(extraction));
  } catch (failure) {
    console.error(failure);
  }
}

// Fire-and-forget: the function handles its own errors.
void demonstrateConversion();

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractFile } from "kreuzberg-wasm";
// `init()` is awaited before any extraction call is made.
await init();

// Options object handed to `extractFile` as its third argument.
const keywordConfig = {
  keywords: { algorithm: "yake", maxKeywords: 10, minScore: 0.3 },
};

const extraction = await extractFile("research_paper.pdf", undefined, keywordConfig);
console.log(`Content length: ${extraction.content.length}`);

// Missing metadata or keywords are printed as an empty list.
const keywordList = extraction.metadata?.keywords ?? [];
console.log(`Keywords: ${JSON.stringify(keywordList)}`);
```

View File

@@ -0,0 +1,30 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Guesses a MIME type from the leading bytes of `bytes` and forwards both to
 * `extractBytes`.
 *
 * The first (up to) eight bytes are rendered as lowercase hex and compared
 * against a fixed list of prefixes; the first match wins. When none match,
 * "application/octet-stream" is used. Both the hex string and the detected
 * type are logged.
 *
 * @param bytes - Raw document bytes.
 * @returns The value resolved by `extractBytes`.
 */
async function detectAndExtract(bytes: Uint8Array) {
  await initWasm();

  let magicStr = "";
  for (const byte of bytes.slice(0, 8)) {
    magicStr += byte.toString(16).padStart(2, "0");
  }
  console.log("Magic bytes:", magicStr);

  // Hex prefix -> MIME type, checked in order.
  const signatures: ReadonlyArray<readonly [string, string]> = [
    ["25504446", "application/pdf"], // "%PDF"
    ["504b0304", "application/zip"], // "PK\x03\x04"
    ["ffd8ff", "image/jpeg"],
    ["89504e47", "image/png"], // "\x89PNG"
    ["474946", "image/gif"], // "GIF"
  ];
  const hit = signatures.find(([prefix]) => magicStr.startsWith(prefix));
  const mimeType = hit ? hit[1] : "application/octet-stream";
  console.log("Detected MIME type:", mimeType);

  return await extractBytes(bytes, mimeType);
}
// Four bytes whose hex rendering is "25504446" (ASCII "%PDF").
const testBytes = Uint8Array.of(0x25, 0x50, 0x44, 0x46);

// Run the demo; log the extraction result on success or the error on failure.
void (async () => {
  try {
    console.log(await detectAndExtract(testBytes));
  } catch (failure) {
    console.error(failure);
  }
})();

View File

@@ -0,0 +1,15 @@
```typescript title="WASM"
import init, { extractFile } from "kreuzberg-wasm";
// `init()` is awaited before any extraction call is made.
await init();

// Options object handed to `extractFile` as its third argument.
const qualityConfig = { enableQualityProcessing: true };

const extraction = await extractFile("scanned_document.pdf", undefined, qualityConfig);
console.log(`Content length: ${extraction.content.length} characters`);

// The score is printed only when it is neither undefined nor null.
const { qualityScore } = extraction;
if (qualityScore != null) {
  console.log(`Quality score: ${qualityScore.toFixed(2)}`);
}
```

View File

@@ -0,0 +1,15 @@
```typescript title="WASM"
import init, { extractFile } from "kreuzberg-wasm";
// `init()` is awaited before any extraction call is made.
await init();

// Options object handed to `extractFile` as its third argument.
const reductionConfig = {
  tokenReduction: { mode: "moderate", preserveImportantWords: true },
};

const { content } = await extractFile("document.pdf", undefined, reductionConfig);
console.log(content);
```

View File

@@ -0,0 +1,16 @@
```typescript title="WASM"
import init, { extractFile } from "kreuzberg-wasm";
// `init()` is awaited before any extraction call is made.
await init();

// Options object handed to `extractFile` as its third argument.
const reductionConfig = {
  tokenReduction: { mode: "moderate", preserveImportantWords: true },
};

const { content, mimeType } = await extractFile("verbose_document.pdf", undefined, reductionConfig);
console.log(`Content length: ${content.length}`);
console.log(`MIME type: ${mimeType}`);
```

View File

@@ -0,0 +1,47 @@
import type { Chunk, ExtractedImage, ExtractionResult, Table } from "@kreuzberg/wasm";
/**
 * Type guard for `Table`: accepts objects with an array `cells`, a string
 * `markdown` and a numeric `pageNumber`.
 *
 * @param obj - Value to test.
 * @returns `true` when all three fields have the expected types.
 */
function _isTable(obj: unknown): obj is Table {
  if (typeof obj !== "object" || obj === null) {
    return false;
  }
  const { cells, markdown, pageNumber } = obj as Record<string, unknown>;
  const hasCells = Array.isArray(cells);
  const hasMarkdown = typeof markdown === "string";
  const hasPageNumber = typeof pageNumber === "number";
  return hasCells && hasMarkdown && hasPageNumber;
}
/**
 * Type guard for `Chunk`: accepts objects with a string `content` and an
 * object `metadata` whose `charStart` is a number.
 *
 * Always returns a real boolean. The earlier `c.metadata && …` form leaked the
 * raw falsy `metadata` value (e.g. `null` or `undefined`) as the return value.
 *
 * NOTE(review): other usage examples read `byteStart`/`byteEnd` from chunk
 * metadata — confirm that `charStart` is the right discriminating field for
 * the library version in use.
 *
 * @param obj - Value to test.
 * @returns `true` when `obj` has the shape described above, else `false`.
 */
function _isChunk(obj: unknown): obj is Chunk {
  if (obj === null || typeof obj !== "object") return false;
  const c = obj as Record<string, unknown>;
  const metadata = c.metadata;
  return (
    typeof c.content === "string" &&
    typeof metadata === "object" &&
    // `typeof null` is "object", so null must be excluded explicitly.
    metadata !== null &&
    typeof (metadata as Record<string, unknown>).charStart === "number"
  );
}
/**
 * Type guard for `ExtractedImage`: accepts objects whose `data` is a
 * `Uint8Array`, `format` is a string and `imageIndex` is a number.
 *
 * @param obj - Value to test.
 * @returns `true` when all three fields have the expected types.
 */
function _isExtractedImage(obj: unknown): obj is ExtractedImage {
  if (typeof obj !== "object" || obj === null) {
    return false;
  }
  const { data, format, imageIndex } = obj as Record<string, unknown>;
  const hasBinaryData = data instanceof Uint8Array;
  const hasFormat = typeof format === "string";
  const hasIndex = typeof imageIndex === "number";
  return hasBinaryData && hasFormat && hasIndex;
}
/**
 * Type guard for `ExtractionResult`: accepts objects with string `content`
 * and `mimeType`, an object `metadata`, and an array `tables`.
 *
 * Always returns a real boolean. The earlier `r.metadata && …` form leaked the
 * raw falsy `metadata` value (e.g. `null` or `undefined`) as the return value.
 *
 * @param obj - Value to test.
 * @returns `true` when `obj` has the shape described above, else `false`.
 */
function isExtractionResult(obj: unknown): obj is ExtractionResult {
  if (obj === null || typeof obj !== "object") return false;
  const r = obj as Record<string, unknown>;
  return (
    typeof r.content === "string" &&
    typeof r.mimeType === "string" &&
    typeof r.metadata === "object" &&
    // `typeof null` is "object", so null must be excluded explicitly.
    r.metadata !== null &&
    Array.isArray(r.tables)
  );
}
// Minimal object literal used to exercise the guard.
const result = {
  content: "text",
  mimeType: "application/pdf",
  metadata: {},
  tables: [],
};

if (isExtractionResult(result)) {
  console.log("Valid extraction result");
  const hasTables = result.tables.length > 0;
  console.log("Has tables:", hasTables);
}