Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Chunked extraction with document structure enabled.
const config = {
  chunking: {
    maxChars: 1500,
    chunkOverlap: 300,
  },
  includeDocumentStructure: true,
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Map chunks back to pages for source attribution:
// page number -> indices of every chunk whose page range covers that page.
const chunkPageMap = new Map<number, number[]>();
for (const [chunkIndex, chunk] of (result.chunks ?? []).entries()) {
  const firstPage = chunk.metadata?.firstPage;
  const lastPage = chunk.metadata?.lastPage;
  // Chunks without a complete page range cannot be attributed.
  if (firstPage === undefined || lastPage === undefined) {
    continue;
  }
  for (let page = firstPage; page <= lastPage; page++) {
    const bucket = chunkPageMap.get(page);
    if (bucket) {
      bucket.push(chunkIndex);
    } else {
      chunkPageMap.set(page, [chunkIndex]);
    }
  }
}

// Use the mapping for source attribution
for (const [pageNum, chunkIndices] of chunkPageMap) {
  console.log(`Page ${pageNum}: Chunks ${chunkIndices.join(", ")}`);
  for (const idx of chunkIndices) {
    const chunk = result.chunks![idx];
    console.log(` Content: "${chunk.content.substring(0, 60)}..."`);
  }
}
```
**Snippet:syntax-only** - Requires document structure parsing during extraction.

View File

@@ -0,0 +1,70 @@
```typescript title="WASM - Fixed-Size Chunks"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Fixed-size chunking settings.
const config = {
  chunking: {
    maxChars: 2000,
    chunkOverlap: 400,
    trim: true,
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Print index, byte range and a 50-character preview for each chunk.
for (const chunk of result.chunks ?? []) {
  const meta = chunk.metadata;
  console.log(`Chunk ${meta?.chunkIndex}/${meta?.totalChunks}`);
  console.log(` Position: ${meta?.byteStart}-${meta?.byteEnd}`);
  console.log(` Content: "${chunk.content.substring(0, 50)}..."`);
}
```
```typescript title="WASM - Markdown-Aware Chunking"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Markdown chunker with heading context prepended to each chunk.
const config = {
  chunking: {
    chunkerType: "markdown",
    maxChars: 1500,
    prependHeadingContext: true,
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);

for (const chunk of result.chunks ?? []) {
  // Content already includes heading context prepended
  console.log(chunk.content.substring(0, 80));

  // Render the heading trail as "# H1 > ## H2 > ..." (undefined when absent).
  const headingPath = chunk.metadata?.headingContext?.headings
    ?.map((h) => `${"#".repeat(h.level)} ${h.text}`)
    .join(" > ");
  console.log(` Heading path: ${headingPath}`);
}
```
```typescript title="WASM - Semantic Chunking with Topic Threshold"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Semantic chunker configuration.
const config = {
  chunking: {
    chunkerType: "semantic",
    maxChars: 1000,
    topicThreshold: 0.5, // Boundary detection at 50% topic change
    chunkOverlap: 100,
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);

console.log(`Generated ${result.chunks?.length} semantic chunks`);
// One line per chunk: its index and character length.
for (const chunk of result.chunks ?? []) {
  const size = chunk.content.length;
  console.log(`Chunk ${chunk.metadata?.chunkIndex}: ${size} chars`);
}
```

View File

@@ -0,0 +1,80 @@
```typescript title="WASM - Chunking for RAG Pipeline"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  chunking: {
    maxChars: 512, // Smaller chunks for vector DB efficiency
    chunkOverlap: 100,
    trim: true,
  },
  includeDocumentStructure: true,
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Prepare chunks for vector database ingestion: one record per chunk with an
// id derived from the file name (or "doc") and the chunk position.
const ragChunks = (result.chunks ?? []).map((chunk, idx) => {
  const meta = chunk.metadata;
  return {
    id: `${result.metadata?.filename || "doc"}_chunk_${idx}`,
    text: chunk.content,
    metadata: {
      source: result.metadata?.filename,
      chunk_index: meta?.chunkIndex,
      total_chunks: meta?.totalChunks,
      first_page: meta?.firstPage,
      last_page: meta?.lastPage,
      byte_position: `${meta?.byteStart}-${meta?.byteEnd}`,
    },
    // Embedding would be added by vector DB embedding model
  };
});

console.log(`Prepared ${ragChunks.length} chunks for RAG ingestion`);
// Preview the first three records.
for (const c of ragChunks.slice(0, 3)) {
  console.log(`Chunk ${c.id}: ${c.text.substring(0, 50)}...`);
}

// Example: Send to vector database (e.g., Pinecone, Weaviate, Milvus)
// const vectorResults = await vectorDb.upsert(ragChunks.map(c => ({
// id: c.id,
// values: await embedModel.embed(c.text),
// metadata: c.metadata,
// })));
```
```typescript title="WASM - RAG with Retrieval Context"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  chunking: {
    maxChars: 768,
    chunkOverlap: 200,
    prependHeadingContext: true, // For markdown/docs
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);

// Build retrieval augmented context
interface RagDocument {
  id: string;
  query_text: string;
  context: string;
  page: number | null;
}

const ragDocs: RagDocument[] = [];
for (const [idx, chunk] of (result.chunks ?? []).entries()) {
  ragDocs.push({
    id: `chunk_${idx}`,
    query_text: chunk.content, // Text to embed and search
    context: chunk.content, // Full context (includes heading)
    page: chunk.metadata?.firstPage ?? null,
  });
}
console.log(`Built RAG documents: ${ragDocs.length}`);
```

View File

@@ -0,0 +1,50 @@
import type { ExtractionResult } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/** A named post-processing stage that maps one extraction result to the next. */
interface ProcessingStep {
  name: string;
  process: (result: ExtractionResult) => Promise<ExtractionResult>;
}

/**
 * Initializes the WASM module, extracts `bytes` as `mimeType`, then feeds the
 * result through every step in order, logging each step name before it runs.
 *
 * @param steps - Stages applied sequentially to the extraction result.
 * @param bytes - Raw document bytes.
 * @param mimeType - MIME type passed to `extractBytes`.
 * @returns The result produced by the last step (or the raw extraction when
 *   `steps` is empty).
 */
async function createExtractionPipeline(
  steps: ProcessingStep[],
  bytes: Uint8Array,
  mimeType: string,
) {
  await initWasm();
  let current = await extractBytes(bytes, mimeType);
  for (let i = 0; i < steps.length; i++) {
    const stage = steps[i];
    console.log(`Executing step: ${stage.name}`);
    current = await stage.process(current);
  }
  return current;
}
// Example pipeline: only the first stage transforms the result; the other
// two are pass-through placeholders.
const pipeline: ProcessingStep[] = [
  {
    name: "Text Normalization",
    // Collapse every whitespace run to a single space and trim the ends.
    process: async (result) => {
      const normalized = result.content.replace(/\s+/g, " ").trim();
      return { ...result, content: normalized };
    },
  },
  {
    name: "Language Detection",
    process: async (result) => result,
  },
  {
    name: "Chunking",
    process: async (result) => result,
  },
];

// Download the sample document, run it through the pipeline and log the outcome.
const pipelineResponse = await fetch("doc.pdf");
const pipelineInput = new Uint8Array(await pipelineResponse.arrayBuffer());
createExtractionPipeline(pipeline, pipelineInput, "application/pdf")
  .then((r) => console.log("Pipeline complete:", r))
  .catch(console.error);

View File

@@ -0,0 +1,34 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts a document, retrying on failure with exponential back-off.
 *
 * Attempt `k` (1-based) that fails is followed by a pause of `2^k * 100` ms
 * unless it was the last allowed attempt.
 *
 * @param bytes - Raw document bytes.
 * @param mimeType - MIME type passed to `extractBytes`.
 * @param maxRetries - Maximum number of attempts (default 3).
 * @returns The first successful extraction result.
 * @throws Error when every attempt fails; the message carries the last failure.
 */
async function extractWithRetry(bytes: Uint8Array, mimeType: string, maxRetries: number = 3) {
  await initWasm();

  let lastError: Error | null = null;
  let attempt = 0;
  while (attempt < maxRetries) {
    attempt += 1;
    try {
      console.log(`Extraction attempt ${attempt}/${maxRetries}`);
      const extracted = await extractBytes(bytes, mimeType);
      console.log("Extraction successful");
      return extracted;
    } catch (error) {
      // Normalize non-Error throwables so the message is always available.
      lastError = error instanceof Error ? error : new Error(String(error));
      console.warn(`Attempt ${attempt} failed:`, lastError.message);

      const isLastAttempt = attempt >= maxRetries;
      if (!isLastAttempt) {
        const delay = 2 ** attempt * 100;
        console.log(`Retrying in ${delay}ms...`);
        await new Promise((resolve) => setTimeout(resolve, delay));
      }
    }
  }

  throw new Error(`Extraction failed after ${maxRetries} attempts: ${lastError?.message}`);
}
// Download the sample document and extract it with the default retry budget.
const retryResponse = await fetch("doc.pdf");
const retryInput = new Uint8Array(await retryResponse.arrayBuffer());
extractWithRetry(retryInput, "application/pdf")
  .then((r) => console.log("Final result:", r))
  .catch(console.error);

View File

@@ -0,0 +1,105 @@
```typescript title="WASM - Keyword Extraction Setup"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Note: Keyword extraction requires the 'keywords' feature,
// which may not be available in all WASM builds.
// This example shows the configuration structure.
const config = {
  // Extraction configuration
  outputFormat: "markdown",
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Keyword extraction would be performed on the extracted text
// using external libraries or post-processing
console.log(`Extracted text: ${result.content.substring(0, 100)}...`);

// Example post-processing to extract keywords
// (requires external keyword extraction library)
const lowered = result.content.toLowerCase();
const words = lowered.split(/\s+/).filter((w) => w.length > 4); // Simple heuristic: words > 4 chars

// De-duplicate while keeping first-seen order.
const keywords = new Set<string>();
for (const word of words) {
  keywords.add(word);
}

const firstTen = Array.from(keywords).slice(0, 10);
console.log(`Extracted keywords: ${firstTen.join(", ")}`);
```
```typescript title="WASM - Keyword Filtering"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  outputFormat: "markdown",
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Common stopwords to exclude
const stopwords = new Set([
  "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
  "of", "with", "from", "by", "is", "are", "was", "were", "be", "been",
  "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "should",
]);

// Extract and filter keywords
const text = result.content.toLowerCase();
const words = text.split(/\s+/);
const keywordCounts = new Map<string, number>();
for (const word of words) {
  // Strip everything that is not a word character before counting.
  const cleaned = word.replace(/[^\w]/g, "");
  const isCandidate = cleaned.length > 4 && !stopwords.has(cleaned);
  if (isCandidate) {
    const seen = keywordCounts.get(cleaned) || 0;
    keywordCounts.set(cleaned, seen + 1);
  }
}

// Get top keywords by frequency
const ranked = Array.from(keywordCounts.entries());
ranked.sort((a, b) => b[1] - a[1]);
const topKeywords = ranked.slice(0, 10).map(([word, count]) => `${word} (${count})`);
console.log(`Top keywords: ${topKeywords.join(", ")}`);
```
<!-- snippet:syntax-only --> - Native keyword extraction requires the `keywords` feature which may not be compiled into WASM builds.

View File

@@ -0,0 +1,110 @@
```typescript title="WASM - Extract and Score Keywords"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  outputFormat: "markdown",
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// TF-IDF style keyword extraction (simplified)
interface Keyword {
  term: string;
  frequency: number;
  uniquePositions: number[];
  score: number;
}

const text = result.content.toLowerCase();
const words = text.split(/[\s\n\t]+/);

// Record word positions: cleaned token -> indices into `words` where it occurs.
const tokenMap = new Map<string, number[]>();
for (const [idx, word] of words.entries()) {
  const cleaned = word.replace(/[^\w]/g, "");
  if (cleaned.length <= 3) {
    continue; // ignore tokens of three characters or fewer
  }
  const positions = tokenMap.get(cleaned);
  if (positions) {
    positions.push(idx);
  } else {
    tokenMap.set(cleaned, [idx]);
  }
}

// Calculate keyword scores: frequency * ln(totalWords / frequency).
const keywords: Keyword[] = [];
for (const [term, positions] of tokenMap) {
  const frequency = positions.length;
  keywords.push({
    term,
    frequency,
    uniquePositions: positions,
    score: frequency * Math.log(words.length / frequency),
  });
}

// Sort by score (TF-IDF approximation)
keywords.sort((a, b) => b.score - a.score);

// Top 15 keywords
const topKeywords = keywords.slice(0, 15);
console.log("Top Keywords:");
for (const kw of topKeywords) {
  console.log(` ${kw.term}: ${kw.frequency} occurrences (score: ${kw.score.toFixed(2)})`);
}
```
```typescript title="WASM - Keyword Context Window"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

/** A keyword together with the text windows in which it was found. */
interface KeywordContext {
  keyword: string;
  contexts: string[];
}

// Markdown output plus chunking for the context-window example.
const config = {
  outputFormat: "markdown",
  chunking: {
    maxChars: 1000,
    chunkOverlap: 200,
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
/**
 * Finds every case-insensitive occurrence of `keyword` in `text` and returns
 * each one together with up to `contextWindow` characters on either side.
 *
 * The keyword is matched literally: regular-expression metacharacters in it
 * (for example "c++" or "a.b") are escaped before the pattern is built.
 *
 * @param text - Text to search.
 * @param keyword - Literal keyword to look for; an empty keyword yields no contexts.
 * @param contextWindow - Characters of context kept before and after each match.
 * @returns One substring per occurrence, in order of appearance.
 */
function extractKeywordContexts(
  text: string,
  keyword: string,
  contextWindow: number = 50,
): string[] {
  const contexts: string[] = [];
  // An empty pattern matches at every position without advancing lastIndex,
  // which would make the exec loop below spin forever.
  if (keyword.length === 0) {
    return contexts;
  }

  // Escape metacharacters so the keyword is treated as plain text.
  const literal = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(literal, "gi");

  let match: RegExpExecArray | null;
  while ((match = pattern.exec(text)) !== null) {
    const start = Math.max(0, match.index - contextWindow);
    // Use the matched length rather than the keyword length so the window
    // is measured from the text that was actually matched.
    const end = Math.min(text.length, match.index + match[0].length + contextWindow);
    contexts.push(text.substring(start, end));
  }
  return contexts;
}
// Extract context for top keywords
const topKeywords = ["document", "analysis", "results"];
const keywordContexts: KeywordContext[] = [];
for (const kw of topKeywords) {
  keywordContexts.push({
    keyword: kw,
    contexts: extractKeywordContexts(result.content, kw, 40),
  });
}

// Report the hit count and show the first two context windows per keyword.
for (const kc of keywordContexts) {
  console.log(`\n"${kc.keyword}" appears ${kc.contexts.length} times:`);
  const preview = kc.contexts.slice(0, 2);
  for (const [idx, ctx] of preview.entries()) {
    console.log(` [${idx + 1}] ...${ctx}...`);
  }
}
```
<!-- snippet:syntax-only --> - Keyword extraction without native YAKE/RAKE requires manual text processing.

View File

@@ -0,0 +1,84 @@
```typescript title="WASM - Basic Language Detection"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Language detection enabled with a 0.75 minimum confidence.
const config = {
  languageDetection: {
    enabled: true,
    minConfidence: 0.75,
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

const meta = result.metadata;
console.log(`Primary language: ${meta?.language}`);
console.log(`Language confidence: ${meta?.languageConfidence}`);
console.log(`Detected languages: ${result.detectedLanguages?.join(", ")}`);
```
```typescript title="WASM - Multi-Language Detection"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  languageDetection: {
    enabled: true,
    minConfidence: 0.6,
    detectMultiple: true, // Enable detection of multiple languages
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

const meta = result.metadata;
console.log(`Primary language: ${meta?.language}`);
console.log(`Confidence score: ${meta?.languageConfidence?.toFixed(3)}`);
console.log(`All detected languages: ${result.detectedLanguages?.join(", ")}`);

// Use detected language for downstream processing
const languages = result.detectedLanguages;
if (languages && languages.length > 1) {
  console.log("Document contains multiple languages - enable multilingual NLP processing");
}
```
```typescript title="WASM - Language-Specific Extraction"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  languageDetection: {
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false,
  },
  // Adjust extraction parameters based on detected language
  quality: {
    enableQualityProcessing: true,
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/plain", config);

// Process result based on detected language
const language = result.metadata?.language;
console.log(`Processing document in language: ${language}`);

// Example: Apply language-specific rules
type LanguageRules = { cleanWhitespace: boolean; normalizeText: boolean };
const languageConfig: Record<string, LanguageRules> = {
  en: { cleanWhitespace: true, normalizeText: true },
  zh: { cleanWhitespace: false, normalizeText: true }, // Chinese: preserve whitespace patterns
  ja: { cleanWhitespace: false, normalizeText: false }, // Japanese: preserve as-is
  ar: { cleanWhitespace: true, normalizeText: true }, // Arabic
};

// Languages without an entry get the permissive defaults.
const fallbackRules: LanguageRules = {
  cleanWhitespace: true,
  normalizeText: true,
};
const langConfig = languageConfig[language as string] || fallbackRules;
console.log(`Language config: ${JSON.stringify(langConfig)}`);
```

View File

@@ -0,0 +1,171 @@
```typescript title="WASM - Detect and Process Multilingual Content"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  languageDetection: {
    enabled: true,
    minConfidence: 0.7,
    detectMultiple: true,
  },
  chunking: {
    maxChars: 800,
    chunkOverlap: 200,
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

interface MultilingualChunk {
  index: number;
  text: string;
  language: string;
  confidence: number;
}

// Character-range hints, checked in this order; when several ranges match,
// the last matching entry decides the language.
const scriptHints: Array<[RegExp, string]> = [
  [/[一-鿿]/, "zh"], // Chinese
  [/[぀-ゟ゠-ヿ]/, "ja"], // Japanese
  [/[가-힯]/, "ko"], // Korean
  [/[؀-ۿ]/, "ar"], // Arabic
  [/[Ѐ-ӿ]/, "ru"], // Russian
];

// Detect language for each chunk
const multilingualChunks: MultilingualChunk[] = (result.chunks ?? []).map((chunk, idx) => {
  // Simple language detection based on character ranges
  const text = chunk.content;
  // Start from the document-level detection, falling back to "en" / 0.5.
  let detectedLang = result.metadata?.language || "en";
  const confidence = result.metadata?.languageConfidence || 0.5;
  // Check for specific character patterns
  for (const [pattern, lang] of scriptHints) {
    if (pattern.test(text)) {
      detectedLang = lang;
    }
  }
  return {
    index: idx,
    text: text.substring(0, 50),
    language: detectedLang,
    confidence,
  };
});

// Group chunks by language
const chunksByLanguage = new Map<string, MultilingualChunk[]>();
for (const chunk of multilingualChunks) {
  const group = chunksByLanguage.get(chunk.language);
  if (group) {
    group.push(chunk);
  } else {
    chunksByLanguage.set(chunk.language, [chunk]);
  }
}

console.log("Chunks by detected language:");
for (const [lang, chunks] of chunksByLanguage) {
  console.log(` ${lang}: ${chunks.length} chunks`);
  for (const c of chunks.slice(0, 2)) {
    console.log(` Chunk ${c.index}: "${c.text}..."`);
  }
}
```
```typescript title="WASM - Language-Specific Text Processing"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  languageDetection: {
    enabled: true,
    detectMultiple: true,
  },
  outputFormat: "markdown",
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/plain", config);

// Language-specific text normalization
interface LanguageProcessor {
  normalize: (text: string) => string;
  splitSentences: (text: string) => string[];
}

// Splits on the given terminator class and drops fragments that are empty or
// whitespace-only, so blank gaps between terminators are not counted as sentences.
const splitOn = (terminators: RegExp) => (t: string): string[] =>
  t.split(terminators).filter((s) => s.trim().length > 0);

// Whitespace runs collapse to one space (en/ar) or are removed (zh/ja).
const collapseWhitespace = (t: string): string => t.replace(/\s+/g, " ").trim();
const removeWhitespace = (t: string): string => t.replace(/\s+/g, "").trim(); // CJK: no word spacing

// CJK text normally ends sentences with the ideographic full stop (U+3002)
// and the fullwidth "!" / "?" (U+FF01 / U+FF1F); the ASCII forms are kept too.
const cjkTerminators = /[\u3002\uFF01\uFF1F!?]+/;

const processors: Record<string, LanguageProcessor> = {
  en: {
    normalize: collapseWhitespace,
    splitSentences: splitOn(/[.!?]+/),
  },
  zh: {
    normalize: removeWhitespace,
    splitSentences: splitOn(cjkTerminators),
  },
  ja: {
    normalize: removeWhitespace,
    splitSentences: splitOn(cjkTerminators),
  },
  ar: {
    normalize: collapseWhitespace,
    // ASCII terminators plus the Arabic comma (U+060C) and question mark (U+061F).
    splitSentences: splitOn(/[.!?\u060C\u061F]+/),
  },
};

// Unknown or missing languages fall back to the English processor.
const language = result.metadata?.language || "en";
const processor = processors[language] || processors["en"];
const normalized = processor.normalize(result.content);
const sentences = processor.splitSentences(result.content);

console.log(`Language: ${language}`);
console.log(`Normalized length: ${normalized.length}`);
console.log(`Detected sentences: ${sentences.length}`);
sentences.slice(0, 3).forEach((sent, idx) => {
  console.log(` [${idx + 1}] ${sent.substring(0, 60)}...`);
});
```
```typescript title="WASM - Multilingual Chunking Strategy"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

interface LanguageChunkConfig {
  maxChars: number;
  overlap: number;
}

// Different chunking strategies per language
const chunkConfigByLanguage: Record<string, LanguageChunkConfig> = {
  en: { maxChars: 512, overlap: 128 }, // English: word-based chunking
  zh: { maxChars: 256, overlap: 64 }, // Chinese: smaller chunks due to character density
  ja: { maxChars: 300, overlap: 75 }, // Japanese: medium chunks
  ar: { maxChars: 400, overlap: 100 }, // Arabic: larger chunks for context
  default: { maxChars: 512, overlap: 128 },
};

const bytes = new Uint8Array(buffer);

// Pass 1 - detect language first (no chunking requested).
const languageDetectConfig = {
  languageDetection: {
    enabled: true,
    minConfidence: 0.8,
  },
};
const langResult = await extractBytes(bytes, "text/plain", languageDetectConfig);
const detectedLang = langResult.metadata?.language || "en";

// Pass 2 - re-extract with language-specific chunking; languages without an
// entry use the "default" strategy.
const chunkConfig = chunkConfigByLanguage[detectedLang] || chunkConfigByLanguage["default"];
const { maxChars, overlap } = chunkConfig;
const finalConfig = {
  languageDetection: {
    enabled: true,
  },
  chunking: {
    maxChars,
    chunkOverlap: overlap,
  },
};
const finalResult = await extractBytes(bytes, "text/plain", finalConfig);

console.log(`Language: ${detectedLang}`);
console.log(`Chunking strategy: maxChars=${chunkConfig.maxChars}, overlap=${chunkConfig.overlap}`);
console.log(`Generated ${finalResult.chunks?.length} chunks`);
```

View File

@@ -0,0 +1,37 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts a list of files in batches of five, so that at most five
 * extractions are in flight at once.
 *
 * After each batch the function invokes a global `gc` function if the host
 * exposes one, then pauses for 100 ms before starting the next batch.
 *
 * @param files - Files to extract; each file's `type` is passed as its MIME type.
 * @returns Extraction results in the same order as `files`.
 */
async function processLargeDocumentSet(files: File[]) {
  await initWasm();

  const BATCH_SIZE = 5;
  const results: Awaited<ReturnType<typeof extractBytes>>[] = [];

  for (let i = 0; i < files.length; i += BATCH_SIZE) {
    const batch = files.slice(i, i + BATCH_SIZE);
    console.log(`Processing batch ${Math.floor(i / BATCH_SIZE) + 1}`);

    const batchResults = await Promise.all(
      batch.map(async (file) => {
        const arrayBuffer = await file.arrayBuffer();
        const bytes = new Uint8Array(arrayBuffer);
        return extractBytes(bytes, file.type);
      }),
    );
    results.push(...batchResults);

    // Look `gc` up on globalThis: the Node-only `global` identifier does not
    // exist in browsers and would throw a ReferenceError there.
    const collect: unknown = Reflect.get(globalThis, "gc");
    if (typeof collect === "function") {
      console.log("Running garbage collection");
      collect();
    }

    await new Promise((resolve) => setTimeout(resolve, 100));
  }

  return results;
}
// Demo invocation with an empty file list.
const batchRun = processLargeDocumentSet([]);
batchRun
  .then(() => console.log("Done"))
  .catch(console.error);

View File

@@ -0,0 +1,43 @@
import { detectRuntime, extractBytes, hasWorkers, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts several documents concurrently.
 *
 * Delegates to `extractWithWebWorkers` when running in a browser that reports
 * worker support; otherwise runs all extractions through `Promise.all`.
 *
 * @param documents - Raw bytes of each document.
 * @param mimeTypes - MIME type for the document at the same index.
 * @returns Results in the same order as `documents`.
 */
async function extractInParallel(documents: Uint8Array[], mimeTypes: string[]) {
  await initWasm();

  const _runtime = detectRuntime();
  const workersSupported = hasWorkers();

  if (!isBrowser() || !workersSupported) {
    const pending = documents.map((bytes, index) => extractBytes(bytes, mimeTypes[index]));
    return Promise.all(pending);
  }
  return extractWithWebWorkers(documents, mimeTypes);
}
/**
 * Distributes documents round-robin over a pool of "extraction-worker.js"
 * workers and resolves with the workers' replies in document order.
 *
 * Each worker handles its documents one at a time. A worker has a single
 * `onmessage`/`onerror` slot, so posting a second job before the first has
 * answered would overwrite the first job's handlers and leave its promise
 * unsettled or resolved with the wrong reply. All workers are terminated
 * once every job has settled.
 *
 * @param documents - Raw bytes of each document.
 * @param mimeTypes - MIME type for the document at the same index.
 * @returns Promise of the per-document worker replies; rejects if a worker errors.
 */
function extractWithWebWorkers(documents: Uint8Array[], mimeTypes: string[]) {
  // Never spawn more workers than documents, and always at least one when
  // there is work to do.
  const workerCount =
    documents.length === 0
      ? 0
      : Math.max(1, Math.min(navigator.hardwareConcurrency ?? 2, documents.length));

  const workers: Worker[] = [];
  for (let i = 0; i < workerCount; i++) {
    workers.push(new Worker("extraction-worker.js"));
  }

  // Tail of each worker's job queue; tails never reject, so a failed job
  // does not block the jobs queued behind it.
  const queueTails: Promise<unknown>[] = workers.map(() => Promise.resolve());

  const jobs = documents.map((bytes, index) => {
    const slot = index % workers.length;
    const worker = workers[slot];
    const runJob = () =>
      new Promise((resolve, reject) => {
        // Install handlers before posting so a reply cannot be missed.
        worker.onmessage = (e) => resolve(e.data);
        worker.onerror = reject;
        worker.postMessage({ bytes, mimeType: mimeTypes[index] });
      });
    const job = queueTails[slot].then(runJob);
    queueTails[slot] = job.catch(() => undefined);
    return job;
  });

  // Release the workers after the last queued job on every worker has settled.
  void Promise.all(queueTails).then(() => {
    workers.forEach((w) => w.terminate());
  });

  return Promise.all(jobs);
}
/** Reports whether a global `window` binding exists in the current runtime. */
function isBrowser() {
  const windowMissing = typeof window === "undefined";
  return !windowMissing;
}
// Demo invocation with a single three-byte document.
const parallelInput = [new Uint8Array([1, 2, 3])];
extractInParallel(parallelInput, ["application/pdf"])
  .then((results) => console.log(results))
  .catch(console.error);

View File

@@ -0,0 +1,103 @@
```typescript title="WASM - Enable Quality Processing"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  enableQualityProcessing: true,
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

const qualityScore = result.qualityScore;
console.log(`Quality score: ${qualityScore?.toFixed(3) ?? "N/A"}`);
console.log(`Content: ${result.content.substring(0, 100)}...`);

// Quality score indicates text extraction quality (0.0-1.0)
// Compare against null/undefined explicitly: a truthiness test would treat a
// score of exactly 0 (the worst possible result) as "no score" and stay silent.
if (qualityScore != null && qualityScore < 0.5) {
  console.warn("Low quality extraction detected - consider OCR or alternative processing");
}
```
```typescript title="WASM - Quality Monitoring"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

interface ExtractionQuality {
  contentLength: number;
  qualityScore: number | null;
  assessedAs: string;
}

const config = {
  enableQualityProcessing: true,
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Only a missing score maps to null / "unknown". A score of exactly 0 is a
// real (very low) score, so `??` and an explicit null test are used instead
// of `||` and truthiness, which would discard it.
const score: number | null = result.qualityScore ?? null;

// Buckets: above 0.8 is "high", above 0.5 is "medium", anything else is "low".
let assessedAs: string;
if (score === null) {
  assessedAs = "unknown";
} else if (score > 0.8) {
  assessedAs = "high";
} else if (score > 0.5) {
  assessedAs = "medium";
} else {
  assessedAs = "low";
}

const quality: ExtractionQuality = {
  contentLength: result.content.length,
  qualityScore: score,
  assessedAs,
};

console.log("Extraction Quality Report:");
console.log(` Content size: ${quality.contentLength} bytes`);
console.log(` Quality score: ${quality.qualityScore?.toFixed(3) ?? "N/A"}`);
console.log(` Assessment: ${quality.assessedAs}`);
if (quality.assessedAs === "low") {
  console.log(
    " Recommendation: Review raw text for encoding issues or consider alternative extraction",
  );
}
```
```typescript title="WASM - Quality with OCR Fallback"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
/**
 * Extracts a document with quality processing enabled and labels the outcome.
 *
 * A missing or zero quality score is treated as 0. Scores below 0.5 log a
 * warning and are labelled "degraded-text-extraction"; everything else is
 * labelled "text-extraction". The extracted content is returned in both cases.
 *
 * @param bytes - Raw document bytes.
 * @param mimeType - MIME type passed to `extractBytes`.
 * @returns The content, the score that was used, and the method label.
 */
async function extractWithQualityCheck(
  bytes: Uint8Array,
  mimeType: string,
): Promise<{ content: string; quality: number | null; method: string }> {
  const result = await extractBytes(bytes, mimeType, { enableQualityProcessing: true });
  const qualityScore = result.qualityScore || 0;

  // If quality is low, consider text extraction failed or use OCR
  const degraded = qualityScore < 0.5;
  if (degraded) {
    console.warn("Low quality text extraction - alternative processing recommended");
  }

  return {
    content: result.content,
    quality: qualityScore,
    method: degraded ? "degraded-text-extraction" : "text-extraction",
  };
}
// Run the quality-checked extraction and report method, score and a preview.
const bytes = new Uint8Array(buffer);
const extracted = await extractWithQualityCheck(bytes, "application/pdf");
const { method, quality, content } = extracted;
console.log(`Extraction method: ${method}`);
console.log(`Quality score: ${quality?.toFixed(3)}`);
console.log(`Content preview: ${content.substring(0, 80)}...`);
```

View File

@@ -0,0 +1,161 @@
```typescript title="WASM - Assess Text Quality"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
/** Structural statistics computed from extracted text. */
interface TextQualityMetrics {
  contentLength: number;
  lineCount: number;
  averageLineLength: number;
  emptyLineRatio: number;
  specialCharRatio: number;
  estimatedLanguages: string[];
}

/**
 * Computes simple structural metrics for a piece of extracted text.
 *
 * - Lines are split on single line breaks (LF or CRLF), so blank lines are
 *   kept and counted; a line is "empty" when it contains only whitespace.
 * - A "special" character is anything other than a Unicode letter or number,
 *   underscore, whitespace, or one of `. , : ; ! ?`.
 * - Languages are guessed from character ranges only (Latin letters, CJK
 *   ideographs, kana, Hangul).
 *
 * @param content - Text to analyse.
 * @returns The metrics; every numeric field is 0 for empty content.
 */
function assessTextQuality(content: string): TextQualityMetrics {
  const totalChars = content.length;
  // Empty content has no lines; this also keeps the ratios below off 0/0.
  const lines = totalChars === 0 ? [] : content.split(/\r?\n/);
  const nonEmptyLines = lines.filter((l) => l.trim().length > 0);

  // Unicode-aware classes so non-ASCII letters and digits are not counted
  // as special characters.
  const specialChars = (content.match(/[^\p{L}\p{N}_\s.,:;!?]/gu) ?? []).length;

  // Simple language detection by character patterns
  const detectedLangs: string[] = [];
  if (/[a-zA-Z]/.test(content)) detectedLangs.push("en");
  if (/[\u4E00-\u9FFF]/.test(content)) detectedLangs.push("zh");
  if (/[\u3040-\u309F\u30A0-\u30FF]/.test(content)) detectedLangs.push("ja");
  if (/[\uAC00-\uD7AF]/.test(content)) detectedLangs.push("ko");

  const nonEmptyChars = nonEmptyLines.reduce((sum, l) => sum + l.length, 0);

  return {
    contentLength: totalChars,
    lineCount: lines.length,
    averageLineLength: nonEmptyLines.length > 0 ? nonEmptyChars / nonEmptyLines.length : 0,
    emptyLineRatio: lines.length > 0 ? (lines.length - nonEmptyLines.length) / lines.length : 0,
    specialCharRatio: totalChars > 0 ? specialChars / totalChars : 0,
    estimatedLanguages: detectedLangs,
  };
}
const config = {
  enableQualityProcessing: true,
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Print the locally computed metrics next to the library's own quality score.
const metrics = assessTextQuality(result.content);
const emptyLinePercent = (metrics.emptyLineRatio * 100).toFixed(1);
const specialCharPercent = (metrics.specialCharRatio * 100).toFixed(2);

console.log("Text Quality Assessment:");
console.log(` Length: ${metrics.contentLength} characters`);
console.log(
  ` Lines: ${metrics.lineCount} total, avg ${metrics.averageLineLength.toFixed(1)} chars/line`,
);
console.log(` Empty lines: ${emptyLinePercent}%`);
console.log(` Special chars: ${specialCharPercent}%`);
console.log(` Languages: ${metrics.estimatedLanguages.join(", ") || "unknown"}`);
console.log(` Kreuzberg quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
```
```typescript title="WASM - Quality-Based Content Filtering"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  enableQualityProcessing: true,
  chunking: {
    maxChars: 1000,
    chunkOverlap: 200,
    trim: true,
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

interface QualityFilteredChunk {
  index: number;
  content: string;
  quality: number;
  kept: boolean;
}

// Filter chunks based on quality heuristics
const qualityThreshold = 0.3; // Min ratio of visible/non-whitespace content
const filteredChunks: QualityFilteredChunk[] = (result.chunks ?? []).map((chunk, idx) => {
  const body = chunk.content;
  const nonWhitespaceRatio = body.replace(/\s/g, "").length / body.length;
  // Small bonuses for chunks that contain digits or sentence punctuation.
  const numberBonus = /\d/.test(body) ? 0.2 : 0;
  const punctuationBonus = /[.!?,;:]/g.test(body) ? 0.1 : 0;
  // Quality score based on content characteristics
  const contentQuality = (nonWhitespaceRatio + numberBonus + punctuationBonus) / 2;
  return {
    index: idx,
    content: body.substring(0, 50),
    quality: contentQuality,
    kept: contentQuality >= qualityThreshold,
  };
});

const keptChunks = filteredChunks.filter((c) => c.kept);
console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`);
for (const c of keptChunks.slice(0, 3)) {
  console.log(` Chunk ${c.index}: quality=${c.quality.toFixed(2)}, "${c.content}..."`);
}
```
```typescript title="WASM - Content Encoding Validation"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Extract the PDF with quality processing enabled.
const config = { enableQualityProcessing: true };
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
// Validate text encoding quality
interface EncodingValidation {
  hasInvalidChars: boolean;
  invalidCharCount: number;
  replacementRatio: number;
  estimatedEncoding: string;
}

/**
 * Checks extracted text for signs of decoding damage.
 *
 * Counts occurrences of the Unicode replacement character (U+FFFD) and
 * reports them as a ratio of the text length.
 *
 * @param content - Extracted text to inspect.
 * @returns Replacement-character statistics plus a coarse encoding guess:
 *   "UTF-8" when any character is outside 7-bit ASCII, otherwise "ASCII".
 */
function validateEncoding(content: string): EncodingValidation {
  // Check for replacement characters (U+FFFD). The character is written as an
  // escape so the pattern survives any re-encoding of this source file.
  const replacementChars = (content.match(/\uFFFD/g) ?? []).length;
  const hasInvalidChars = replacementChars > 0;
  // hasInvalidChars implies non-empty content, so the division is safe.
  const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0;

  // Guess encoding based on content patterns
  const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII";

  return {
    hasInvalidChars,
    invalidCharCount: replacementChars,
    replacementRatio,
    estimatedEncoding,
  };
}
// Validate the extracted text and print the findings.
const validation = validateEncoding(result.content);
const replacementPercent = (validation.replacementRatio * 100).toFixed(4);
const statusLabel = validation.hasInvalidChars ? "DEGRADED - encoding issues detected" : "OK";

console.log("Content Encoding Validation:");
console.log(` Estimated encoding: ${validation.estimatedEncoding}`);
console.log(` Invalid characters: ${validation.invalidCharCount}`);
console.log(` Replacement ratio: ${replacementPercent}%`);
console.log(
  ` Status: ${statusLabel}`,
);
console.log(` Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
```

View File

@@ -0,0 +1,39 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Downloads a document through the response body stream, logging the running
 * byte total, then extracts the fully assembled buffer.
 *
 * @param url - Location of the document.
 * @param mimeType - MIME type passed to `extractBytes`; defaults to
 *   "application/pdf", which was previously hard-coded.
 * @returns The extraction result.
 * @throws Error on a non-OK HTTP status or when the response has no body.
 */
async function extractStreamingDocument(url: string, mimeType: string = "application/pdf") {
  await initWasm();

  const response = await fetch(url);
  if (!response.ok) throw new Error(`HTTP ${response.status}`);

  const reader = response.body?.getReader();
  if (!reader) throw new Error("No response body");

  // Collect the streamed pieces and track the total size as they arrive.
  const chunks: Uint8Array[] = [];
  let totalSize = 0;
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
    totalSize += value.length;
    console.log(`Received ${totalSize} bytes`);
  }

  // Copy the pieces into one contiguous buffer for extraction.
  const fullBuffer = new Uint8Array(totalSize);
  let offset = 0;
  for (const chunk of chunks) {
    fullBuffer.set(chunk, offset);
    offset += chunk.length;
  }

  console.log("Document fully received, extracting...");
  return extractBytes(fullBuffer, mimeType);
}
// Demo invocation against a placeholder URL.
const streamingRun = extractStreamingDocument("https://example.com/document.pdf");
streamingRun
  .then((r) => console.log(r))
  .catch(console.error);

View File

@@ -0,0 +1,75 @@
```typescript title="WASM - Token Reduction Configuration"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Token reduction modes:
// - "aggressive": maximum reduction
// - "balanced": moderate reduction
// - "conservative": minimal reduction
const config = {
  tokenReduction: {
    mode: "aggressive",
    preserveImportantWords: true,
  },
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

const extractedText = result.content;
console.log(`Original content: ${extractedText.length} characters`);
console.log(`Preview: ${extractedText.substring(0, 100)}...`);
```
```typescript title="WASM - Token Reduction Modes"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
/** Size comparison for one token-reduction mode. */
interface TokenReductionResult {
  mode: string;
  originalSize: number;
  reducedSize: number;
  reductionRatio: number;
  preview: string;
}

/**
 * Extracts the same PDF once per token-reduction mode and reports how much
 * shorter each result is than a baseline extraction.
 *
 * Both sizes are character counts, so the ratio compares like with like
 * (the earlier version divided a word count by a character count of the
 * same text).
 *
 * NOTE(review): assumes an extraction with an empty config returns the
 * unreduced text and that `tokenReduction` affects `result.content` -
 * confirm against the library.
 *
 * @param bytes - Raw PDF bytes.
 * @returns One entry per mode, in the order conservative, balanced, aggressive.
 */
async function compareTokenReductionModes(bytes: Uint8Array): Promise<TokenReductionResult[]> {
  const modes = ["conservative", "balanced", "aggressive"];
  const results: TokenReductionResult[] = [];

  // Baseline: the same document without any token-reduction settings.
  const baseline = await extractBytes(bytes, "application/pdf", {});
  const originalSize = baseline.content.length;

  for (const mode of modes) {
    const config = {
      tokenReduction: {
        mode,
        preserveImportantWords: true,
      },
    };
    const result = await extractBytes(bytes, "application/pdf", config);
    const reducedSize = result.content.length;
    results.push({
      mode,
      originalSize,
      reducedSize,
      // Guard against an empty baseline to avoid dividing by zero.
      reductionRatio: originalSize > 0 ? 1 - reducedSize / originalSize : 0,
      preview: result.content.substring(0, 80),
    });
  }
  return results;
}
const bytes = new Uint8Array(buffer);
const modeComparison = await compareTokenReductionModes(bytes);
console.log("Token Reduction Mode Comparison:");
modeComparison.forEach((r) => {
console.log(` ${r.mode}:`);
console.log(` Original: ${r.originalSize} chars`);
console.log(` Reduction: ${(r.reductionRatio * 100).toFixed(1)}%`);
});
```

View File

@@ -0,0 +1,149 @@
```typescript title="WASM - Token Counting and Cost Estimation"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const config = {
tokenReduction: {
mode: "balanced",
preserveImportantWords: true,
},
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
// Rough token estimate: treat every 4 characters as one token, rounding up.
function estimateTokenCount(text: string): number {
  const charsPerToken = 4;
  const characters = text.length;
  return Math.ceil(characters / charsPerToken);
}
// LLM pricing (example: GPT-4 Turbo)
interface PricingEstimate {
tokenCount: number;
inputCost: number;
outputCostEstimate: number;
totalEstimate: number;
}
// Example per-token prices: $0.01/1K tokens for input, $0.03/1K tokens for output.
const inputPricePerToken = 0.00001;
const outputPricePerToken = 0.00003;
const tokenCount = estimateTokenCount(result.content);
// Each partial cost is computed once and reused for the total.
const inputCost = tokenCount * inputPricePerToken;
// Assume output is ~50% of input
const outputCostEstimate = tokenCount * outputPricePerToken * 0.5;
const costEstimate: PricingEstimate = {
  tokenCount,
  inputCost,
  outputCostEstimate,
  totalEstimate: inputCost + outputCostEstimate,
};
console.log("Token and Cost Analysis:");
console.log(` Estimated tokens: ${costEstimate.tokenCount}`);
console.log(` Input cost: $${costEstimate.inputCost.toFixed(6)}`);
console.log(` Output cost (est.): $${costEstimate.outputCostEstimate.toFixed(6)}`);
console.log(` Total cost (est.): $${costEstimate.totalEstimate.toFixed(6)}`);
```
```typescript title="WASM - Token Reduction for Context Windows"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
/** Outcome of checking extracted text against an LLM context window. */
interface ContextWindowFit {
  /** Length of the content in characters. */
  contentLength: number;
  /** Estimated token count (1 token per 4 characters, rounded up). */
  estimatedTokens: number;
  /** True when the estimate is less than or equal to the window size. */
  fitsInWindow: boolean;
  /** Estimated tokens divided by the window size (1 = exactly full). */
  utilization: number;
}

/**
 * Estimates whether `content` fits into a context window of the given size.
 *
 * @param content - Text to measure.
 * @param contextWindowSize - Window size in tokens (default 4096).
 * @returns Size, token estimate, fit flag and utilization fraction.
 */
function checkContextWindowFit(
  content: string,
  contextWindowSize: number = 4096,
): ContextWindowFit {
  const estimatedTokens = Math.ceil(content.length / 4);
  // Content that exactly fills the window still fits, hence <= rather than <.
  const fitsInWindow = estimatedTokens <= contextWindowSize;
  // Avoid 0/0 = NaN for a non-positive window: empty content uses 0% of it,
  // anything else overflows it.
  let utilization: number;
  if (contextWindowSize > 0) {
    utilization = estimatedTokens / contextWindowSize;
  } else {
    utilization = estimatedTokens > 0 ? Infinity : 0;
  }
  return {
    contentLength: content.length,
    estimatedTokens,
    fitsInWindow,
    utilization,
  };
}
const config = {
tokenReduction: {
mode: "aggressive", // Use aggressive mode for large documents
preserveImportantWords: true,
},
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
const contextFit = checkContextWindowFit(result.content, 4096);
console.log("Context Window Analysis:");
console.log(` Content: ${contextFit.contentLength} characters`);
console.log(` Tokens (est.): ${contextFit.estimatedTokens}`);
console.log(` Fits in 4K context: ${contextFit.fitsInWindow ? "YES" : "NO"}`);
console.log(` Utilization: ${(contextFit.utilization * 100).toFixed(1)}%`);
if (!contextFit.fitsInWindow) {
console.log(" Note: Consider chunking or more aggressive token reduction");
}
```
```typescript title="WASM - Selective Token Preservation"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const config = {
tokenReduction: {
mode: "balanced",
preserveImportantWords: true,
},
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
// Extract important terms manually
/** A frequently occurring long word and its heuristic importance score. */
interface ImportantTerm {
  term: string;
  frequency: number;
  importance: number;
}

/**
 * Finds the most prominent long words in `content`.
 *
 * Words are lower-cased, stripped of punctuation, and counted when they are
 * longer than 5 characters. Terms occurring at least `threshold` times are
 * scored as `ln(frequency) * (length / 10)` and the 20 highest-scoring terms
 * are returned, best first.
 *
 * @param content - Text to analyse.
 * @param threshold - Minimum number of occurrences for a term (default 3).
 * @returns Up to 20 terms sorted by descending importance.
 */
function extractImportantTerms(content: string, threshold: number = 3): ImportantTerm[] {
  const frequencyMap = new Map<string, number>();
  for (const word of content.toLowerCase().split(/\s+/)) {
    // Unicode-aware cleanup: keep letters, combining marks, digits and "_".
    // The ASCII-only \w class used before dropped characters such as "ü" or
    // "é" from the middle of words and corrupted non-English terms.
    const cleaned = word.replace(/[^\p{L}\p{M}\p{N}_]/gu, "");
    if (cleaned.length > 5) {
      // Only consider longer words
      frequencyMap.set(cleaned, (frequencyMap.get(cleaned) ?? 0) + 1);
    }
  }
  return Array.from(frequencyMap.entries())
    .filter(([, freq]) => freq >= threshold)
    .map(([term, freq]) => ({
      term,
      frequency: freq,
      importance: Math.log(freq) * (term.length / 10),
    }))
    .sort((a, b) => b.importance - a.importance)
    .slice(0, 20);
}
const importantTerms = extractImportantTerms(result.content);
console.log("Important Terms (likely preserved by token reduction):");
importantTerms.forEach((t) => {
console.log(` "${t.term}": ${t.frequency} occurrences (importance: ${t.importance.toFixed(2)})`);
});
```

View File

@@ -0,0 +1,57 @@
/**
 * Fixed-size pool of Web Workers that run extraction jobs.
 *
 * Each call to {@link ExtractionWorker.extract} queues a task. A task is
 * handed to an idle worker via `postMessage({ bytes, mimeType })` and stays
 * associated with that worker until the worker answers (`onmessage`) or fails
 * (`onerror`); the matching promise is then settled and the worker becomes
 * idle again.
 *
 * The previous version removed a task from the queue when dispatching it and
 * then looked in the queue again when the worker replied, so replies either
 * settled the wrong task or no task at all. It also posted new work to
 * workers that were still busy.
 */
class ExtractionWorker {
  private workerPool: Worker[] = [];
  /** Workers that currently have no task assigned. */
  private idleWorkers: Worker[] = [];
  /** Settlement callbacks of the task each busy worker is processing. */
  private inFlight = new Map<
    Worker,
    { resolve: (value: unknown) => void; reject: (error: unknown) => void }
  >();
  /** Tasks waiting for a free worker, oldest first. */
  private taskQueue: Array<{
    bytes: Uint8Array;
    mimeType: string;
    resolve: (value: unknown) => void;
    reject: (error: unknown) => void;
  }> = [];

  /**
   * @param workerCount - Number of workers to start. Defaults to
   *   `navigator.hardwareConcurrency` when available, otherwise 2. Values
   *   below 1 are raised to 1 so queued tasks can always make progress.
   */
  constructor(workerCount?: number) {
    const requested =
      workerCount ?? (typeof navigator !== "undefined" ? (navigator.hardwareConcurrency ?? 2) : 2);
    const count = Math.max(1, requested);
    for (let i = 0; i < count; i++) {
      const worker = new Worker("extraction-worker.js");
      worker.onmessage = (e) => this.handleWorkerMessage(worker, e.data);
      worker.onerror = (e) => this.handleWorkerError(worker, e);
      this.workerPool.push(worker);
      this.idleWorkers.push(worker);
    }
  }

  /**
   * Queues an extraction job.
   *
   * @param bytes - Document bytes sent to the worker.
   * @param mimeType - MIME type sent alongside the bytes.
   * @returns Promise resolved with the worker's reply, or rejected with the
   *   worker's error event (or an Error if the pool is terminated first).
   */
  extract(bytes: Uint8Array, mimeType: string): Promise<unknown> {
    return new Promise((resolve, reject) => {
      this.taskQueue.push({ bytes, mimeType, resolve, reject });
      this.processTasks();
    });
  }

  /** Hands queued tasks to idle workers until either runs out. */
  private processTasks() {
    while (this.taskQueue.length > 0 && this.idleWorkers.length > 0) {
      const worker = this.idleWorkers.shift();
      const task = this.taskQueue.shift();
      if (!worker || !task) return;
      // Remember whose promise this worker is fulfilling before posting.
      this.inFlight.set(worker, { resolve: task.resolve, reject: task.reject });
      worker.postMessage({ bytes: task.bytes, mimeType: task.mimeType });
    }
  }

  /** Detaches the worker's current task and marks the worker idle again. */
  private release(worker: Worker) {
    const pending = this.inFlight.get(worker);
    if (pending) {
      this.inFlight.delete(worker);
      this.idleWorkers.push(worker);
    }
    return pending;
  }

  private handleWorkerMessage(worker: Worker, result: unknown) {
    this.release(worker)?.resolve(result);
    this.processTasks();
  }

  private handleWorkerError(worker: Worker, error: unknown) {
    this.release(worker)?.reject(error);
    this.processTasks();
  }

  /**
   * Stops all workers. Tasks that are still queued or in flight are rejected
   * so their promises do not stay pending forever.
   */
  terminate() {
    this.workerPool.forEach((w) => w.terminate());
    const reason = new Error("ExtractionWorker terminated");
    for (const pending of this.inFlight.values()) pending.reject(reason);
    for (const task of this.taskQueue) task.reject(reason);
    this.inFlight.clear();
    this.taskQueue = [];
    this.idleWorkers = [];
    this.workerPool = [];
  }
}

View File

@@ -0,0 +1,20 @@
```typescript title="WASM"
// WASM has no batch helper; await extractBytes for each input (in parallel via Promise.all).
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const urls = ["document1.pdf", "document2.pdf"];
const results = await Promise.all(
  urls.map(async (url) => {
    const resp = await fetch(url);
    // fetch() only rejects on network failure; without this check the body of
    // a 404/500 error page would be handed to the PDF extractor.
    if (!resp.ok) {
      throw new Error(`Failed to fetch ${url}: HTTP ${resp.status}`);
    }
    const bytes = new Uint8Array(await resp.arrayBuffer());
    return extractBytes(bytes, "application/pdf", undefined);
  }),
);
results.forEach((result, i) => {
  console.log(`Document ${i + 1}: ${result.content.length} characters`);
});
```

View File

@@ -0,0 +1,20 @@
```typescript title="WASM"
// WASM has no batch helper; await extractBytes for each file (in parallel via Promise.all).
import init, { extractBytes } from "kreuzberg-wasm";
await init();
// Reads the selection of the element with id "files"; the cast assumes that element exists and is an <input>.
const input = document.getElementById("files") as HTMLInputElement;
// `input.files` can be null, so fall back to an empty list.
const files = Array.from(input.files ?? []);
const results = await Promise.all(
files.map(async (file) => {
const bytes = new Uint8Array(await file.arrayBuffer());
// Use the browser-reported MIME type, defaulting to PDF when it is empty.
return extractBytes(bytes, file.type || "application/pdf", undefined);
}),
);
// Promise.all keeps input order, so index i corresponds to files[i].
results.forEach((result, i) => {
console.log(`File ${i + 1}: ${result.content.length} characters`);
});
```

View File

@@ -0,0 +1,28 @@
```typescript title="WASM"
// HTTP client approach for chunking text via the REST API.
// Useful in browsers when chunking should run on the server instead of in WASM.
const text = "This is a long document that needs to be split into semantic chunks.";
const response = await fetch("http://localhost:8000/chunk", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    text,
    chunker_type: "text",
    config: {
      chunking: {
        strategy: "semantic",
        max_chunk_size: 512,
        overlap: 50,
      },
    },
  }),
});
// fetch() resolves for 4xx/5xx responses too; fail loudly instead of trying
// to read chunks out of an error payload.
if (!response.ok) {
  throw new Error(`Chunk request failed: HTTP ${response.status}`);
}
const result = await response.json();
console.log(`Created ${result.chunks?.length ?? 0} chunks`);
result.chunks?.forEach((chunk) => {
  console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
});
```

View File

@@ -0,0 +1,22 @@
```typescript title="WASM"
// HTTP client approach for file extraction via multipart upload
// Use this when uploading files from a browser form
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  const formData = new FormData();
  formData.append("file", file);
  formData.append("mime_type", file.type || "application/octet-stream");
  const response = await fetch("http://localhost:8000/extract/file", {
    method: "POST",
    body: formData,
  });
  // fetch() resolves for 4xx/5xx responses too; without this check an error
  // payload would be read as an extraction result and fail on `.content`.
  if (!response.ok) {
    throw new Error(`Extraction request failed: HTTP ${response.status}`);
  }
  const result = await response.json();
  console.log(`Extracted ${result.content.length} characters`);
  console.log(`Title: ${result.metadata?.title ?? "Unknown"}`);
}
```

View File

@@ -0,0 +1,69 @@
```typescript title="WASM"
// Example: extract the first file selected in the "file" input using a config that sets every option group.
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const bytes = new Uint8Array(await file.arrayBuffer());
// Build a comprehensive extraction config
// NOTE(review): the keys below are snake_case, while other WASM snippets in this
// documentation set use camelCase (e.g. `maxChars`, `includeDocumentStructure`) —
// confirm which spelling the WASM binding accepts.
const config = {
use_cache: true,
enable_quality_processing: true,
output_format: "markdown",
include_document_structure: true,
// Chunking configuration
chunking: {
strategy: "semantic",
max_chunk_size: 1024,
overlap: 100,
},
// Image extraction configuration
images: {
extract_images: true,
extract_base64: false,
extract_raw_bytes: false,
},
// OCR configuration
ocr: {
backend: "tesseract",
languages: ["eng"],
enabled: true,
},
// HTML-specific extraction options
// NOTE(review): this is a plain string, unlike the other option groups, which are objects — confirm the expected shape.
html_options: "article, main, .content",
// PDF-specific options
pdf_options: {
ocr_strategy: "auto",
preserve_images: true,
},
// Security limits
security_limits: {
max_archive_size: 524288000,
max_file_count: 10000,
max_compression_ratio: 100,
},
};
try {
// Fall back to a generic binary MIME type when the browser reports none.
const result = await extractBytes(bytes, file.type || "application/octet-stream", config);
console.log(`Content: ${result.content.substring(0, 100)}...`);
console.log(`Language: ${result.metadata?.language ?? "Unknown"}`);
console.log(`Chunks: ${result.chunks?.length ?? 0}`);
// Only report images when at least one was returned.
if (result.images?.length) {
console.log(`Images: ${result.images.length}`);
}
} catch (err) {
console.error("Extraction failed:", err);
}
}
```

View File

@@ -0,0 +1,19 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
try {
const bytes = new Uint8Array(await file.arrayBuffer());
const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
console.log(`Extracted: ${result.content.length} characters`);
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
console.error("Extraction failed:", message);
}
}
```

View File

@@ -0,0 +1,40 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
// Note: WASM has no native batch API; use Promise.all with per-item error handling
const files = document.getElementById("files") as HTMLInputElement;
const fileList = files.files ?? [];
// Extract multiple files concurrently (simulated batch).
// Each promise settles to a tagged outcome instead of rejecting, so one bad
// file cannot fail the whole Promise.all.
const extractionPromises = Array.from(fileList).map(async (file) => {
  try {
    const bytes = new Uint8Array(await file.arrayBuffer());
    const result = await extractBytes(bytes, file.type || "application/octet-stream", undefined);
    // `as const` keeps `success` as the literal `true` so the two outcome
    // shapes form a discriminated union and `item.result` narrows below.
    return { file: file.name, success: true as const, result };
  } catch (err) {
    return {
      file: file.name,
      success: false as const,
      error: err instanceof Error ? err.message : String(err),
    };
  }
});
const results = await Promise.all(extractionPromises);
// Process results with per-item error handling
results.forEach((item) => {
  if (item.success) {
    console.log(`✓ ${item.file}: ${item.result.content.length} characters`);
  } else {
    console.error(`✗ ${item.file}: ${item.error}`);
  }
});
// Summary
const succeeded = results.filter((r) => r.success).length;
const failed = results.length - succeeded;
console.log(`Extracted ${succeeded}/${results.length} files (${failed} errors)`);
```

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const response = await fetch("document.pdf");
const data = new Uint8Array(await response.arrayBuffer());
try {
  const result = await extractBytes(data, "application/pdf", undefined);
  console.log(`Success: ${result.content.length} characters`);
} catch (error) {
  // Anything can be thrown in JavaScript; report non-Error values as well
  // instead of silently dropping them.
  const message = error instanceof Error ? error.message : String(error);
  console.error("Extraction error:", message);
}
```

View File

@@ -0,0 +1,12 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const response = await fetch("document.pdf");
const data = new Uint8Array(await response.arrayBuffer());
const result = await extractBytes(data, "application/pdf", undefined);
console.log(`Extracted: ${result.content.length} characters`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
```

View File

@@ -0,0 +1,12 @@
```typescript title="WASM"
// WASM exposes only async extractBytes; await it from any async context.
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const response = await fetch("document.pdf");
const data = new Uint8Array(await response.arrayBuffer());
const result = await extractBytes(data, "application/pdf", undefined);
console.log(result.content);
```

View File

@@ -0,0 +1,14 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const bytes = new Uint8Array(await file.arrayBuffer());
const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
console.log(`Content length: ${result.content.length} characters`);
console.log(`Tables: ${result.tables?.length ?? 0}`);
}
```

View File

@@ -0,0 +1,15 @@
```typescript title="WASM"
// WASM exposes only async extraction. Read the file as bytes and call extractBytes.
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const bytes = new Uint8Array(await file.arrayBuffer());
const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
console.log(result.content);
console.log(`Tables: ${result.tables?.length ?? 0}`);
}
```

View File

@@ -0,0 +1,21 @@
```typescript title="WASM"
// Example: extract the selected file and print every table found in it.
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const bytes = new Uint8Array(await file.arrayBuffer());
const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
// `tables` may be absent, hence the optional chaining.
result.tables?.forEach((table) => {
console.log(`Table with ${table.cells?.length ?? 0} rows`);
// Print the Markdown rendering when one is present.
if (table.markdown) {
console.log(table.markdown);
}
// Then print each row of cells joined with " | ".
table.cells?.forEach((row) => console.log(row.join(" | ")));
});
}
```

30
docs/snippets/wasm/cache/ocr-cache.ts vendored Normal file
View File

@@ -0,0 +1,30 @@
import { extractBytes, initWasm, TesseractWasmBackend } from "@kreuzberg/wasm";
/**
 * Runs OCR on two images in a row and times each run with console.time,
 * after initialising WASM and a TesseractWasmBackend instance.
 */
async function demonstrateOcrCaching() {
  // Downloads an image, then times a single OCR extraction of it under `label`.
  const timeOcr = async (label: string, url: string): Promise<void> => {
    const response = await fetch(url);
    const pixels = new Uint8Array(await response.arrayBuffer());
    console.time(label);
    await extractBytes(pixels, "image/png", {
      ocr: { backend: "tesseract-wasm", language: "eng" },
    });
    console.timeEnd(label);
  };

  await initWasm();
  const backend = new TesseractWasmBackend();
  await backend.initialize();
  console.log("Tesseract WASM backend loaded - models cached");

  await timeOcr("First OCR (with model load)", "page1.png");
  console.log("Model cached in memory");
  await timeOcr("Second OCR (model cached)", "page2.png");
}
demonstrateOcrCaching().catch(console.error);

View File

@@ -0,0 +1,57 @@
import type { ExtractionResult } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * In-memory cache of extraction results keyed by the SHA-256 hash of the
 * file contents, so identical files are only extracted once.
 */
class ExtractionCache {
  /** Extraction results keyed by hex-encoded content hash. */
  private cache = new Map<string, ExtractionResult>();
  /**
   * Memoised hash per File object. A WeakMap is used so that remembering a
   * hash does not keep the File alive after the caller drops it.
   */
  private fileHashes = new WeakMap<File, string>();

  /**
   * Returns the hex-encoded SHA-256 hash of the file contents, computing it
   * at most once per File object.
   */
  async getHash(file: File): Promise<string> {
    const known = this.fileHashes.get(file);
    if (known !== undefined) {
      return known;
    }
    return this.hashBuffer(file, await file.arrayBuffer());
  }

  /**
   * Returns the cached result for the file contents, or extracts the file
   * with `extractBytes(bytes, file.type)` and caches the result.
   */
  async extract(file: File): Promise<ExtractionResult> {
    // Read the file once and reuse the buffer for hashing and extraction.
    let buffer: ArrayBuffer | undefined;
    let hash = this.fileHashes.get(file);
    if (hash === undefined) {
      buffer = await file.arrayBuffer();
      hash = await this.hashBuffer(file, buffer);
    }

    const cached = this.cache.get(hash);
    if (cached !== undefined) {
      console.log("Cache hit for", file.name);
      return cached;
    }

    console.log("Cache miss for", file.name);
    const bytes = new Uint8Array(buffer ?? (await file.arrayBuffer()));
    const result = await extractBytes(bytes, file.type);
    this.cache.set(hash, result);
    return result;
  }

  /** Drops all cached results and memoised hashes. */
  clear() {
    this.cache.clear();
    // WeakMap has no clear(); replace it instead.
    this.fileHashes = new WeakMap<File, string>();
  }

  /** Number of cached extraction results. */
  getSize() {
    return this.cache.size;
  }

  /** Hashes `buffer` with SHA-256, memoises the hex digest for `file`, and returns it. */
  private async hashBuffer(file: File, buffer: ArrayBuffer): Promise<string> {
    const digest = await crypto.subtle.digest("SHA-256", buffer);
    const hex = Array.from(new Uint8Array(digest), (b) => b.toString(16).padStart(2, "0")).join("");
    this.fileHashes.set(file, hex);
    return hex;
  }
}
async function demonstrateCaching() {
await initWasm();
const cache = new ExtractionCache();
const _result = await cache.extract(new File([], "test.pdf"));
console.log("Cache size:", cache.getSize());
}
demonstrateCaching().catch(console.error);

View File

@@ -0,0 +1,43 @@
import type { ExtractionResult } from "@kreuzberg/wasm";
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts a file, using sessionStorage as a best-effort result cache.
 *
 * The cache key combines the file name, size and last-modified timestamp so
 * that an edited file with the same name and size is not served stale data.
 * A cache entry that cannot be parsed is discarded and the file is extracted
 * again. Failure to store the result never fails the extraction.
 *
 * @param file - File to extract.
 * @returns The cached or freshly extracted result.
 */
async function _cacheResultInSessionStorage(file: File): Promise<ExtractionResult> {
  await initWasm();
  const cacheKey = `extraction_${file.name}_${file.size}_${file.lastModified}`;
  const cached = sessionStorage.getItem(cacheKey);
  if (cached !== null) {
    try {
      const parsed = JSON.parse(cached) as ExtractionResult;
      console.log("Loading from session storage");
      return parsed;
    } catch {
      // Corrupt or truncated entry: drop it and fall through to extraction.
      sessionStorage.removeItem(cacheKey);
    }
  }
  console.log("Extracting and caching result");
  const result = await extractFromFile(file);
  try {
    sessionStorage.setItem(cacheKey, JSON.stringify(result));
  } catch (error) {
    if (error instanceof Error && error.name === "QuotaExceededError") {
      console.warn("Session storage full, skipping cache");
    } else {
      // Caching is optional, but do not hide unexpected failures.
      console.warn("Could not cache extraction result:", error);
    }
  }
  return result;
}
/**
 * Removes every sessionStorage entry whose key starts with "extraction_"
 * and logs how many entries were removed.
 */
async function clearExtractionCache() {
  const staleKeys = Object.keys(sessionStorage).filter((key) => key.startsWith("extraction_"));
  staleKeys.forEach((key) => sessionStorage.removeItem(key));
  console.log(`Cleared ${staleKeys.length} cached results`);
}
clearExtractionCache();

View File

@@ -0,0 +1,35 @@
```typescript title="WASM"
// Example: extract the selected file with OCR, chunking and language detection configured.
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
await initWasm();
const config = {
ocr: {
backend: "tesseract-wasm",
language: "eng",
},
chunking: {
maxChars: 1000,
chunkOverlap: 100,
},
// NOTE(review): these two keys are snake_case while the rest of this config is camelCase —
// confirm which spelling the library reads.
enable_language_detection: true,
enable_quality: true,
};
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
const result = await extractFromFile(file, file.type, config);
// Print the first 100 characters of every chunk, when chunks were returned.
if (result.chunks) {
for (const chunk of result.chunks) {
console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
}
}
if (result.detectedLanguages) {
console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
}
}
```

View File

@@ -0,0 +1,29 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Downloads "document.pdf" and extracts it with OCR, image and chunking
 * options set, then logs the length of the extracted content.
 */
async function extractWithConfig() {
  await initWasm();

  const response = await fetch("document.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    ocr: { backend: "tesseract-wasm", language: "eng" },
    images: { extractImages: true, targetDpi: 200 },
    chunking: { maxChars: 1000, chunkOverlap: 100 },
  };

  const { content } = await extractBytes(bytes, "application/pdf", config);
  console.log("Extraction complete");
  console.log("Content length:", content.length);
}
extractWithConfig().catch(console.error);

View File

@@ -0,0 +1,66 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Downloads "book.pdf", extracts it with character-based chunking, and logs
 * the chunk count plus the character range and a preview of the first three
 * chunks.
 *
 * The unused `configWithHeadings` object that used to be declared here was
 * removed; the same configuration is exercised by
 * `extractWithPrependHeadingContext`.
 */
async function extractWithChunking() {
  await initWasm();
  const bytes = new Uint8Array(await fetch("book.pdf").then((r) => r.arrayBuffer()));
  const config: ExtractionConfig = {
    chunking: {
      maxChars: 800,
      chunkOverlap: 150,
      splitOnNewlines: true,
      splitOnSentences: true,
    },
  };
  const result = await extractBytes(bytes, "application/pdf", config);
  if (result.chunks) {
    console.log(`Total chunks: ${result.chunks.length}`);
    // Only preview the first three chunks to keep the output short.
    result.chunks.slice(0, 3).forEach((chunk, i) => {
      console.log(`\nChunk ${i}:`);
      console.log(`Chars: ${chunk.metadata.charStart}-${chunk.metadata.charEnd}`);
      console.log(`Content: ${chunk.content.substring(0, 100)}...`);
    });
  }
}
/**
 * Downloads "document.md" and chunks it with the markdown chunker and
 * `prependHeadingContext` enabled, then previews the first three chunks.
 */
async function extractWithPrependHeadingContext() {
  await initWasm();

  const response = await fetch("document.md");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    chunking: { chunkerType: "markdown", maxChars: 800, prependHeadingContext: true },
  };

  const { chunks } = await extractBytes(bytes, "text/markdown", config);
  if (!chunks) {
    return;
  }

  console.log(`Total chunks: ${chunks.length}`);
  for (const [i, chunk] of chunks.slice(0, 3).entries()) {
    // Each chunk's content is prefixed with its heading breadcrumb
    console.log(`\nChunk ${i}: ${chunk.content.substring(0, 100)}...`);
  }
}
extractWithChunking().catch(console.error);
extractWithPrependHeadingContext().catch(console.error);

View File

@@ -0,0 +1,71 @@
```typescript title="WASM"
// Example: character-based chunking with 1000-character chunks and 100 characters of overlap.
import { initWasm, extractBytes } from "@kreuzberg/wasm";
await initWasm();
const config = {
chunking: {
maxChars: 1000,
chunkOverlap: 100,
},
};
// NOTE(review): `buffer` is not defined in this snippet — presumably an ArrayBuffer supplied by the caller.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
result.chunks?.forEach((chunk, idx) => {
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
// NOTE(review): `token_count` is snake_case while other chunk metadata fields in these snippets
// are camelCase (e.g. `headingContext`) — confirm the field name.
console.log(`Tokens: ${chunk.metadata?.token_count}`);
});
```
```typescript title="WASM - Markdown with Heading Context"
import { initWasm, extractBytes } from "@kreuzberg/wasm";
await initWasm();
const config = {
chunking: {
chunkerType: "markdown",
maxChars: 2000,
// Note: Token-based sizing is not available in WASM builds.
// Use character-based sizing instead.
},
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);
result.chunks?.forEach((chunk, idx) => {
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
if (chunk.metadata?.headingContext?.headings) {
console.log("Headings:");
chunk.metadata.headingContext.headings.forEach((h) => {
console.log(` Level ${h.level}: ${h.text}`);
});
}
});
```
```typescript title="WASM - Prepend Heading Context"
import { initWasm, extractBytes } from "@kreuzberg/wasm";
await initWasm();
const config = {
chunking: {
chunkerType: "markdown",
maxChars: 2000,
prependHeadingContext: true,
},
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);
result.chunks?.forEach((chunk, idx) => {
// Each chunk's content is prefixed with its heading breadcrumb
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 80)}...`);
});
```

View File

@@ -0,0 +1,34 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Downloads "complex.pdf", extracts it with OCR (German), chunking and image
 * extraction configured, and logs a summary of the result.
 *
 * Every optional part of the result is read defensively so the summary never
 * throws and never prints the literal text "undefined".
 */
async function extractWithFullConfig() {
  await initWasm();
  const bytes = new Uint8Array(await fetch("complex.pdf").then((r) => r.arrayBuffer()));
  const config: ExtractionConfig = {
    ocr: {
      backend: "tesseract-wasm",
      language: "deu",
    },
    chunking: {
      maxChars: 1000,
      chunkOverlap: 200,
    },
    images: {
      extractImages: true,
      targetDpi: 200,
    },
  };
  const result = await extractBytes(bytes, "application/pdf", config);
  console.log("=== Extraction Results ===");
  console.log(`Content: ${result.content.length} chars`);
  console.log(`Chunks: ${result.chunks?.length ?? 0}`);
  console.log(`Images: ${result.images?.length ?? 0}`);
  // Guarded like chunks and images, matching the other snippets that read tables.
  console.log(`Tables: ${result.tables?.length ?? 0}`);
  console.log(`Languages: ${result.detectedLanguages?.join(", ") ?? "none detected"}`);
}
extractWithFullConfig().catch(console.error);

View File

@@ -0,0 +1,34 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Builds an extraction config from a file size and extracts "document.pdf".
 *
 * - Files larger than 10 MiB are chunked (500 chars, 50 overlap).
 * - Files smaller than 1 MiB get image extraction at 300 DPI.
 * - OCR is enabled only for files smaller than 50 MiB.
 *
 * Note that `fileSize` is supplied by the caller and only drives the config;
 * it is not compared with the size of the document that is fetched.
 *
 * @param fileSize - Size in bytes used to choose the configuration.
 * @returns The extraction result.
 */
async function extractWithConditionalConfig(fileSize: number) {
  await initWasm();
  const config: ExtractionConfig = {};
  if (fileSize > 10 * 1024 * 1024) {
    config.chunking = {
      maxChars: 500,
      chunkOverlap: 50,
    };
  }
  if (fileSize < 1 * 1024 * 1024) {
    config.images = {
      extractImages: true,
      targetDpi: 300,
    };
  }
  config.ocr = {
    enabled: fileSize < 50 * 1024 * 1024,
  };
  const bytes = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
  const result = await extractBytes(bytes, "application/pdf", config);
  return result;
}
// Handle rejection like the other examples do, so a failed fetch or
// extraction is reported instead of becoming an unhandled rejection.
extractWithConditionalConfig(5 * 1024 * 1024)
  .then((_r) => console.log("Done"))
  .catch(console.error);

View File

@@ -0,0 +1,19 @@
```typescript title="WASM"
import { initWasm, extractBytes } from "@kreuzberg/wasm";
await initWasm();
const config = {
ocr: {
backend: "tesseract-wasm",
language: "eng",
},
images: {
extractImages: true,
},
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
console.log(result.content);
```

View File

@@ -0,0 +1,18 @@
```typescript title="WASM"
import { initWasm, extractBytes } from "@kreuzberg/wasm";
await initWasm();
const config = {
use_cache: true,
enable_quality_processing: true,
ocr: {
backend: "tesseract-wasm",
language: "eng",
},
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
console.log(result.content);
```

View File

@@ -0,0 +1,18 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const data = new Uint8Array(await fetch("scanned.pdf").then((r) => r.arrayBuffer()));
const config = {
ocr: {
backend: "tesseract",
language: "eng",
},
};
const result = await extractBytes(data, "application/pdf", config);
console.log(`Content length: ${result.content.length}`);
console.log(`Tables detected: ${result.tables?.length || 0}`);
```

View File

@@ -0,0 +1,26 @@
```typescript title="WASM"
// Example: extract a fetched PDF with caching, multi-language OCR, chunking and quality processing configured.
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
// NOTE(review): keys here are snake_case (`max_characters`, `overlap`), whereas other WASM snippets
// in this documentation set use `maxChars` / `chunkOverlap` — confirm which names the binding accepts.
const config = {
use_cache: true,
ocr: {
backend: "tesseract",
// Two languages joined with "+".
language: "eng+deu",
tesseract_config: {
psm: 6,
},
},
chunking: {
max_characters: 1000,
overlap: 200,
},
enable_quality_processing: true,
};
const result = await extractBytes(data, "application/pdf", config);
console.log(`Content length: ${result.content.length}`);
```

View File

@@ -0,0 +1,15 @@
```typescript title="Document Structure Config (WASM)"
import init, { extractBytes } from "kreuzberg-wasm";
// The WASM module must be initialised before extractBytes is called.
await init();
const config = {
  includeDocumentStructure: true,
};
// extractBytes is asynchronous; without `await`, `result` would be a Promise
// and `result.document` would always be undefined.
const result = await extractBytes(fileBuffer, "application/pdf", config);
if (result.document) {
  for (const node of result.document.nodes) {
    console.log(`[${node.content.nodeType}]`);
  }
}
```

View File

@@ -0,0 +1,35 @@
```typescript title="Element-Based Output (WASM)"
import init, { extractBytes } from "kreuzberg-wasm";
import type { ExtractionConfig } from "kreuzberg-wasm";
// The WASM build exposes only asynchronous extraction, so initialise the
// module and await extractBytes instead of calling a synchronous helper.
await init();
// Configure element-based output
const config: ExtractionConfig = {
  outputFormat: "element_based",
};
// Extract document
const result = await extractBytes(fileBuffer, "application/pdf", config);
// Treat a missing element list as empty so the loops below never throw.
const elements = result.elements ?? [];
// Access elements
for (const element of elements) {
  console.log(`Type: ${element.elementType}`);
  console.log(`Text: ${element.text.slice(0, 100)}`);
  if (element.metadata.pageNumber) {
    console.log(`Page: ${element.metadata.pageNumber}`);
  }
  if (element.metadata.coordinates) {
    const coords = element.metadata.coordinates;
    console.log(`Coords: (${coords.left}, ${coords.top}) - (${coords.right}, ${coords.bottom})`);
  }
  console.log("---");
}
// Filter by element type
const titles = elements.filter((e) => e.elementType === "title");
for (const title of titles) {
  const level = title.metadata.additional?.level || "unknown";
  console.log(`[${level}] ${title.text}`);
}
```

View File

@@ -0,0 +1,105 @@
import {
type ExtractionConfig,
type HierarchyConfig,
Kreuzberg,
type PdfConfig,
} from "kreuzberg-wasm";
// Example 1: Basic hierarchy extraction
// Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
// Extract bounding box information for spatial layout awareness.
const hierarchyConfigBasic: HierarchyConfig = {
enabled: true,
kClusters: 6, // Default: creates 6 font size clusters (H1-H6 structure)
includeBbox: true, // Include bounding box coordinates
ocrCoverageThreshold: undefined, // No OCR coverage threshold
};
const pdfConfigBasic: PdfConfig = {
hierarchy: hierarchyConfigBasic,
};
const extractionConfigBasic: ExtractionConfig = {
pdfOptions: pdfConfigBasic,
};
// const kreuzberg = new Kreuzberg(extractionConfigBasic);
// const result = await kreuzberg.extractFile("document.pdf");
// Example 2: Custom kClusters for minimal structure
// Use 3 clusters for simpler hierarchy with minimal structure.
// Useful when you only need major section divisions (Main, Subsection, Detail).
const hierarchyConfigMinimal: HierarchyConfig = {
enabled: true,
kClusters: 3, // Minimal clustering: just 3 levels
includeBbox: true,
ocrCoverageThreshold: undefined,
};
const pdfConfigMinimal: PdfConfig = {
hierarchy: hierarchyConfigMinimal,
};
const _extractionConfigMinimal: ExtractionConfig = {
pdfOptions: pdfConfigMinimal,
};
// const result = await kreuzberg.extractFile("document.pdf");
// Example 3: With OCR coverage threshold
// Trigger OCR if less than 50% of text has font data.
// Useful for documents with mixed digital and scanned content.
const hierarchyConfigOcr: HierarchyConfig = {
enabled: true,
kClusters: 6,
includeBbox: true,
ocrCoverageThreshold: 0.5, // Trigger OCR if text coverage < 50%
};
const pdfConfigOcr: PdfConfig = {
hierarchy: hierarchyConfigOcr,
};
const _extractionConfigOcr: ExtractionConfig = {
pdfOptions: pdfConfigOcr,
};
// const result = await kreuzberg.extractFile("document.pdf");
// Integration with Kreuzberg WASM instance
/**
 * Creates a Kreuzberg instance from `extractionConfigBasic`, extracts
 * "document.pdf", and logs either the result or the failure.
 */
async function _extractWithHierarchy(): Promise<void> {
  const kreuzberg = new Kreuzberg(extractionConfigBasic);
  try {
    // Extract from file (requires file input or fetch)
    console.log("Extraction complete:", await kreuzberg.extractFile("document.pdf"));
  } catch (error) {
    console.error("Extraction failed:", error);
  }
}
// Field descriptions:
//
// enabled: boolean (default: true)
// - Enable or disable hierarchy extraction
// - When false, hierarchy structure is not analyzed
//
// kClusters: number (default: 6, valid: 1-7)
// - Number of font size clusters for hierarchy levels
// - 6 provides H1-H6 heading levels with body text
// - Higher values create more fine-grained hierarchy
// - Lower values create simpler structure
//
// includeBbox: boolean (default: true)
// - Include bounding box coordinates in hierarchy blocks
// - Required for spatial layout awareness and document structure
// - Set to false only if space optimization is critical
//
// ocrCoverageThreshold: number | undefined (default: undefined)
// - Range: 0.0 to 1.0
// - Triggers OCR when text block coverage falls below this fraction
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
// - undefined means no OCR coverage-based triggering
//

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module before calling any extraction function.
await init();

// Download the PDF and wrap its bytes in a Uint8Array.
const pdfResponse = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await pdfResponse.arrayBuffer());

// Ask for HTML output using the "github" theme.
const htmlOptions = {
  output_format: "html",
  html_output: { theme: "github" },
};

const extraction = await extractBytes(pdfBytes, "application/pdf", htmlOptions);
console.log(extraction.content); // HTML with kb-* classes
```

View File

@@ -0,0 +1,31 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts the images embedded in "document.pdf" using the image options
 * below and logs a one-line summary for each extracted image.
 */
async function extractImagesWithConfig() {
  await initWasm();
  const bytes = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
  const config: ExtractionConfig = {
    images: {
      extractImages: true,
      targetDpi: 300,
      maxDimension: 2048,
      preserveAspectRatio: true,
    },
  };
  const result = await extractBytes(bytes, "application/pdf", config);
  if (result.images) {
    console.log(`Extracted ${result.images.length} images`);
    result.images.forEach((image) => {
      // The last value is the image description, so it is labelled as such
      // (it was previously printed under a "DPI" label).
      console.log(
        `Image: ${image.width}x${image.height}, Format: ${image.format}, Description: ${image.description ?? "n/a"}`,
      );
    });
  }
}
extractImagesWithConfig().catch(console.error);

View File

@@ -0,0 +1,65 @@
import { type ExtractionConfig, extractFile } from "@kreuzberg/wasm";
/**
 * Example 1: Basic YAKE configuration.
 * Uses the YAKE algorithm with default parameters and English stopword filtering.
 */
async function basicYake(): Promise<void> {
  const yakeDefaults: ExtractionConfig = {
    keywords: {
      algorithm: "yake",
      maxKeywords: 10,
      minScore: 0.0,
      ngramRange: [1, 3],
      language: "en",
      yakeParams: null,
      rakeParams: null,
    },
  };
  const extraction = await extractFile("document.pdf", null, yakeDefaults);
  console.log("Keywords:", extraction.keywords);
}
/**
 * Example 2: Advanced YAKE with custom parameters.
 * Fine-tunes YAKE with a custom window size for co-occurrence analysis.
 */
async function _advancedYake(): Promise<void> {
  const tunedYake: ExtractionConfig = {
    keywords: {
      algorithm: "yake",
      maxKeywords: 15,
      minScore: 0.1,
      ngramRange: [1, 2],
      language: "en",
      yakeParams: { windowSize: 1 },
      rakeParams: null,
    },
  };
  const extraction = await extractFile("document.pdf", null, tunedYake);
  console.log("Keywords:", extraction.keywords);
}
/**
 * Example 3: RAKE configuration.
 * Uses the RAKE algorithm for rapid keyword extraction with phrase constraints.
 */
async function _rakeConfig(): Promise<void> {
  const rakeSettings: ExtractionConfig = {
    keywords: {
      algorithm: "rake",
      maxKeywords: 10,
      minScore: 5.0,
      ngramRange: [1, 3],
      language: "en",
      yakeParams: null,
      rakeParams: { minWordLength: 1, maxWordsPerPhrase: 3 },
    },
  };
  const extraction = await extractFile("document.pdf", null, rakeSettings);
  console.log("Keywords:", extraction.keywords);
}
// Run the basic example; report a rejected extraction instead of leaving the
// promise floating.
basicYake().catch(console.error);

View File

@@ -0,0 +1,20 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const pdfResponse = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await pdfResponse.arrayBuffer());

// Keyword extraction settings passed to extractBytes.
const keywordOptions = {
  keywords: {
    algorithm: "yake",
    max_keywords: 10,
    min_score: 0.1,
    ngram_range: [1, 3],
    language: "en",
  },
};

const extraction = await extractBytes(pdfBytes, "application/pdf", keywordOptions);
const serializedKeywords = JSON.stringify(extraction.keywords);
console.log(`Keywords: ${serializedKeywords}`);
```

View File

@@ -0,0 +1,19 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
// Language detection settings passed to extractBytes.
const config = {
  language_detection: {
    enabled: true,
    min_confidence: 0.8,
    detect_multiple: true,
  },
};
const result = await extractBytes(data, "application/pdf", config);
// NOTE(review): the other kreuzberg-wasm snippets read detection output from
// the `detected_languages` list, so this one does too — confirm that no
// per-language confidence value is exposed on the result.
console.log(`Detected languages: ${result.detected_languages?.join(", ") ?? "unknown"}`);
```

View File

@@ -0,0 +1,26 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Runs OCR extraction on "scanned.pdf" with the "tesseract-wasm" backend and
 * logs the extracted text, plus detected languages when present.
 */
async function extractWithOcr() {
  await initWasm();

  const response = await fetch("scanned.pdf");
  const scannedBytes = new Uint8Array(await response.arrayBuffer());

  const ocrConfig: ExtractionConfig = {
    ocr: { backend: "tesseract-wasm", language: "eng" },
  };

  const extraction = await extractBytes(scannedBytes, "application/pdf", ocrConfig);
  console.log("Extracted text from scanned document:");
  console.log(extraction.content);

  const languages = extraction.detectedLanguages;
  if (languages) {
    console.log("Detected languages:", languages);
  }
}
extractWithOcr().catch(console.error);

View File

@@ -0,0 +1,21 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const pdfResponse = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await pdfResponse.arrayBuffer());

// Image extraction settings passed to extractBytes.
const imageOptions = {
  images: {
    extract_images: true,
    target_dpi: 300,
    max_image_dimension: 4096,
    auto_adjust_dpi: true,
    min_dpi: 150,
    max_dpi: 600,
  },
};

const extraction = await extractBytes(pdfBytes, "application/pdf", imageOptions);
// Falls back to 0 when no image list is present.
const imageCount = extraction.images?.length || 0;
console.log(`Extracted images: ${imageCount}`);
```

View File

@@ -0,0 +1,20 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const pdfResponse = await fetch("encrypted.pdf");
const pdfBytes = new Uint8Array(await pdfResponse.arrayBuffer());

// PDF-specific options, including the passwords to try.
const pdfSettings = {
  pdf_options: {
    extract_images: true,
    passwords: ["password123"],
    extract_metadata: true,
    hierarchy: {},
  },
};

const extraction = await extractBytes(pdfBytes, "application/pdf", pdfSettings);
const documentInfo = extraction.metadata;
console.log(`Title: ${documentInfo?.title}`);
console.log(`Authors: ${documentInfo?.authors}`);
```

View File

@@ -0,0 +1,22 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const pdfResponse = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await pdfResponse.arrayBuffer());

// Hierarchy settings nested under the PDF options.
const hierarchySettings = {
  enabled: true,
  detection_threshold: 0.75,
  ocr_coverage_threshold: 0.8,
  min_level: 1,
  max_level: 5,
};
const extractionOptions = { pdf_options: { hierarchy: hierarchySettings } };

const extraction = await extractBytes(pdfBytes, "application/pdf", extractionOptions);
// Falls back to 0 when the result has no hierarchy list.
const levelCount = extraction.hierarchy?.length || 0;
console.log(`Hierarchy levels: ${levelCount}`);
```

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const pdfResponse = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await pdfResponse.arrayBuffer());

// Post-processing settings passed to extractBytes.
const postprocessOptions = {
  postprocessor: {
    enabled: true,
    enabled_processors: ["whitespace_normalizer", "unicode_normalizer"],
  },
};

const extraction = await extractBytes(pdfBytes, "application/pdf", postprocessOptions);
console.log(`Processed content: ${extraction.content}`);
```

View File

@@ -0,0 +1,16 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const pdfResponse = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await pdfResponse.arrayBuffer());

// Top-level extraction flags.
const qualityOptions = { enable_quality_processing: true, use_cache: true };

const extraction = await extractBytes(pdfBytes, "application/pdf", qualityOptions);
console.log(`Quality score: ${extraction.quality_score}`);
console.log(`Processing time: ${extraction.processing_time}`);
```

View File

@@ -0,0 +1,21 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const pdfResponse = await fetch("scanned.pdf");
const scannedBytes = new Uint8Array(await pdfResponse.arrayBuffer());

// Tesseract-specific settings, nested inside the OCR options.
const tesseractSettings = { psm: 6, oem: 3 };
const ocrOptions = {
  ocr: {
    backend: "tesseract",
    language: "eng+deu",
    tesseract_config: tesseractSettings,
  },
};

const extraction = await extractBytes(scannedBytes, "application/pdf", ocrOptions);
console.log(`OCR text: ${extraction.content}`);
```

View File

@@ -0,0 +1,18 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const pdfResponse = await fetch("document.pdf");
const pdfBytes = new Uint8Array(await pdfResponse.arrayBuffer());

// Token reduction settings passed to extractBytes.
const reductionOptions = {
  token_reduction: {
    mode: "moderate",
    preserve_important_words: true,
  },
};

const extraction = await extractBytes(pdfBytes, "application/pdf", reductionOptions);
console.log(`Original tokens: ${extraction.token_count}`);
console.log(`Reduced content: ${extraction.content}`);
```

View File

@@ -0,0 +1,26 @@
import { extractBytes, getWasmCapabilities, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts several in-memory documents concurrently.
 *
 * @param files - Raw bytes of each document.
 * @param mimeTypes - MIME types by position; `mimeTypes[i]` is used for `files[i]`.
 * @returns One `{ content, pageCount }` summary per input, in input order.
 * @throws Error when fewer MIME types than files are supplied, or when
 *   WebAssembly is reported as unavailable.
 */
async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
  // Each file needs a MIME type; a shorter list would pass `undefined` to
  // extractBytes for the trailing files.
  if (mimeTypes.length < files.length) {
    throw new Error(
      `Expected a MIME type for each of the ${files.length} files but received ${mimeTypes.length}`,
    );
  }
  const caps = getWasmCapabilities();
  if (!caps.hasWasm) {
    throw new Error("WebAssembly not supported");
  }
  await initWasm();
  const results = await Promise.all(
    files.map((bytes, index) => extractBytes(bytes, mimeTypes[index])),
  );
  return results.map((r) => ({
    content: r.content,
    pageCount: r.metadata?.pageCount,
  }));
}
// Example invocation: one placeholder buffer paired with its MIME type.
const mimes = ["application/pdf"];
const fileBytes = [Uint8Array.of(1, 2, 3)];
extractDocuments(fileBytes, mimes)
  .then((summaries) => console.log(summaries))
  .catch(console.error);

View File

@@ -0,0 +1,17 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Fetches "document.pdf", extracts it, and logs the content, MIME type and
 * metadata of the result.
 */
async function main() {
  await initWasm();

  const response = await fetch("document.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());
  const extraction = await extractBytes(bytes, "application/pdf");

  console.log("Extracted content:");
  console.log(extraction.content);
  console.log("MIME type:", extraction.mimeType);
  console.log("Metadata:", extraction.metadata);
}
main().catch(console.error);

View File

@@ -0,0 +1,12 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const pdfBuffer = await fetch("document.pdf").then((res) => res.arrayBuffer());
const pdfBytes = new Uint8Array(pdfBuffer);

// No configuration is supplied (third argument is undefined).
const extraction = await extractBytes(pdfBytes, "application/pdf", undefined);
console.log(extraction.content);
console.log(`MIME Type: ${extraction.mime_type}`);
```

View File

@@ -0,0 +1,33 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/** One document to extract in a batch. */
interface DocumentJob {
  /** Key under which the extracted content is stored in the result map. */
  name: string;
  /** Raw document bytes handed to `extractBytes`. */
  bytes: Uint8Array;
  /** MIME type handed to `extractBytes`. */
  mimeType: string;
}

/**
 * Extracts a batch of documents with a bounded number of concurrent workers.
 *
 * Workers pull jobs from a shared queue until it is empty. A document that
 * fails is logged and left out of the result map; it does not abort the batch.
 *
 * @param documents - Jobs to process.
 * @param concurrency - Requested number of workers (default 3). The value is
 *   floored and clamped to between 1 and `documents.length`.
 * @returns Extracted content keyed by `DocumentJob.name`.
 */
async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
  await initWasm();
  const results: Record<string, string> = {};
  const queue = [...documents];

  // `Array(n)` throws for negative or fractional n, and n === 0 would leave
  // every document unprocessed, so normalise the request first.
  const requested = Number.isNaN(concurrency) ? 1 : Math.floor(concurrency);
  const workerCount = Math.min(Math.max(1, requested), queue.length);

  const workers = Array.from({ length: workerCount }, async () => {
    while (queue.length > 0) {
      const doc = queue.shift();
      if (!doc) break;
      try {
        const result = await extractBytes(doc.bytes, doc.mimeType);
        results[doc.name] = result.content;
      } catch (error) {
        // Best effort: report the failure and keep draining the queue.
        console.error(`Failed to process ${doc.name}:`, error);
      }
    }
  });
  await Promise.all(workers);
  return results;
}

View File

@@ -0,0 +1,14 @@
```typescript title="WASM"
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
// Load the WASM module first.
await initWasm();

// Read every file selected in the "files" input element.
const picker = document.getElementById("files") as HTMLInputElement;
const selectedFiles = Array.from(picker.files || []);

// Extract all selected files concurrently.
const extractions = await Promise.all(selectedFiles.map((file) => extractFromFile(file)));
for (const [position, extraction] of extractions.entries()) {
  console.log(`File ${position + 1}: ${extraction.content.length} characters`);
}
```

View File

@@ -0,0 +1,29 @@
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
/**
 * Initializes WASM and registers a "change" listener on the "file-input"
 * element that extracts the first selected file and displays the result.
 * Extraction failures are logged, not rethrown.
 */
async function setupFileInput() {
  await initWasm();

  const onFileChosen = async (event: Event): Promise<void> => {
    const chosen = (event.target as HTMLInputElement).files?.[0];
    if (!chosen) return;
    try {
      const extraction = await extractFromFile(chosen);
      console.log("Extracted text:", extraction.content);
      displayResults(extraction);
    } catch (failure) {
      console.error("Extraction failed:", failure);
    }
  };

  const picker = document.getElementById("file-input") as HTMLInputElement;
  picker.addEventListener("change", onFileChosen);
}
/**
 * Writes a preview of the extracted content into the "output" element.
 *
 * Content longer than 500 characters is cut to 500 and suffixed with "...";
 * shorter content is shown unchanged. Does nothing when the element is missing.
 *
 * @param result - Any object exposing the extracted text as `content`.
 */
function displayResults(result: { content: string }) {
  const output = document.getElementById("output");
  if (!output) return;

  const previewLength = 500;
  const { content } = result;
  // Only add the ellipsis when something was actually cut off.
  output.textContent =
    content.length > previewLength ? `${content.substring(0, previewLength)}...` : content;
}
// Wire up the file input; a rejected setup is reported on the console.
setupFileInput().catch(console.error);

View File

@@ -0,0 +1,12 @@
```typescript title="WASM"
import { extractBytes, initWasm } from "@kreuzberg/wasm";
// Load the WASM module first.
await initWasm();

// Download the PDF and wrap its bytes in a Uint8Array.
const pdfBuffer = await fetch("document.pdf").then((res) => res.arrayBuffer());
const pdfBytes = new Uint8Array(pdfBuffer);

const extraction = await extractBytes(pdfBytes, "application/pdf");
console.log(extraction.content);
```

View File

@@ -0,0 +1,10 @@
```typescript title="WASM"
import init, { extractFile } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

// Extract by path; MIME type and config are both left undefined.
const extraction = await extractFile("document.pdf", undefined, undefined);

const tableCount = extraction.tables?.length ?? 0;
const formatName = extraction.metadata?.format ?? "unknown";
console.log(`Extracted content: ${extraction.content}`);
console.log(`Tables found: ${tableCount}`);
console.log(`Format: ${formatName}`);
```

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
// Load the WASM module first.
await initWasm();

// Take the first file selected in the "file" input element, if any.
const picker = document.getElementById("file") as HTMLInputElement;
const chosen = picker.files?.[0];
if (chosen) {
  const extraction = await extractFromFile(chosen);
  console.log(`Content length: ${extraction.content.length} characters`);
  console.log(`Tables: ${extraction.tables.length}`);
}
```

View File

@@ -0,0 +1,15 @@
```typescript title="WASM"
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
// Load the WASM module first.
await initWasm();

// Take the first file selected in the "file" input element, if any.
const picker = document.getElementById("file") as HTMLInputElement;
const chosen = picker.files?.[0];
if (chosen) {
  const extraction = await extractFromFile(chosen);
  const serializedMetadata = JSON.stringify(extraction.metadata);
  console.log(extraction.content);
  console.log(`Tables: ${extraction.tables.length}`);
  console.log(`Metadata: ${serializedMetadata}`);
}
```

View File

@@ -0,0 +1,20 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const scannedBuffer = await fetch("scanned.pdf").then((res) => res.arrayBuffer());
const scannedBytes = new Uint8Array(scannedBuffer);

// Force OCR and select the backend and language.
const ocrOptions = {
  force_ocr: true,
  ocr: { backend: "tesseract", language: "eng" },
};

const extraction = await extractBytes(scannedBytes, "application/pdf", ocrOptions);
const languages = extraction.detected_languages?.join(", ") ?? "unknown";
console.log(extraction.content);
console.log(`Detected languages: ${languages}`);
```

View File

@@ -0,0 +1,9 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const pdfHeader = Uint8Array.of(0x25, 0x50, 0x44, 0x46); // PDF magic bytes
const extraction = await extractBytes(pdfHeader, "application/pdf", undefined);
console.log(extraction.content);
```

View File

@@ -0,0 +1,25 @@
import { getVersion, getWasmCapabilities, initWasm, isInitialized } from "@kreuzberg/wasm";
/**
 * Initializes the WASM module when WebAssembly is available and not already
 * initialized, then logs the version and reported capabilities.
 * Initialization errors are logged, not rethrown.
 */
async function initializeKreuzberg() {
  const capabilities = getWasmCapabilities();
  if (!capabilities.hasWasm) {
    console.error("WebAssembly not supported");
    return;
  }

  try {
    const alreadyLoaded = isInitialized();
    if (!alreadyLoaded) {
      await initWasm();
    }
    console.log(`Kreuzberg ${getVersion()} initialized successfully`);
    console.log("Workers available:", capabilities.hasWorkers);
    console.log("SharedArrayBuffer available:", capabilities.hasSharedArrayBuffer);
  } catch (failure) {
    console.error("Initialization failed:", failure);
  }
}
initializeKreuzberg();

View File

@@ -0,0 +1,6 @@
```typescript title="WASM"
import init from "kreuzberg-wasm";
// Call the package's default export and wait for it to finish.
await init();
// Only reached once init() has resolved.
console.log("Kreuzberg WASM loaded successfully");
```

View File

@@ -0,0 +1,26 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();
const response = await fetch("document.pdf");
const data = new Uint8Array(await response.arrayBuffer());
const result = await extractBytes(data, "application/pdf", undefined);
console.log(`Content: ${result.content}`);
console.log(`Success: true`);
console.log(`Content length: ${result.content.length} characters`);
// Tables expose their grid as `cells`; count its top-level entries as rows.
result.tables?.forEach((table, i) => {
  console.log(`Table ${i}: ${Object.keys(table.cells ?? {}).length} rows`);
});
// Chunk text is exposed as `content`, as in the other chunking snippets.
result.chunks?.forEach((chunk, i) => {
  console.log(`Chunk ${i}: ${chunk.content?.length ?? 0} characters`);
});
```

View File

@@ -0,0 +1,32 @@
import {
detectRuntime,
getWasmCapabilities,
initWasm,
isBrowser,
isBun,
isDeno,
isNode,
} from "@kreuzberg/wasm";
/**
 * Logs the detected runtime and its reported capabilities, prints a message
 * for the first matching runtime check, then initializes WASM.
 */
async function setupForRuntime() {
  const runtime = detectRuntime();
  const capabilities = getWasmCapabilities();
  console.log(`Running in ${runtime} environment`);
  console.log(`Workers: ${capabilities.hasWorkers}`);
  console.log(`SharedArrayBuffer: ${capabilities.hasSharedArrayBuffer}`);

  // Checks run in this order and stop at the first one that returns true.
  const runtimeChecks: Array<[() => boolean, string]> = [
    [() => isBrowser(), "Browser features available"],
    [() => isNode(), "Node.js features available"],
    [() => isDeno(), "Deno features available"],
    [() => isBun(), "Bun features available"],
  ];
  const matched = runtimeChecks.find(([check]) => check());
  if (matched) {
    console.log(matched[1]);
  }

  await initWasm();
}
setupForRuntime().catch(console.error);

View File

@@ -0,0 +1,29 @@
```typescript title="WASM"
import init, { extractFile } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

// JSON schema describing the structured output to request.
const paperSchema = {
  type: "object",
  properties: {
    title: { type: "string" },
    authors: { type: "array", items: { type: "string" } },
    date: { type: "string" },
  },
  required: ["title", "authors", "date"],
  additionalProperties: false,
};

const extractionOptions = {
  structuredExtraction: {
    schema: paperSchema,
    llm: { model: "openai/gpt-4o-mini" },
    strict: true,
  },
};

const extraction = await extractFile("paper.pdf", undefined, extractionOptions);
console.log(extraction.structuredOutput);
```
<!-- snippet:syntax-only --> Requires network access to the configured LLM provider and a valid API key in the host environment. The WASM crate accepts `structuredExtraction` configuration; the LLM call is dispatched through liter-llm's `wasm-http` transport.

View File

@@ -0,0 +1,40 @@
```typescript title="WASM"
// The WASM crate has no MCP client. To integrate with an MCP server,
// drive the kreuzberg CLI from a Node.js host that uses kreuzberg-wasm
// for in-process extraction.
import { spawn } from "node:child_process";
import * as readline from "node:readline";

const mcpProcess = spawn("kreuzberg", ["mcp"]);
// Read the child's stdout line by line.
const rl = readline.createInterface({
  input: mcpProcess.stdout,
  output: mcpProcess.stdin,
  terminal: false,
});

// MCP messages are JSON-RPC 2.0, so each request carries the protocol version
// and an id for matching the response.
// NOTE: a complete client performs the `initialize` handshake before sending
// `tools/call`; it is omitted here to keep the snippet short.
const request = {
  jsonrpc: "2.0",
  id: 1,
  method: "tools/call",
  params: {
    name: "extract_file",
    arguments: {
      path: "document.pdf",
      async: true,
    },
  },
};
mcpProcess.stdin.write(`${JSON.stringify(request)}\n`);

rl.on("line", (line) => {
  let response: unknown;
  try {
    response = JSON.parse(line);
  } catch {
    // Skip output lines that are not JSON instead of crashing the host.
    console.error("Ignoring non-JSON output from MCP process:", line);
    return;
  }
  console.log(response);
  mcpProcess.kill();
});
mcpProcess.on("error", (err) => {
  console.error("Failed to start MCP process:", err);
});
```
<!-- snippet:syntax-only --> MCP transport is not exported by the WASM crate; this snippet drives the MCP CLI from the same Node host that loads kreuzberg-wasm.

View File

@@ -0,0 +1,22 @@
```typescript title="WASM"
// MCP server is provided by the kreuzberg CLI (Rust binary). The WASM build
// targets browser/Node.js extraction and does not embed a server process.
// Spawn the CLI from a Node.js host that consumes the WASM module separately.
import { spawn } from "node:child_process";

const mcpProcess = spawn("kreuzberg", ["mcp"]);

// Forward the child's standard output with a prefix.
const relayStdout = (chunk: unknown): void => {
  console.log(`MCP Server: ${chunk}`);
};
// Forward the child's standard error with a prefix.
const relayStderr = (chunk: unknown): void => {
  console.error(`MCP Error: ${chunk}`);
};

mcpProcess.stdout.on("data", relayStdout);
mcpProcess.stderr.on("data", relayStderr);
mcpProcess.on("error", (spawnError) => {
  console.error(`Failed to start MCP server: ${spawnError.message}`);
});
```
<!-- snippet:syntax-only --> The MCP server is a CLI feature; the WASM crate does not export an MCP server entry point. This snippet shows how a Node host that uses kreuzberg-wasm for extraction can also drive the standalone MCP CLI.

View File

@@ -0,0 +1,25 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts "document.pdf", logs a fixed set of metadata fields and returns
 * the metadata object.
 */
async function getDocumentMetadata() {
  await initWasm();

  const response = await fetch("document.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());
  const { metadata } = await extractBytes(bytes, "application/pdf");

  console.log("Document Metadata:");
  // Label/value pairs, printed in this order.
  const fields: Array<[string, unknown]> = [
    ["Title:", metadata.title],
    ["Author:", metadata.author],
    ["Creator:", metadata.creator],
    ["Subject:", metadata.subject],
    ["Keywords:", metadata.keywords],
    ["Pages:", metadata.pageCount],
    ["Created:", metadata.createdAt],
    ["Modified:", metadata.modifiedAt],
  ];
  for (const [label, value] of fields) {
    console.log(label, value);
  }
  return metadata;
}
getDocumentMetadata().catch(console.error);

View File

@@ -0,0 +1,35 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/** Per-document summary produced by `filterAndSummarizeMetadata`. */
interface DocumentSummary {
  fileName: string;
  title: string | undefined;
  author: string | undefined;
  pageCount: number | undefined;
  /** First detected language, or "unknown" when none is reported. */
  language: string;
}

/**
 * Fetches and extracts each file in turn and collects a summary per file.
 *
 * @param files - URLs or paths passed to `fetch`; each is extracted as a PDF.
 * @returns One summary per file, in input order.
 */
async function filterAndSummarizeMetadata(files: string[]): Promise<DocumentSummary[]> {
  await initWasm();

  const collected: DocumentSummary[] = [];
  for (const fileName of files) {
    const response = await fetch(fileName);
    const bytes = new Uint8Array(await response.arrayBuffer());
    const extraction = await extractBytes(bytes, "application/pdf");
    const info = extraction.metadata;
    const summary: DocumentSummary = {
      fileName,
      title: info.title,
      author: info.author,
      pageCount: info.pageCount,
      language: extraction.detectedLanguages?.[0] ?? "unknown",
    };
    collected.push(summary);
  }
  return collected;
}

// Example invocation: print the summaries as a table.
filterAndSummarizeMetadata(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
  .then((rows) => console.table(rows))
  .catch(console.error);

View File

@@ -0,0 +1,34 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts images from "document.pdf" and logs the metadata fields of each
 * one. Logs nothing when the result has no image list.
 */
async function extractImageMetadata() {
  await initWasm();

  const response = await fetch("document.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());
  const imageConfig: ExtractionConfig = {
    images: { extractImages: true, targetDpi: 150 },
  };

  const extraction = await extractBytes(bytes, "application/pdf", imageConfig);
  if (!extraction.images) return;

  for (const [position, picture] of extraction.images.entries()) {
    console.log(`Image ${position}:`, {
      format: picture.format,
      width: picture.width,
      height: picture.height,
      pageNumber: picture.pageNumber,
      colorspace: picture.colorspace,
      bitsPerComponent: picture.bitsPerComponent,
      isMask: picture.isMask,
      dataSize: picture.data.byteLength,
    });
  }
}
extractImageMetadata().catch(console.error);

View File

@@ -0,0 +1,26 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
import { LanguageDetectionConfig, ExtractionConfig } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const mimeType = "text/plain";
const fileBuffer = new Uint8Array(/* your file bytes */);

// Language detection settings built with the config classes.
const detectionSettings = new LanguageDetectionConfig({
  enable_detection: true,
  target_languages: ["en", "de", "fr"],
});
const extractionSettings = new ExtractionConfig({ language_detection: detectionSettings });

const extraction = await extractBytes(fileBuffer, mimeType, extractionSettings);
const detected = extraction.detected_languages;
if (detected) {
  console.log("Detected languages:", detected);
  // Print each detected language on its own line.
  for (const code of detected) {
    console.log(`Language: ${code}`);
  }
}
```

View File

@@ -0,0 +1,35 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
import { LanguageDetectionConfig, ExtractionConfig } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const mimeType = "text/plain";
const fileBuffer = new Uint8Array(/* your file bytes */);

// Language detection settings built with the config classes.
const detectionSettings = new LanguageDetectionConfig({
  enable_detection: true,
  target_languages: ["en", "de", "fr", "es", "it", "ja", "zh"],
  confidence_threshold: 0.5,
});
const extractionSettings = new ExtractionConfig({ language_detection: detectionSettings });

const extraction = await extractBytes(fileBuffer, mimeType, extractionSettings);
const detected = extraction.detected_languages;
if (!detected || detected.length === 0) {
  console.log("No languages detected");
} else {
  console.log("Document languages:", detected.join(", "));
  // Process multi-language content
  for (const code of detected) {
    console.log(`Language detected: ${code}`);
  }
  // Access metadata for language info
  const metadataLanguage = extraction.metadata?.language;
  if (metadataLanguage) {
    console.log(`Primary metadata language: ${metadataLanguage}`);
  }
}
```

View File

@@ -0,0 +1,33 @@
import type { ExtractionConfig } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts "document.pdf" with chunking enabled and logs the document
 * metadata followed by the metadata fields of each chunk.
 */
async function extractWithChunkMetadata() {
  await initWasm();

  const response = await fetch("document.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());
  const chunkingConfig: ExtractionConfig = {
    chunking: { maxChars: 500, chunkOverlap: 50 },
  };

  const extraction = await extractBytes(bytes, "application/pdf", chunkingConfig);
  console.log("Document Metadata:", extraction.metadata);
  if (!extraction.chunks) return;

  for (const piece of extraction.chunks) {
    const info = piece.metadata;
    console.log("Chunk Metadata:", {
      charStart: info.charStart,
      charEnd: info.charEnd,
      index: info.chunkIndex,
      total: info.totalChunks,
      tokens: info.tokenCount,
    });
  }
}
extractWithChunkMetadata().catch(console.error);

View File

@@ -0,0 +1,85 @@
```typescript title="WASM"
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
// Load the WASM module first.
await initWasm();

// Take the first file selected in the "file" input element, if any.
const picker = document.getElementById("file") as HTMLInputElement;
const chosen = picker.files?.[0];
if (chosen) {
  const extraction = await extractFromFile(chosen);
  const metadata = extraction.metadata;
  console.log(`Metadata: ${JSON.stringify(metadata)}`);

  // Common metadata fields
  if (metadata.title) {
    console.log(`Title: ${metadata.title}`);
  }

  // Format-specific metadata: HTML files
  const htmlMeta = metadata.html;
  if (htmlMeta) {
    console.log(`HTML Title: ${htmlMeta.title}`);
    console.log(`Description: ${htmlMeta.description}`);

    // Keywords are an array
    if (htmlMeta.keywords?.length > 0) {
      console.log(`Keywords: ${htmlMeta.keywords.join(", ")}`);
    }

    // Canonical URL
    if (htmlMeta.canonical_url) {
      console.log(`Canonical URL: ${htmlMeta.canonical_url}`);
    }

    // Open Graph fields
    const openGraph = htmlMeta.open_graph;
    if (openGraph) {
      if (openGraph["title"]) {
        console.log(`OG Title: ${openGraph["title"]}`);
      }
      if (openGraph["image"]) {
        console.log(`OG Image: ${openGraph["image"]}`);
      }
    }

    // Twitter Card fields
    if (htmlMeta.twitter_card?.["card"]) {
      console.log(`Twitter Card Type: ${htmlMeta.twitter_card["card"]}`);
    }

    // Headers
    if (htmlMeta.headers?.length > 0) {
      const headerTexts = htmlMeta.headers.map((h: any) => h.text);
      console.log(`Headers: ${headerTexts.join(", ")}`);
    }

    // Links
    if (htmlMeta.links?.length > 0) {
      htmlMeta.links.forEach((link: any) => {
        console.log(`Link: ${link.href} (${link.text})`);
      });
    }

    // Images
    if (htmlMeta.images?.length > 0) {
      htmlMeta.images.forEach((image: any) => {
        console.log(`Image: ${image.src}`);
      });
    }

    // Structured data
    if (htmlMeta.structured_data?.length > 0) {
      console.log(`Structured data items: ${htmlMeta.structured_data.length}`);
    }
  }

  // PDF-specific fields are at the top level of metadata
  if (metadata.pageCount) {
    console.log(`Pages: ${metadata.pageCount}`);
  }
  if (metadata.authors?.length > 0) {
    console.log(`Authors: ${metadata.authors.join(", ")}`);
  }
}
```

View File

@@ -0,0 +1,36 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
import { PageConfig, ExtractionConfig } from "kreuzberg-wasm";
// Load the WASM module first.
await init();
const fileBuffer = new Uint8Array(/* your file bytes */);
const mimeType = "application/pdf";
const config = new ExtractionConfig({
  pages: new PageConfig({
    extract_pages: true,
  }),
});
const result = await extractBytes(fileBuffer, mimeType, config);
if (result.metadata && result.metadata.pages) {
  const pageStructure = result.metadata.pages;
  console.log(`Total pages: ${pageStructure.total_count}`);
  if (pageStructure.boundaries) {
    // Boundaries are byte offsets, but JS string indices count UTF-16 code
    // units, so slice the encoded bytes rather than the string.
    // NOTE(review): assumes the offsets refer to the UTF-8 encoding of
    // `content` — confirm.
    const contentBytes = new TextEncoder().encode(result.content);
    const decoder = new TextDecoder();
    // Iterate through page boundaries to map content to pages
    pageStructure.boundaries.forEach((boundary) => {
      // Preview at most the first 100 bytes of the page.
      const previewEnd = Math.min(boundary.byte_end, boundary.byte_start + 100);
      const pageText = decoder.decode(contentBytes.subarray(boundary.byte_start, previewEnd));
      console.log(`Page ${boundary.page_number}:`);
      console.log(` Byte range: ${boundary.byte_start}-${boundary.byte_end}`);
      console.log(` Preview: ${pageText}...`);
    });
  }
}
```

View File

@@ -0,0 +1,38 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
import { PageConfig, ExtractionConfig } from "kreuzberg-wasm";
// Load the WASM module first.
await init();

const mimeType = "application/pdf";
const fileBuffer = new Uint8Array(/* your file bytes */);

// Enable per-page extraction.
const pageSettings = new PageConfig({ extract_pages: true });
const extractionSettings = new ExtractionConfig({ pages: pageSettings });

const extraction = await extractBytes(fileBuffer, mimeType, extractionSettings);
const extractedPages = extraction.pages;
if (extractedPages) {
  console.log(`Total pages extracted: ${extractedPages.length}`);
  for (const page of extractedPages) {
    console.log(`Page ${page.pageNumber}:`);
    console.log(` Content length: ${page.content.length} chars`);
    console.log(` Tables: ${page.tables.length}`);
    console.log(` Images: ${page.images.length}`);
    // Check if page is blank
    if (page.isBlank) {
      console.log(" This page is blank");
    }
    // Access page hierarchy if available
    if (page.hierarchy) {
      console.log(` Hierarchy level: ${page.hierarchy}`);
    }
  }
}
```

View File

@@ -0,0 +1,43 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
import { ExtractionConfig } from "kreuzberg-wasm";
// Load the WASM module first.
await init();
const fileBuffer = new Uint8Array(/* your file bytes */);
const mimeType = "application/pdf";
const config = new ExtractionConfig({});
const result = await extractBytes(fileBuffer, mimeType, config);
if (result.tables && result.tables.length > 0) {
  console.log(`Found ${result.tables.length} tables`);
  result.tables.forEach((table, index) => {
    console.log(`\nTable ${index + 1}:`);
    console.log(` Page: ${table.pageNumber}`);
    console.log(` Markdown representation:`);
    console.log(table.markdown);
    // Access cell data
    const cells = table.cells;
    if (cells) {
      console.log(` Total cells: ${Object.keys(cells).length}`);
      // Iterate through cells (structure depends on how cells are serialized)
      for (const rowKey of Object.keys(cells)) {
        const row = cells[rowKey];
        console.log(` Row ${rowKey}: ${JSON.stringify(row)}`);
      }
    }
    // Access bounding box if available
    if (table.boundingBox) {
      // Serialize explicitly: interpolating an object directly would print
      // "[object Object]".
      console.log(` Bounding box: ${JSON.stringify(table.boundingBox)}`);
    }
  });
} else {
  console.log("No tables found in document");
}
```

View File

@@ -0,0 +1,22 @@
<!-- snippet:syntax-only -->
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();
// Cloud OCR backends are not supported in WASM directly.
// WASM runs in a sandboxed environment without direct network access.
// To use cloud OCR services, implement a wrapper on your server
// or use a cloud platform with built-in OCR integration.
// NOTE: this object is only declared for illustration; the snippet never
// passes it to extractBytes.
const cloudOcrConfig = {
ocr: {
backend: "custom", // Custom backends must be registered via native runtime
language: "eng",
},
};
// This example shows the configuration structure.
// In production, route cloud OCR requests through your backend service.
```

View File

@@ -0,0 +1,27 @@
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Enables OCR, then extracts text from "scanned-page.png" with the
 * "tesseract-wasm" backend. Returns early, after logging, when OCR cannot
 * be enabled.
 */
async function extractWithOcr() {
  await initWasm();

  try {
    await enableOcr();
    console.log("OCR enabled successfully");
  } catch (failure) {
    console.error("Failed to enable OCR:", failure);
    return;
  }

  const response = await fetch("scanned-page.png");
  const imageBytes = new Uint8Array(await response.arrayBuffer());
  const ocrOptions = {
    ocr: { backend: "tesseract-wasm", language: "eng" },
  };

  const extraction = await extractBytes(imageBytes, "image/png", ocrOptions);
  console.log("Extracted text:");
  console.log(extraction.content);
}
extractWithOcr().catch(console.error);

View File

@@ -0,0 +1,15 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();
// Wrap the downloaded ArrayBuffer in a Uint8Array, as every other
// extractBytes call in these snippets does.
const pdfBuffer = await fetch("document.pdf").then((res) => res.arrayBuffer());
const pdfBytes = new Uint8Array(pdfBuffer);
const result = await extractBytes(pdfBytes, "application/pdf", {
  images: {
    extract_images: true,
  },
});
console.log(result.images);
```

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module first.
await init();
// Wrap the downloaded ArrayBuffer in a Uint8Array, as every other
// extractBytes call in these snippets does.
const documentBuffer = await fetch("document.pdf").then((res) => res.arrayBuffer());
const documentData = new Uint8Array(documentBuffer);
const result = await extractBytes(documentData, "application/pdf", {
  images: {
    extract_images: true,
    target_dpi: 300,
    max_image_dimension: 2000,
  },
});
console.log(result.content);
```

View File

@@ -0,0 +1,29 @@
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Runs OCR over a fixed list of PNG images, one language per image, and logs
 * the extracted text of each. Images are processed one after another.
 */
async function extractMultilingualDocument() {
  await initWasm();
  await enableOcr();

  // Image file name paired with the OCR language to use for it.
  const samples = [
    { name: "english.png", lang: "eng" },
    { name: "german.png", lang: "deu" },
    { name: "spanish.png", lang: "spa" },
  ];

  for (const { name, lang } of samples) {
    const response = await fetch(name);
    const imageBytes = new Uint8Array(await response.arrayBuffer());
    const ocrOptions = {
      ocr: { backend: "tesseract-wasm", language: lang },
    };
    const extraction = await extractBytes(imageBytes, "image/png", ocrOptions);
    console.log(`${name} (${lang}):`);
    console.log(extraction.content);
    console.log("---");
  }
}
extractMultilingualDocument().catch(console.error);

View File

@@ -0,0 +1,30 @@
import {
initWasm,
listOcrBackends,
registerOcrBackend,
TesseractWasmBackend,
unregisterOcrBackend,
} from "@kreuzberg/wasm";
/**
 * Registers a `TesseractWasmBackend`, lists the registered OCR backends,
 * unregisters "tesseract-wasm" and lists them again.
 */
async function manageOcrBackends() {
  await initWasm();

  const tesseract = new TesseractWasmBackend();
  await tesseract.initialize();
  registerOcrBackend(tesseract);

  const registered = listOcrBackends();
  console.log("Available OCR backends:", registered);
  const hasTesseract = registered.includes("tesseract-wasm");
  if (hasTesseract) {
    console.log("Tesseract WASM backend is registered");
  }

  unregisterOcrBackend("tesseract-wasm");
  console.log("Backends after unregister:", listOcrBackends());
}
manageOcrBackends().catch(console.error);

View File

@@ -0,0 +1,37 @@
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts content from `document.png`, handling failure at each stage:
 *
 * 1. WASM initialization failure is logged and aborts the example.
 * 2. An `enableOcr` error whose message mentions "browser" is treated as
 *    "OCR unavailable here" and downgraded to a warning; any other error is
 *    rethrown.
 * 3. Extraction failure is logged.
 *
 * OCR is only requested in the extraction config when `enableOcr` succeeded,
 * so the "proceeding without OCR" path does not ask for an OCR backend.
 */
async function extractWithErrorHandling() {
  try {
    await initWasm();
  } catch (error) {
    console.error("Failed to initialize WASM:", error);
    return;
  }

  // Tracks whether OCR was actually enabled, so the extraction step below
  // matches what the warning tells the user.
  let ocrEnabled = false;
  try {
    await enableOcr();
    ocrEnabled = true;
  } catch (error) {
    if (error instanceof Error && error.message.includes("browser")) {
      console.warn("OCR not available in this environment, proceeding without OCR");
    } else {
      throw error;
    }
  }

  try {
    const response = await fetch("document.png");
    const bytes = new Uint8Array(await response.arrayBuffer());

    // Config literals are kept inline so they are checked against the
    // binding's config type.
    // NOTE(review): assumes an empty config object is accepted — confirm.
    const result = ocrEnabled
      ? await extractBytes(bytes, "image/png", {
          ocr: {
            backend: "tesseract-wasm",
            language: "eng",
          },
        })
      : await extractBytes(bytes, "image/png", {});

    console.log("Extraction successful:", result.content.length, "chars");
  } catch (error) {
    console.error("Extraction failed:", error);
  }
}

// Run the example; errors rethrown above are logged here.
extractWithErrorHandling().catch(console.error);

View File

@@ -0,0 +1,20 @@
<!-- snippet:syntax-only -->
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// The EasyOCR backend is not supported in WASM: it requires PyTorch and a
// Python runtime, neither of which is available in browser/WASM.
// Use the Tesseract-WASM backend instead, or route requests through a backend
// service.
//
// The object below only illustrates the configuration structure; it is never
// passed to an extraction call.
const easyOcrConfig = {
  ocr: { backend: "easyocr", language: "en" }, // "easyocr" is not supported in WASM
};
```

View File

@@ -0,0 +1,24 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module before calling any extraction function.
await init();

// Fetch the scanned PDF and wrap the raw ArrayBuffer in a Uint8Array: the
// other extractBytes examples in this docs set pass a Uint8Array.
const response = await fetch("scanned.pdf");
const documentData = new Uint8Array(await response.arrayBuffer());

// Request OCR with element output switched on.
// NOTE(review): option keys here are snake_case while other WASM snippets use
// camelCase (e.g. `maxChars`) — confirm the casing the binding expects.
const result = await extractBytes(documentData, "application/pdf", {
  ocr: {
    backend: "tesseract",
    language: "eng",
    element_config: {
      include_elements: true,
    },
  },
});

// ocr_elements may be absent on the result, so guard before iterating.
if (result.ocr_elements) {
  for (const element of result.ocr_elements) {
    console.log("Text:", element.text);
    console.log("Confidence:", element.confidence);
  }
}
```

View File

@@ -0,0 +1,34 @@
```typescript title="WASM (Browser)"
import { enableOcr, extractFromFile, initWasm } from "@kreuzberg/wasm";
// Initialize the WASM module, then enable OCR support.
await initWasm();
await enableOcr();

// Look up the file input. The instanceof check covers both a missing element
// (getElementById returns null) and an element of a different type, so no
// unchecked cast is needed and `.files` is never read off null.
const fileInput = document.getElementById("file");
const file = fileInput instanceof HTMLInputElement ? fileInput.files?.[0] : undefined;

// Nothing to do when there is no input element or no file has been selected.
if (file) {
  // The browser-reported MIME type of the file is passed through as-is.
  const result = await extractFromFile(file, file.type, {
    ocr: {
      backend: "kreuzberg-tesseract",
      language: "eng",
    },
  });
  console.log(result.content);
}
```
```typescript title="WASM (Node.js / Deno / Bun)"
import { enableOcr, extractFile, initWasm } from "@kreuzberg/wasm";
// Initialize the WASM module, then enable OCR support.
await initWasm();
await enableOcr(); // Uses native kreuzberg-tesseract backend

// Path of the scanned image on disk and its MIME type.
const imagePath = "./scanned_document.png";
const mimeType = "image/png";

// Only the extracted content is needed from the result.
const { content } = await extractFile(imagePath, mimeType, {
  ocr: {
    backend: "kreuzberg-tesseract",
    language: "eng",
  },
});
console.log(content);
```

View File

@@ -0,0 +1,17 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module before calling any extraction function.
await init();

// Fetch the PDF and wrap the raw ArrayBuffer in a Uint8Array: the other
// extractBytes examples in this docs set pass a Uint8Array, not an ArrayBuffer.
const response = await fetch("document.pdf");
const documentData = new Uint8Array(await response.arrayBuffer());

// force_ocr presumably makes extraction run OCR even when the PDF already has
// a text layer — confirm against the binding's documentation.
// NOTE(review): option keys here are snake_case while other WASM snippets use
// camelCase (e.g. `maxChars`) — confirm the casing the binding expects.
const result = await extractBytes(documentData, "application/pdf", {
  force_ocr: true,
  ocr: {
    backend: "tesseract",
    language: "eng",
  },
});

// Print the extracted text content.
console.log(result.content);
```

View File

@@ -0,0 +1,16 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
// Load the WASM module before calling any extraction function.
await init();

// Fetch the PDF and wrap the raw ArrayBuffer in a Uint8Array: the other
// extractBytes examples in this docs set pass a Uint8Array, not an ArrayBuffer.
const response = await fetch("multilingual.pdf");
const multilingualData = new Uint8Array(await response.arrayBuffer());

// The language string joins three language codes with "+" (eng, deu, fra).
const result = await extractBytes(multilingualData, "application/pdf", {
  ocr: {
    backend: "tesseract",
    language: "eng+deu+fra",
  },
});

// Print the extracted text content.
console.log(result.content);
```

Some files were not shown because too many files have changed in this diff Show More