This commit is contained in:
44
docs/snippets/wasm/advanced/chunk_page_mapping.md
Normal file
44
docs/snippets/wasm/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 1500,
|
||||
chunkOverlap: 300,
|
||||
},
|
||||
includeDocumentStructure: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Map chunks back to pages for source attribution
|
||||
const chunkPageMap = new Map<number, number[]>();
|
||||
|
||||
result.chunks?.forEach((chunk, chunkIndex) => {
|
||||
const firstPage = chunk.metadata?.firstPage;
|
||||
const lastPage = chunk.metadata?.lastPage;
|
||||
|
||||
if (firstPage !== undefined && lastPage !== undefined) {
|
||||
for (let page = firstPage; page <= lastPage; page++) {
|
||||
if (!chunkPageMap.has(page)) {
|
||||
chunkPageMap.set(page, []);
|
||||
}
|
||||
chunkPageMap.get(page)!.push(chunkIndex);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Use the mapping for source attribution
|
||||
chunkPageMap.forEach((chunkIndices, pageNum) => {
|
||||
console.log(`Page ${pageNum}: Chunks ${chunkIndices.join(", ")}`);
|
||||
chunkIndices.forEach((idx) => {
|
||||
const chunk = result.chunks![idx];
|
||||
console.log(` Content: "${chunk.content.substring(0, 60)}..."`);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> - Requires document structure parsing during extraction.
|
||||
70
docs/snippets/wasm/advanced/chunking_config.md
Normal file
70
docs/snippets/wasm/advanced/chunking_config.md
Normal file
@@ -0,0 +1,70 @@
|
||||
```typescript title="WASM - Fixed-Size Chunks"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 2000,
|
||||
chunkOverlap: 400,
|
||||
trim: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
result.chunks?.forEach((chunk, idx) => {
|
||||
console.log(`Chunk ${chunk.metadata?.chunkIndex}/${chunk.metadata?.totalChunks}`);
|
||||
console.log(` Position: ${chunk.metadata?.byteStart}-${chunk.metadata?.byteEnd}`);
|
||||
console.log(` Content: "${chunk.content.substring(0, 50)}..."`);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Markdown-Aware Chunking"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "markdown",
|
||||
maxChars: 1500,
|
||||
prependHeadingContext: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/markdown", config);
|
||||
|
||||
result.chunks?.forEach((chunk) => {
|
||||
// Content already includes heading context prepended
|
||||
console.log(chunk.content.substring(0, 80));
|
||||
console.log(
|
||||
` Heading path: ${chunk.metadata?.headingContext?.headings?.map((h) => `${"#".repeat(h.level)} ${h.text}`).join(" > ")}`,
|
||||
);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Semantic Chunking with Topic Threshold"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "semantic",
|
||||
maxChars: 1000,
|
||||
topicThreshold: 0.5, // Boundary detection at 50% topic change
|
||||
chunkOverlap: 100,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/markdown", config);
|
||||
|
||||
console.log(`Generated ${result.chunks?.length} semantic chunks`);
|
||||
result.chunks?.forEach((chunk) => {
|
||||
console.log(`Chunk ${chunk.metadata?.chunkIndex}: ${chunk.content.length} chars`);
|
||||
});
|
||||
```
|
||||
80
docs/snippets/wasm/advanced/chunking_rag.md
Normal file
80
docs/snippets/wasm/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,80 @@
|
||||
```typescript title="WASM - Chunking for RAG Pipeline"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 512, // Smaller chunks for vector DB efficiency
|
||||
chunkOverlap: 100,
|
||||
trim: true,
|
||||
},
|
||||
includeDocumentStructure: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Prepare chunks for vector database ingestion
|
||||
const ragChunks =
|
||||
result.chunks?.map((chunk, idx) => ({
|
||||
id: `${result.metadata?.filename || "doc"}_chunk_${idx}`,
|
||||
text: chunk.content,
|
||||
metadata: {
|
||||
source: result.metadata?.filename,
|
||||
chunk_index: chunk.metadata?.chunkIndex,
|
||||
total_chunks: chunk.metadata?.totalChunks,
|
||||
first_page: chunk.metadata?.firstPage,
|
||||
last_page: chunk.metadata?.lastPage,
|
||||
byte_position: `${chunk.metadata?.byteStart}-${chunk.metadata?.byteEnd}`,
|
||||
},
|
||||
// Embedding would be added by vector DB embedding model
|
||||
})) || [];
|
||||
|
||||
console.log(`Prepared ${ragChunks.length} chunks for RAG ingestion`);
|
||||
ragChunks.slice(0, 3).forEach((c) => {
|
||||
console.log(`Chunk ${c.id}: ${c.text.substring(0, 50)}...`);
|
||||
});
|
||||
|
||||
// Example: Send to vector database (e.g., Pinecone, Weaviate, Milvus)
|
||||
// const vectorResults = await vectorDb.upsert(ragChunks.map(c => ({
|
||||
// id: c.id,
|
||||
// values: await embedModel.embed(c.text),
|
||||
// metadata: c.metadata,
|
||||
// })));
|
||||
```
|
||||
|
||||
```typescript title="WASM - RAG with Retrieval Context"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 768,
|
||||
chunkOverlap: 200,
|
||||
prependHeadingContext: true, // For markdown/docs
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/markdown", config);
|
||||
|
||||
// Build retrieval augmented context
|
||||
interface RagDocument {
|
||||
id: string;
|
||||
query_text: string;
|
||||
context: string;
|
||||
page: number | null;
|
||||
}
|
||||
|
||||
const ragDocs: RagDocument[] =
|
||||
result.chunks?.map((chunk, idx) => ({
|
||||
id: `chunk_${idx}`,
|
||||
query_text: chunk.content, // Text to embed and search
|
||||
context: chunk.content, // Full context (includes heading)
|
||||
page: chunk.metadata?.firstPage ?? null,
|
||||
})) || [];
|
||||
|
||||
console.log(`Built RAG documents: ${ragDocs.length}`);
|
||||
```
|
||||
50
docs/snippets/wasm/advanced/custom-pipeline.ts
Normal file
50
docs/snippets/wasm/advanced/custom-pipeline.ts
Normal file
@@ -0,0 +1,50 @@
|
||||
import type { ExtractionResult } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
interface ProcessingStep {
|
||||
name: string;
|
||||
process: (result: ExtractionResult) => Promise<ExtractionResult>;
|
||||
}
|
||||
|
||||
/**
 * Extracts a document and then runs the result through each processing step in order.
 *
 * @param steps - Post-processing steps, applied sequentially; each one receives the
 *   output of the previous step.
 * @param bytes - Raw document bytes handed to `extractBytes`.
 * @param mimeType - MIME type handed to `extractBytes`.
 * @returns The extraction result after the last step has run.
 */
async function createExtractionPipeline(
  steps: ProcessingStep[],
  bytes: Uint8Array,
  mimeType: string,
): Promise<ExtractionResult> {
  await initWasm();

  let result = await extractBytes(bytes, mimeType);

  // Steps are awaited one at a time because each depends on the previous output.
  for (const step of steps) {
    console.log(`Executing step: ${step.name}`);
    result = await step.process(result);
  }

  return result;
}
|
||||
|
||||
const pipeline: ProcessingStep[] = [
|
||||
{
|
||||
name: "Text Normalization",
|
||||
process: async (result) => ({
|
||||
...result,
|
||||
content: result.content.replace(/\s+/g, " ").trim(),
|
||||
}),
|
||||
},
|
||||
{
|
||||
name: "Language Detection",
|
||||
process: async (result) => result,
|
||||
},
|
||||
{
|
||||
name: "Chunking",
|
||||
process: async (result) => result,
|
||||
},
|
||||
];
|
||||
|
||||
createExtractionPipeline(
|
||||
pipeline,
|
||||
new Uint8Array(await fetch("doc.pdf").then((r) => r.arrayBuffer())),
|
||||
"application/pdf",
|
||||
)
|
||||
.then((r) => console.log("Pipeline complete:", r))
|
||||
.catch(console.error);
|
||||
34
docs/snippets/wasm/advanced/error-recovery.ts
Normal file
34
docs/snippets/wasm/advanced/error-recovery.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Runs `extractBytes`, retrying with exponential backoff whenever it throws.
 *
 * @param bytes - Raw document bytes.
 * @param mimeType - MIME type handed to `extractBytes`.
 * @param maxRetries - Total number of attempts (not additional retries); must be >= 1.
 * @returns The result of the first successful extraction.
 * @throws RangeError when `maxRetries` is not a number >= 1.
 * @throws Error when every attempt fails; the last failure is attached as `cause`.
 */
async function extractWithRetry(bytes: Uint8Array, mimeType: string, maxRetries: number = 3) {
  // Without this guard a value of 0 (or NaN) skips the loop entirely and the caller
  // gets a misleading "failed after 0 attempts: undefined" error.
  if (!(maxRetries >= 1)) {
    throw new RangeError(`maxRetries must be at least 1, got ${maxRetries}`);
  }

  await initWasm();

  let lastError: Error | null = null;

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      console.log(`Extraction attempt ${attempt}/${maxRetries}`);
      const result = await extractBytes(bytes, mimeType);
      console.log("Extraction successful");
      return result;
    } catch (error) {
      // Non-Error throwables are wrapped so `.message` is always available.
      lastError = error instanceof Error ? error : new Error(String(error));
      console.warn(`Attempt ${attempt} failed:`, lastError.message);

      if (attempt < maxRetries) {
        // Exponential backoff: 200ms, 400ms, 800ms, ...
        const delay = 2 ** attempt * 100;
        console.log(`Retrying in ${delay}ms...`);
        await new Promise((resolve) => setTimeout(resolve, delay));
      }
    }
  }

  // Keep the underlying failure reachable for callers that want the original stack.
  const failure: Error & { cause?: unknown } = new Error(
    `Extraction failed after ${maxRetries} attempts: ${lastError?.message}`,
  );
  failure.cause = lastError;
  throw failure;
}
|
||||
|
||||
extractWithRetry(
|
||||
new Uint8Array(await fetch("doc.pdf").then((r) => r.arrayBuffer())),
|
||||
"application/pdf",
|
||||
)
|
||||
.then((r) => console.log("Final result:", r))
|
||||
.catch(console.error);
|
||||
105
docs/snippets/wasm/advanced/keyword_extraction_config.md
Normal file
105
docs/snippets/wasm/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,105 @@
|
||||
```typescript title="WASM - Keyword Extraction Setup"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Note: Keyword extraction requires the 'keywords' feature,
|
||||
// which may not be available in all WASM builds.
|
||||
// This example shows the configuration structure.
|
||||
|
||||
const config = {
|
||||
// Extraction configuration
|
||||
outputFormat: "markdown",
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Keyword extraction would be performed on the extracted text
|
||||
// using external libraries or post-processing
|
||||
console.log(`Extracted text: ${result.content.substring(0, 100)}...`);
|
||||
|
||||
// Example post-processing to extract keywords
|
||||
// (requires external keyword extraction library)
|
||||
const keywords = new Set<string>();
|
||||
const words = result.content
|
||||
.toLowerCase()
|
||||
.split(/\s+/)
|
||||
.filter((w) => w.length > 4); // Simple heuristic: words > 4 chars
|
||||
|
||||
words.forEach((word) => {
|
||||
keywords.add(word);
|
||||
});
|
||||
|
||||
console.log(`Extracted keywords: ${Array.from(keywords).slice(0, 10).join(", ")}`);
|
||||
```
|
||||
|
||||
```typescript title="WASM - Keyword Filtering"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
outputFormat: "markdown",
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Common stopwords to exclude
|
||||
const stopwords = new Set([
|
||||
"the",
|
||||
"a",
|
||||
"an",
|
||||
"and",
|
||||
"or",
|
||||
"but",
|
||||
"in",
|
||||
"on",
|
||||
"at",
|
||||
"to",
|
||||
"for",
|
||||
"of",
|
||||
"with",
|
||||
"from",
|
||||
"by",
|
||||
"is",
|
||||
"are",
|
||||
"was",
|
||||
"were",
|
||||
"be",
|
||||
"been",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"could",
|
||||
"should",
|
||||
]);
|
||||
|
||||
// Extract and filter keywords
|
||||
const text = result.content.toLowerCase();
|
||||
const words = text.split(/\s+/);
|
||||
const keywordCounts = new Map<string, number>();
|
||||
|
||||
words.forEach((word) => {
|
||||
const cleaned = word.replace(/[^\w]/g, "");
|
||||
if (cleaned.length > 4 && !stopwords.has(cleaned)) {
|
||||
keywordCounts.set(cleaned, (keywordCounts.get(cleaned) || 0) + 1);
|
||||
}
|
||||
});
|
||||
|
||||
// Get top keywords by frequency
|
||||
const topKeywords = Array.from(keywordCounts.entries())
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.slice(0, 10)
|
||||
.map(([word, count]) => `${word} (${count})`);
|
||||
|
||||
console.log(`Top keywords: ${topKeywords.join(", ")}`);
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> - Native keyword extraction requires the `keywords` feature which may not be compiled into WASM builds.
|
||||
110
docs/snippets/wasm/advanced/keyword_extraction_example.md
Normal file
110
docs/snippets/wasm/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,110 @@
|
||||
```typescript title="WASM - Extract and Score Keywords"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
outputFormat: "markdown",
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// TF-IDF style keyword extraction (simplified)
|
||||
interface Keyword {
|
||||
term: string;
|
||||
frequency: number;
|
||||
uniquePositions: number[];
|
||||
score: number;
|
||||
}
|
||||
|
||||
const text = result.content.toLowerCase();
|
||||
const words = text.split(/[\s\n\t]+/);
|
||||
const tokenMap = new Map<string, number[]>();
|
||||
|
||||
// Record word positions
|
||||
words.forEach((word, idx) => {
|
||||
const cleaned = word.replace(/[^\w]/g, "");
|
||||
if (cleaned.length > 3) {
|
||||
if (!tokenMap.has(cleaned)) {
|
||||
tokenMap.set(cleaned, []);
|
||||
}
|
||||
tokenMap.get(cleaned)!.push(idx);
|
||||
}
|
||||
});
|
||||
|
||||
// Calculate keyword scores
|
||||
const keywords: Keyword[] = Array.from(tokenMap.entries()).map(([term, positions]) => ({
|
||||
term,
|
||||
frequency: positions.length,
|
||||
uniquePositions: positions,
|
||||
score: positions.length * Math.log(words.length / positions.length),
|
||||
}));
|
||||
|
||||
// Sort by score (TF-IDF approximation)
|
||||
keywords.sort((a, b) => b.score - a.score);
|
||||
|
||||
// Top 15 keywords
|
||||
const topKeywords = keywords.slice(0, 15);
|
||||
console.log("Top Keywords:");
|
||||
topKeywords.forEach((kw) => {
|
||||
console.log(` ${kw.term}: ${kw.frequency} occurrences (score: ${kw.score.toFixed(2)})`);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Keyword Context Window"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
outputFormat: "markdown",
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
chunkOverlap: 200,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
interface KeywordContext {
|
||||
keyword: string;
|
||||
contexts: string[];
|
||||
}
|
||||
|
||||
// Find keyword occurrences with surrounding context
|
||||
/**
 * Finds every case-insensitive occurrence of `keyword` in `text` and returns each
 * occurrence together with up to `contextWindow` characters on either side.
 *
 * The keyword is matched literally: regular-expression metacharacters in it are escaped.
 *
 * @param text - Text to search.
 * @param keyword - Literal keyword to look for; an empty keyword yields no contexts.
 * @param contextWindow - Number of characters of context to keep before and after a match.
 * @returns One context string per occurrence, in order of appearance.
 */
function extractKeywordContexts(
  text: string,
  keyword: string,
  contextWindow: number = 50,
): string[] {
  // An empty pattern matches at every position without advancing `lastIndex`,
  // which would make the exec loop below spin forever.
  if (keyword.length === 0) {
    return [];
  }

  // Escape metacharacters so keywords such as "c++" or "a.b" are matched literally.
  const escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const regex = new RegExp(escapedKeyword, "gi");

  const contexts: string[] = [];
  let match: RegExpExecArray | null;

  while ((match = regex.exec(text)) !== null) {
    const start = Math.max(0, match.index - contextWindow);
    const end = Math.min(text.length, match.index + match[0].length + contextWindow);
    contexts.push(text.substring(start, end));
  }

  return contexts;
}
|
||||
|
||||
// Extract context for top keywords
|
||||
const topKeywords = ["document", "analysis", "results"];
|
||||
const keywordContexts: KeywordContext[] = topKeywords.map((kw) => ({
|
||||
keyword: kw,
|
||||
contexts: extractKeywordContexts(result.content, kw, 40),
|
||||
}));
|
||||
|
||||
keywordContexts.forEach((kc) => {
|
||||
console.log(`\n"${kc.keyword}" appears ${kc.contexts.length} times:`);
|
||||
kc.contexts.slice(0, 2).forEach((ctx, idx) => {
|
||||
console.log(` [${idx + 1}] ...${ctx}...`);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> - Keyword extraction without native YAKE/RAKE requires manual text processing.
|
||||
84
docs/snippets/wasm/advanced/language_detection_config.md
Normal file
84
docs/snippets/wasm/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,84 @@
|
||||
```typescript title="WASM - Basic Language Detection"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.75,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
console.log(`Primary language: ${result.metadata?.language}`);
|
||||
console.log(`Language confidence: ${result.metadata?.languageConfidence}`);
|
||||
console.log(`Detected languages: ${result.detectedLanguages?.join(", ")}`);
|
||||
```
|
||||
|
||||
```typescript title="WASM - Multi-Language Detection"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.6,
|
||||
detectMultiple: true, // Enable detection of multiple languages
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
console.log(`Primary language: ${result.metadata?.language}`);
|
||||
console.log(`Confidence score: ${result.metadata?.languageConfidence?.toFixed(3)}`);
|
||||
console.log(`All detected languages: ${result.detectedLanguages?.join(", ")}`);
|
||||
|
||||
// Use detected language for downstream processing
|
||||
if (result.detectedLanguages && result.detectedLanguages.length > 1) {
|
||||
console.log("Document contains multiple languages - enable multilingual NLP processing");
|
||||
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Language-Specific Extraction"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.8,
|
||||
detectMultiple: false,
|
||||
},
|
||||
// Adjust extraction parameters based on detected language
|
||||
quality: {
|
||||
enableQualityProcessing: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/plain", config);
|
||||
|
||||
// Process result based on detected language
|
||||
const language = result.metadata?.language;
|
||||
console.log(`Processing document in language: ${language}`);
|
||||
|
||||
// Example: Apply language-specific rules
|
||||
const languageConfig: Record<string, { cleanWhitespace: boolean; normalizeText: boolean }> = {
|
||||
en: { cleanWhitespace: true, normalizeText: true },
|
||||
zh: { cleanWhitespace: false, normalizeText: true }, // Chinese: preserve whitespace patterns
|
||||
ja: { cleanWhitespace: false, normalizeText: false }, // Japanese: preserve as-is
|
||||
ar: { cleanWhitespace: true, normalizeText: true }, // Arabic
|
||||
};
|
||||
|
||||
const langConfig = languageConfig[language as string] || {
|
||||
cleanWhitespace: true,
|
||||
normalizeText: true,
|
||||
};
|
||||
console.log(`Language config: ${JSON.stringify(langConfig)}`);
|
||||
```
|
||||
171
docs/snippets/wasm/advanced/language_detection_multilingual.md
Normal file
171
docs/snippets/wasm/advanced/language_detection_multilingual.md
Normal file
@@ -0,0 +1,171 @@
|
||||
```typescript title="WASM - Detect and Process Multilingual Content"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.7,
|
||||
detectMultiple: true,
|
||||
},
|
||||
chunking: {
|
||||
maxChars: 800,
|
||||
chunkOverlap: 200,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
interface MultilingualChunk {
|
||||
index: number;
|
||||
text: string;
|
||||
language: string;
|
||||
confidence: number;
|
||||
}
|
||||
|
||||
// Detect language for each chunk
|
||||
// Tag every chunk with a language. The document-level detection result is the
// starting point; a simple script check on the chunk text can override it.
const multilingualChunks: MultilingualChunk[] =
  result.chunks?.map((chunk, idx) => {
    const text = chunk.content;
    let detectedLang = result.metadata?.language || "en";
    // `??` keeps a reported confidence of 0 instead of replacing it with the default.
    const confidence = result.metadata?.languageConfidence ?? 0.5;

    // Script ranges are spelled as \u escapes so the range endpoints cannot be lost
    // to re-encoding. The checks run in order and the last match wins, so kana
    // (checked after Han) lets Japanese override Chinese for mixed text.
    if (/[\u4e00-\u9fff]/.test(text)) detectedLang = "zh"; // CJK Unified Ideographs
    if (/[\u3040-\u309f\u30a0-\u30ff]/.test(text)) detectedLang = "ja"; // Hiragana + Katakana
    if (/[\uac00-\ud7af]/.test(text)) detectedLang = "ko"; // Hangul Syllables
    if (/[\u0600-\u06ff]/.test(text)) detectedLang = "ar"; // Arabic
    if (/[\u0400-\u04ff]/.test(text)) detectedLang = "ru"; // Cyrillic

    return {
      index: idx,
      text: text.substring(0, 50), // preview only
      language: detectedLang,
      confidence,
    };
  }) || [];
|
||||
|
||||
// Group chunks by language
|
||||
const chunksByLanguage = new Map<string, MultilingualChunk[]>();
|
||||
multilingualChunks.forEach((chunk) => {
|
||||
if (!chunksByLanguage.has(chunk.language)) {
|
||||
chunksByLanguage.set(chunk.language, []);
|
||||
}
|
||||
chunksByLanguage.get(chunk.language)!.push(chunk);
|
||||
});
|
||||
|
||||
console.log("Chunks by detected language:");
|
||||
chunksByLanguage.forEach((chunks, lang) => {
|
||||
console.log(` ${lang}: ${chunks.length} chunks`);
|
||||
chunks.slice(0, 2).forEach((c) => {
|
||||
console.log(` Chunk ${c.index}: "${c.text}..."`);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Language-Specific Text Processing"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
detectMultiple: true,
|
||||
},
|
||||
outputFormat: "markdown",
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/plain", config);
|
||||
|
||||
// Language-specific text normalization
|
||||
interface LanguageProcessor {
|
||||
normalize: (text: string) => string;
|
||||
splitSentences: (text: string) => string[];
|
||||
}
|
||||
|
||||
const processors: Record<string, LanguageProcessor> = {
|
||||
en: {
|
||||
normalize: (t) => t.replace(/\s+/g, " ").trim(),
|
||||
splitSentences: (t) => t.split(/[.!?]+/).filter((s) => s.length > 0),
|
||||
},
|
||||
zh: {
|
||||
normalize: (t) => t.replace(/\s+/g, "").trim(), // CJK: no word spacing
|
||||
splitSentences: (t) => t.split(/[。!?]+/).filter((s) => s.length > 0),
|
||||
},
|
||||
ja: {
|
||||
normalize: (t) => t.replace(/\s+/g, "").trim(),
|
||||
splitSentences: (t) => t.split(/[。!?]+/).filter((s) => s.length > 0),
|
||||
},
|
||||
ar: {
|
||||
normalize: (t) => t.replace(/\s+/g, " ").trim(),
|
||||
splitSentences: (t) => t.split(/[.!?،؟]+/).filter((s) => s.length > 0),
|
||||
},
|
||||
};
|
||||
|
||||
const language = result.metadata?.language || "en";
|
||||
const processor = processors[language] || processors["en"];
|
||||
|
||||
const normalized = processor.normalize(result.content);
|
||||
const sentences = processor.splitSentences(result.content);
|
||||
|
||||
console.log(`Language: ${language}`);
|
||||
console.log(`Normalized length: ${normalized.length}`);
|
||||
console.log(`Detected sentences: ${sentences.length}`);
|
||||
sentences.slice(0, 3).forEach((sent, idx) => {
|
||||
console.log(` [${idx + 1}] ${sent.substring(0, 60)}...`);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Multilingual Chunking Strategy"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
interface LanguageChunkConfig {
|
||||
maxChars: number;
|
||||
overlap: number;
|
||||
}
|
||||
|
||||
// Different chunking strategies per language
|
||||
const chunkConfigByLanguage: Record<string, LanguageChunkConfig> = {
|
||||
en: { maxChars: 512, overlap: 128 }, // English: word-based chunking
|
||||
zh: { maxChars: 256, overlap: 64 }, // Chinese: smaller chunks due to character density
|
||||
ja: { maxChars: 300, overlap: 75 }, // Japanese: medium chunks
|
||||
ar: { maxChars: 400, overlap: 100 }, // Arabic: larger chunks for context
|
||||
default: { maxChars: 512, overlap: 128 },
|
||||
};
|
||||
|
||||
// Detect language first
|
||||
const languageDetectConfig = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.8,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const langResult = await extractBytes(bytes, "text/plain", languageDetectConfig);
|
||||
const detectedLang = langResult.metadata?.language || "en";
|
||||
|
||||
// Re-extract with language-specific chunking
|
||||
const chunkConfig = chunkConfigByLanguage[detectedLang] || chunkConfigByLanguage["default"];
|
||||
const finalConfig = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
},
|
||||
chunking: {
|
||||
maxChars: chunkConfig.maxChars,
|
||||
chunkOverlap: chunkConfig.overlap,
|
||||
},
|
||||
};
|
||||
|
||||
const finalResult = await extractBytes(bytes, "text/plain", finalConfig);
|
||||
console.log(`Language: ${detectedLang}`);
|
||||
console.log(`Chunking strategy: maxChars=${chunkConfig.maxChars}, overlap=${chunkConfig.overlap}`);
|
||||
console.log(`Generated ${finalResult.chunks?.length} chunks`);
|
||||
```
|
||||
37
docs/snippets/wasm/advanced/memory-management.ts
Normal file
37
docs/snippets/wasm/advanced/memory-management.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts a list of files in fixed-size batches so that only a handful of
 * documents are being processed at any one time.
 *
 * @param files - Files to extract; each file's `type` is used as its MIME type.
 * @returns Extraction results in the same order as `files`.
 */
async function processLargeDocumentSet(files: File[]) {
  await initWasm();

  const BATCH_SIZE = 5;
  const results: Awaited<ReturnType<typeof extractBytes>>[] = [];

  for (let i = 0; i < files.length; i += BATCH_SIZE) {
    const batch = files.slice(i, i + BATCH_SIZE);

    console.log(`Processing batch ${Math.floor(i / BATCH_SIZE) + 1}`);

    // Files inside one batch are extracted concurrently; batches run one after another.
    const batchResults = await Promise.all(
      batch.map(async (file) => {
        const arrayBuffer = await file.arrayBuffer();
        const bytes = new Uint8Array(arrayBuffer);
        return extractBytes(bytes, file.type);
      }),
    );

    results.push(...batchResults);

    // `global` exists only in Node.js and throws a ReferenceError in browsers, so the
    // optional `gc` hook is looked up on `globalThis`. It is normally absent unless
    // the runtime was started with a flag that exposes it.
    const gc: unknown = Reflect.get(globalThis, "gc");
    if (typeof gc === "function") {
      console.log("Running garbage collection");
      gc();
    }

    // Short pause between batches before starting the next one.
    await new Promise((resolve) => setTimeout(resolve, 100));
  }

  return results;
}
|
||||
|
||||
processLargeDocumentSet([])
|
||||
.then(() => console.log("Done"))
|
||||
.catch(console.error);
|
||||
43
docs/snippets/wasm/advanced/parallel-extraction.ts
Normal file
43
docs/snippets/wasm/advanced/parallel-extraction.ts
Normal file
@@ -0,0 +1,43 @@
|
||||
import { detectRuntime, extractBytes, hasWorkers, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
async function extractInParallel(documents: Uint8Array[], mimeTypes: string[]) {
|
||||
await initWasm();
|
||||
|
||||
const _runtime = detectRuntime();
|
||||
const canUseWorkers = hasWorkers();
|
||||
|
||||
if (isBrowser() && canUseWorkers) {
|
||||
return extractWithWebWorkers(documents, mimeTypes);
|
||||
}
|
||||
|
||||
return Promise.all(documents.map((bytes, index) => extractBytes(bytes, mimeTypes[index])));
|
||||
}
|
||||
|
||||
/**
 * Distributes documents round-robin over a small pool of web workers and collects
 * one reply per document.
 *
 * Each worker handles its documents strictly one at a time. Posting several jobs to
 * the same worker at once and re-assigning `onmessage` for each would leave every
 * job but the last one without a handler, so their promises would never settle.
 *
 * NOTE(review): assumes "extraction-worker.js" answers every posted message with
 * exactly one message — confirm against the worker script.
 *
 * @param documents - Raw document bytes, one entry per document.
 * @param mimeTypes - MIME type for the document at the same index.
 * @returns The workers' reply payloads, in the same order as `documents`.
 */
function extractWithWebWorkers(documents: Uint8Array[], mimeTypes: string[]): Promise<unknown[]> {
  if (documents.length === 0) {
    return Promise.resolve([]);
  }

  // Never spawn more workers than there are documents to process.
  const workerCount = Math.max(1, Math.min(documents.length, navigator.hardwareConcurrency ?? 2));
  const workers: Worker[] = Array.from(
    { length: workerCount },
    () => new Worker("extraction-worker.js"),
  );

  // Tail of each worker's job chain; the next job for that worker is appended to it.
  const queueTails: Promise<unknown>[] = workers.map(() => Promise.resolve());

  const runJob = (worker: Worker, bytes: Uint8Array, mimeType: string): Promise<unknown> =>
    new Promise((resolve, reject) => {
      worker.onmessage = (e) => resolve(e.data);
      worker.onerror = reject;
      worker.postMessage({ bytes, mimeType });
    });

  const jobs = documents.map((bytes, index) => {
    const slot = index % workers.length;
    const job = queueTails[slot].then(() => runJob(workers[slot], bytes, mimeTypes[index]));
    // A failed job must not block the jobs queued behind it on the same worker.
    queueTails[slot] = job.catch(() => undefined);
    return job;
  });

  // Release the workers once every job has settled, whether it succeeded or failed.
  void Promise.allSettled(jobs).then(() => {
    for (const worker of workers) {
      worker.terminate();
    }
  });

  return Promise.all(jobs);
}
|
||||
|
||||
function isBrowser() {
|
||||
return typeof window !== "undefined";
|
||||
}
|
||||
|
||||
extractInParallel([new Uint8Array([1, 2, 3])], ["application/pdf"])
|
||||
.then((results) => console.log(results))
|
||||
.catch(console.error);
|
||||
103
docs/snippets/wasm/advanced/quality_processing_config.md
Normal file
103
docs/snippets/wasm/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,103 @@
|
||||
```typescript title="WASM - Enable Quality Processing"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
console.log(`Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
|
||||
console.log(`Content: ${result.content.substring(0, 100)}...`);
|
||||
|
||||
// Quality score indicates text extraction quality (0.0-1.0)
|
||||
// Check for a missing score explicitly: a score of exactly 0 is the worst possible
// result and must still trigger the warning, which a truthiness test would skip.
if (result.qualityScore != null && result.qualityScore < 0.5) {
  console.warn("Low quality extraction detected - consider OCR or alternative processing");
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Quality Monitoring"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
interface ExtractionQuality {
|
||||
contentLength: number;
|
||||
qualityScore: number | null;
|
||||
assessedAs: string;
|
||||
}
|
||||
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
/** Maps a quality score to a coarse label; `null` means no score was reported. */
function assessQuality(score: number | null): string {
  if (score === null) return "unknown";
  if (score > 0.8) return "high";
  if (score > 0.5) return "medium";
  return "low";
}

// `??` (not `||`) so that a score of exactly 0 stays 0 and is assessed as "low"
// instead of being turned into null / "unknown".
const reportedScore = result.qualityScore ?? null;

const quality: ExtractionQuality = {
  contentLength: result.content.length,
  qualityScore: reportedScore,
  assessedAs: assessQuality(reportedScore),
};
|
||||
|
||||
console.log("Extraction Quality Report:");
|
||||
console.log(` Content size: ${quality.contentLength} bytes`);
|
||||
console.log(` Quality score: ${quality.qualityScore?.toFixed(3) || "N/A"}`);
|
||||
console.log(` Assessment: ${quality.assessedAs}`);
|
||||
|
||||
if (quality.assessedAs === "low") {
|
||||
console.log(
|
||||
" Recommendation: Review raw text for encoding issues or consider alternative extraction",
|
||||
);
|
||||
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Quality with OCR Fallback"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
/**
 * Extracts a document with quality processing enabled and labels the outcome
 * according to the reported quality score.
 *
 * @param bytes - Raw document bytes.
 * @param mimeType - MIME type handed to `extractBytes`.
 * @returns The extracted content, the quality score (`null` when none was reported),
 *   and `"degraded-text-extraction"` when the score is below 0.5, otherwise
 *   `"text-extraction"`.
 */
async function extractWithQualityCheck(
  bytes: Uint8Array,
  mimeType: string,
): Promise<{ content: string; quality: number | null; method: string }> {
  const config = {
    enableQualityProcessing: true,
  };

  const result = await extractBytes(bytes, mimeType, config);

  // Keep "no score reported" (null) distinct from a measured score of 0; `|| 0`
  // would merge the two and flag every unscored document as degraded.
  const quality = result.qualityScore ?? null;
  const isLowQuality = quality !== null && quality < 0.5;

  // If quality is low, consider text extraction failed or use OCR
  if (isLowQuality) {
    console.warn("Low quality text extraction - alternative processing recommended");
  }

  return {
    content: result.content,
    quality,
    method: isLowQuality ? "degraded-text-extraction" : "text-extraction",
  };
}
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const extracted = await extractWithQualityCheck(bytes, "application/pdf");
|
||||
|
||||
console.log(`Extraction method: ${extracted.method}`);
|
||||
console.log(`Quality score: ${extracted.quality?.toFixed(3)}`);
|
||||
console.log(`Content preview: ${extracted.content.substring(0, 80)}...`);
|
||||
```
|
||||
161
docs/snippets/wasm/advanced/quality_processing_example.md
Normal file
161
docs/snippets/wasm/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,161 @@
|
||||
```typescript title="WASM - Assess Text Quality"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
/** Heuristic statistics describing a block of extracted text. */
interface TextQualityMetrics {
  /** Length of the content in UTF-16 code units. */
  contentLength: number;
  /** Number of lines, blank lines included. */
  lineCount: number;
  /** Mean length of the lines that contain non-whitespace characters. */
  averageLineLength: number;
  /** Share of lines that are empty or whitespace-only (0..1). */
  emptyLineRatio: number;
  /** Share of characters that are neither word, whitespace nor basic punctuation (0..1). */
  specialCharRatio: number;
  /** Language codes whose script was seen in the content ("en", "zh", "ja", "ko"). */
  estimatedLanguages: string[];
}

/**
 * Computes simple, script-based quality metrics for extracted text.
 *
 * @param content - The text to analyse; may be empty.
 * @returns Line, character and script statistics. All ratios are 0 for inputs
 *   where the denominator would otherwise be zero.
 */
function assessTextQuality(content: string): TextQualityMetrics {
  // Split on individual line breaks. Splitting on /\n+/ would collapse runs of
  // newlines and make blank lines invisible to emptyLineRatio.
  const lines = content.split(/\r?\n/);
  const nonEmptyLines = lines.filter((line) => line.trim().length > 0);
  const totalChars = content.length;
  const specialChars = (content.match(/[^\w\s.,:;!?\n]/g) ?? []).length;

  // Script ranges are written as \u escapes so the range endpoints survive
  // copy/paste and re-encoding; some of them are unassigned code points that
  // are easily lost when written as literal characters.
  const scriptPatterns: ReadonlyArray<readonly [string, RegExp]> = [
    ["en", /[a-zA-Z]/],
    ["zh", /[\u4e00-\u9fff]/], // CJK Unified Ideographs
    ["ja", /[\u3040-\u309f\u30a0-\u30ff]/], // Hiragana + Katakana
    ["ko", /[\uac00-\ud7af]/], // Hangul Syllables
  ];
  const detectedLangs = scriptPatterns
    .filter(([, pattern]) => pattern.test(content))
    .map(([lang]) => lang);

  const nonEmptyChars = nonEmptyLines.reduce((sum, line) => sum + line.length, 0);

  return {
    contentLength: totalChars,
    lineCount: lines.length,
    averageLineLength: nonEmptyLines.length > 0 ? nonEmptyChars / nonEmptyLines.length : 0,
    // `split` always yields at least one element, so this division is safe.
    emptyLineRatio: (lines.length - nonEmptyLines.length) / lines.length,
    // Guard against 0 / 0 (NaN) for empty content.
    specialCharRatio: totalChars > 0 ? specialChars / totalChars : 0,
    estimatedLanguages: detectedLangs,
  };
}
|
||||
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
const metrics = assessTextQuality(result.content);
|
||||
|
||||
console.log("Text Quality Assessment:");
|
||||
console.log(` Length: ${metrics.contentLength} characters`);
|
||||
console.log(
|
||||
` Lines: ${metrics.lineCount} total, avg ${metrics.averageLineLength.toFixed(1)} chars/line`,
|
||||
);
|
||||
console.log(` Empty lines: ${(metrics.emptyLineRatio * 100).toFixed(1)}%`);
|
||||
console.log(` Special chars: ${(metrics.specialCharRatio * 100).toFixed(2)}%`);
|
||||
console.log(` Languages: ${metrics.estimatedLanguages.join(", ") || "unknown"}`);
|
||||
console.log(` Kreuzberg quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
|
||||
```
|
||||
|
||||
```typescript title="WASM - Quality-Based Content Filtering"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
chunkOverlap: 200,
|
||||
trim: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
interface QualityFilteredChunk {
|
||||
index: number;
|
||||
content: string;
|
||||
quality: number;
|
||||
kept: boolean;
|
||||
}
|
||||
|
||||
// Filter chunks based on quality heuristics
|
||||
const qualityThreshold = 0.3; // Min ratio of visible/non-whitespace content
|
||||
// Score every chunk with a simple content heuristic and record whether it
// clears `qualityThreshold`. Results without chunks yield an empty list.
const filteredChunks: QualityFilteredChunk[] =
  result.chunks?.map((chunk, idx) => {
    const text = chunk.content;

    // Guard the division: an empty chunk would otherwise produce NaN, which
    // then leaks into the logged quality value.
    const nonWhitespaceRatio = text.length > 0 ? text.replace(/\s/g, "").length / text.length : 0;
    const hasNumbers = /\d/.test(text);
    // No `g` flag here: `test` on a global regex is stateful via `lastIndex`.
    const hasPunctuation = /[.!?,;:]/.test(text);

    // Quality score based on content characteristics
    const contentQuality =
      (nonWhitespaceRatio + (hasNumbers ? 0.2 : 0) + (hasPunctuation ? 0.1 : 0)) / 2;

    return {
      index: idx,
      content: text.substring(0, 50),
      quality: contentQuality,
      kept: contentQuality >= qualityThreshold,
    };
  }) ?? [];
|
||||
|
||||
const keptChunks = filteredChunks.filter((c) => c.kept);
|
||||
console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`);
|
||||
|
||||
keptChunks.slice(0, 3).forEach((c) => {
|
||||
console.log(` Chunk ${c.index}: quality=${c.quality.toFixed(2)}, "${c.content}..."`);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Content Encoding Validation"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Validate text encoding quality
|
||||
/** Outcome of scanning extracted text for signs of decoding damage. */
interface EncodingValidation {
  /** True when at least one U+FFFD replacement character is present. */
  hasInvalidChars: boolean;
  /** Number of U+FFFD replacement characters found. */
  invalidCharCount: number;
  /** Replacement characters as a share of the content length (0..1). */
  replacementRatio: number;
  /** "ASCII" when every code unit is <= 0x7F, otherwise "UTF-8". */
  estimatedEncoding: string;
}

/**
 * Counts Unicode replacement characters in the content and makes a coarse
 * ASCII-vs-UTF-8 guess.
 *
 * @param content - Extracted text to inspect; may be empty.
 * @returns Counts and ratios describing the replacement characters found.
 */
function validateEncoding(content: string): EncodingValidation {
  // Match the replacement character through its escape (U+FFFD). Written as a
  // literal it is itself prone to being mangled when the file is re-encoded.
  const replacementChars = (content.match(/\uFFFD/g) ?? []).length;
  const hasInvalidChars = replacementChars > 0;
  // hasInvalidChars implies content.length > 0, so the division is safe.
  const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0;

  // Guess encoding based on content patterns
  const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII";

  return {
    hasInvalidChars,
    invalidCharCount: replacementChars,
    replacementRatio,
    estimatedEncoding,
  };
}
|
||||
|
||||
const validation = validateEncoding(result.content);
|
||||
|
||||
console.log("Content Encoding Validation:");
|
||||
console.log(` Estimated encoding: ${validation.estimatedEncoding}`);
|
||||
console.log(` Invalid characters: ${validation.invalidCharCount}`);
|
||||
console.log(` Replacement ratio: ${(validation.replacementRatio * 100).toFixed(4)}%`);
|
||||
console.log(
|
||||
` Status: ${validation.hasInvalidChars ? "DEGRADED - encoding issues detected" : "OK"}`,
|
||||
);
|
||||
console.log(` Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
|
||||
```
|
||||
39
docs/snippets/wasm/advanced/streaming-extraction.ts
Normal file
39
docs/snippets/wasm/advanced/streaming-extraction.ts
Normal file
@@ -0,0 +1,39 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Downloads a document through the Fetch stream reader, logging progress as
 * bytes arrive, and extracts it once the whole body has been received.
 *
 * The extraction itself is not incremental: every received chunk is buffered
 * and copied into one contiguous Uint8Array before `extractBytes` is called.
 *
 * @param url - Location of the document to download.
 * @param mimeType - MIME type handed to `extractBytes`; defaults to "application/pdf".
 * @returns Whatever `extractBytes` resolves with.
 * @throws Error when the HTTP status is not OK or the response has no body.
 */
async function extractStreamingDocument(url: string, mimeType: string = "application/pdf") {
  await initWasm();

  const response = await fetch(url);
  if (!response.ok) throw new Error(`HTTP ${response.status}`);

  const reader = response.body?.getReader();
  if (!reader) throw new Error("No response body");

  // Collect the chunks first; the total size is only known once the stream ends.
  const chunks: Uint8Array[] = [];
  let totalSize = 0;

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    chunks.push(value);
    totalSize += value.length;
    console.log(`Received ${totalSize} bytes`);
  }

  // Concatenate the chunks into a single buffer for the extractor.
  const fullBuffer = new Uint8Array(totalSize);
  let offset = 0;
  for (const chunk of chunks) {
    fullBuffer.set(chunk, offset);
    offset += chunk.length;
  }

  console.log("Document fully received, extracting...");
  const result = await extractBytes(fullBuffer, mimeType);

  return result;
}
|
||||
|
||||
extractStreamingDocument("https://example.com/document.pdf")
|
||||
.then((r) => console.log(r))
|
||||
.catch(console.error);
|
||||
75
docs/snippets/wasm/advanced/token_reduction_config.md
Normal file
75
docs/snippets/wasm/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,75 @@
|
||||
```typescript title="WASM - Token Reduction Configuration"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
tokenReduction: {
|
||||
mode: "aggressive",
|
||||
preserveImportantWords: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
console.log(`Original content: ${result.content.length} characters`);
|
||||
console.log(`Preview: ${result.content.substring(0, 100)}...`);
|
||||
|
||||
// Token reduction modes:
|
||||
// - "aggressive": maximum reduction
|
||||
// - "balanced": moderate reduction
|
||||
// - "conservative": minimal reduction
|
||||
```
|
||||
|
||||
```typescript title="WASM - Token Reduction Modes"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
interface TokenReductionResult {
|
||||
mode: string;
|
||||
originalSize: number;
|
||||
reducedSize: number;
|
||||
reductionRatio: number;
|
||||
preview: string;
|
||||
}
|
||||
|
||||
/**
 * Extracts the same document once per token-reduction mode and reports how
 * much shorter each result is than an extraction without token reduction.
 *
 * @param bytes - Raw PDF bytes.
 * @returns One entry per mode ("conservative", "balanced", "aggressive").
 *   `originalSize` and `reducedSize` are both character counts, so
 *   `reductionRatio` is the fraction of characters removed.
 */
async function compareTokenReductionModes(bytes: Uint8Array): Promise<TokenReductionResult[]> {
  const modes = ["conservative", "balanced", "aggressive"];

  // Baseline without token reduction. Comparing a word count against a
  // character count of the same already-reduced text says nothing about how
  // much a mode removed; both sizes must use the same unit and the original
  // must come from an unreduced extraction.
  // NOTE(review): assumes `result.content` holds the reduced text when
  // `tokenReduction` is configured — confirm against the kreuzberg-wasm API.
  const baseline = await extractBytes(bytes, "application/pdf", undefined);
  const originalSize = baseline.content.length;

  const results: TokenReductionResult[] = [];

  for (const mode of modes) {
    const config = {
      tokenReduction: {
        mode,
        preserveImportantWords: true,
      },
    };

    const result = await extractBytes(bytes, "application/pdf", config);
    const reducedSize = result.content.length;

    results.push({
      mode,
      originalSize,
      reducedSize,
      // Avoid 0 / 0 when the document yields no text at all.
      reductionRatio: originalSize > 0 ? 1 - reducedSize / originalSize : 0,
      preview: result.content.substring(0, 80),
    });
  }

  return results;
}
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const modeComparison = await compareTokenReductionModes(bytes);
|
||||
|
||||
console.log("Token Reduction Mode Comparison:");
|
||||
modeComparison.forEach((r) => {
|
||||
console.log(` ${r.mode}:`);
|
||||
console.log(` Original: ${r.originalSize} chars`);
|
||||
console.log(` Reduction: ${(r.reductionRatio * 100).toFixed(1)}%`);
|
||||
});
|
||||
```
|
||||
149
docs/snippets/wasm/advanced/token_reduction_example.md
Normal file
149
docs/snippets/wasm/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,149 @@
|
||||
```typescript title="WASM - Token Counting and Cost Estimation"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
tokenReduction: {
|
||||
mode: "balanced",
|
||||
preserveImportantWords: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Simple token counting (approximation: 1 token ≈ 4 chars)
|
||||
/**
 * Rough token estimate for a piece of text, using the rule of thumb that one
 * token corresponds to about four characters. Partial tokens round up.
 */
function estimateTokenCount(text: string): number {
  const charsPerToken = 4;
  const fractionalTokens = text.length / charsPerToken;
  return Math.ceil(fractionalTokens);
}
|
||||
|
||||
// LLM pricing (example: GPT-4 Turbo)
|
||||
interface PricingEstimate {
|
||||
tokenCount: number;
|
||||
inputCost: number;
|
||||
outputCostEstimate: number;
|
||||
totalEstimate: number;
|
||||
}
|
||||
|
||||
const tokenCount = estimateTokenCount(result.content);
|
||||
const inputPricePerToken = 0.00001; // $0.01/1K tokens
|
||||
const outputPricePerToken = 0.00003; // $0.03/1K tokens
|
||||
|
||||
const costEstimate: PricingEstimate = {
|
||||
tokenCount,
|
||||
inputCost: tokenCount * inputPricePerToken,
|
||||
outputCostEstimate: tokenCount * outputPricePerToken * 0.5, // Assume output is ~50% of input
|
||||
totalEstimate: tokenCount * inputPricePerToken + tokenCount * outputPricePerToken * 0.5,
|
||||
};
|
||||
|
||||
console.log("Token and Cost Analysis:");
|
||||
console.log(` Estimated tokens: ${costEstimate.tokenCount}`);
|
||||
console.log(` Input cost: $${costEstimate.inputCost.toFixed(6)}`);
|
||||
console.log(` Output cost (est.): $${costEstimate.outputCostEstimate.toFixed(6)}`);
|
||||
console.log(` Total cost (est.): $${costEstimate.totalEstimate.toFixed(6)}`);
|
||||
```
|
||||
|
||||
```typescript title="WASM - Token Reduction for Context Windows"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
/** Result of comparing a text's estimated token count with a context window. */
interface ContextWindowFit {
  /** Length of the content in characters. */
  contentLength: number;
  /** Estimated token count (about 4 characters per token, rounded up). */
  estimatedTokens: number;
  /** True when the estimate does not exceed the window size. */
  fitsInWindow: boolean;
  /** Estimated tokens divided by the window size (1 means exactly full). */
  utilization: number;
}

/**
 * Estimates whether `content` fits into a model context window.
 *
 * @param content - Text to measure.
 * @param contextWindowSize - Window size in tokens; defaults to 4096.
 * @returns The estimate together with the fit flag and utilization ratio.
 */
function checkContextWindowFit(
  content: string,
  contextWindowSize: number = 4096,
): ContextWindowFit {
  const estimatedTokens = Math.ceil(content.length / 4);
  // `<=` rather than `<`: content that fills the window exactly still fits.
  const fitsInWindow = estimatedTokens <= contextWindowSize;
  const utilization = estimatedTokens / contextWindowSize;

  return {
    contentLength: content.length,
    estimatedTokens,
    fitsInWindow,
    utilization,
  };
}
|
||||
|
||||
const config = {
|
||||
tokenReduction: {
|
||||
mode: "aggressive", // Use aggressive mode for large documents
|
||||
preserveImportantWords: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
const contextFit = checkContextWindowFit(result.content, 4096);
|
||||
|
||||
console.log("Context Window Analysis:");
|
||||
console.log(` Content: ${contextFit.contentLength} characters`);
|
||||
console.log(` Tokens (est.): ${contextFit.estimatedTokens}`);
|
||||
console.log(` Fits in 4K context: ${contextFit.fitsInWindow ? "YES" : "NO"}`);
|
||||
console.log(` Utilization: ${(contextFit.utilization * 100).toFixed(1)}%`);
|
||||
|
||||
if (!contextFit.fitsInWindow) {
|
||||
console.log(" Note: Consider chunking or more aggressive token reduction");
|
||||
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Selective Token Preservation"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
tokenReduction: {
|
||||
mode: "balanced",
|
||||
preserveImportantWords: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Extract important terms manually
|
||||
/** A frequently occurring word together with its heuristic importance score. */
interface ImportantTerm {
  term: string;
  frequency: number;
  importance: number;
}

/**
 * Finds the most prominent long words in a text.
 *
 * Words are lower-cased, stripped of non-word characters and only counted when
 * longer than five characters. Terms seen at least `threshold` times are scored
 * as log(frequency) scaled by a tenth of the term length, and the 20
 * highest-scoring terms are returned in descending order of importance.
 */
function extractImportantTerms(content: string, threshold: number = 3): ImportantTerm[] {
  const counts = new Map<string, number>();

  for (const rawWord of content.toLowerCase().split(/\s+/)) {
    const token = rawWord.replace(/[^\w]/g, "");
    // Short words are ignored entirely.
    if (token.length <= 5) continue;
    counts.set(token, (counts.get(token) || 0) + 1);
  }

  const ranked: ImportantTerm[] = [];
  for (const [term, frequency] of counts) {
    if (!(frequency >= threshold)) continue;
    ranked.push({
      term,
      frequency,
      importance: Math.log(frequency) * (term.length / 10),
    });
  }

  ranked.sort((left, right) => right.importance - left.importance);
  return ranked.slice(0, 20);
}
|
||||
|
||||
const importantTerms = extractImportantTerms(result.content);
|
||||
|
||||
console.log("Important Terms (likely preserved by token reduction):");
|
||||
importantTerms.forEach((t) => {
|
||||
console.log(` "${t.term}": ${t.frequency} occurrences (importance: ${t.importance.toFixed(2)})`);
|
||||
});
|
||||
```
|
||||
57
docs/snippets/wasm/advanced/worker-extraction.ts
Normal file
57
docs/snippets/wasm/advanced/worker-extraction.ts
Normal file
@@ -0,0 +1,57 @@
|
||||
/** A queued extraction request plus the callbacks that settle its promise. */
interface ExtractionTask {
  bytes: Uint8Array;
  mimeType: string;
  resolve: (value: unknown) => void;
  reject: (error: unknown) => void;
}

/**
 * Small pool of Web Workers that run extractions off the main thread.
 *
 * Each worker handles one task at a time. A task stays associated with the
 * worker it was posted to until that worker answers (or errors), so every
 * promise returned by `extract` is settled with the reply to its own request.
 * Tasks submitted while all workers are busy wait in a FIFO queue.
 */
class ExtractionWorker {
  /** Every worker owned by the pool. */
  private readonly workerPool: Worker[] = [];
  /** Workers that currently have no task assigned. */
  private readonly idleWorkers: Worker[] = [];
  /** The task each busy worker is processing. */
  private readonly inFlight = new Map<Worker, ExtractionTask>();
  /** Tasks waiting for a free worker, oldest first. */
  private readonly taskQueue: ExtractionTask[] = [];
  private terminated = false;

  /**
   * @param workerCount - Number of workers to start. Defaults to
   *   `navigator.hardwareConcurrency` when available, otherwise 2. At least one
   *   worker is always created so that submitted tasks can make progress.
   */
  constructor(workerCount?: number) {
    const requested =
      workerCount ?? (typeof navigator !== "undefined" ? (navigator.hardwareConcurrency ?? 2) : 2);
    const count = Math.max(1, Math.floor(requested));

    for (let i = 0; i < count; i++) {
      const worker = new Worker("extraction-worker.js");
      worker.onmessage = (e) => this.handleWorkerMessage(worker, e.data);
      worker.onerror = (e) => this.handleWorkerError(worker, e);
      this.workerPool.push(worker);
      this.idleWorkers.push(worker);
    }
  }

  /**
   * Queues an extraction and resolves with the message the worker posts back.
   *
   * @param bytes - Document bytes, posted to the worker as-is.
   * @param mimeType - MIME type forwarded to the worker.
   * @returns A promise that resolves with the worker's reply, or rejects with
   *   the worker's error event or an Error if the pool has been terminated.
   */
  async extract(bytes: Uint8Array, mimeType: string) {
    return new Promise((resolve, reject) => {
      if (this.terminated) {
        reject(new Error("ExtractionWorker has been terminated"));
        return;
      }
      this.taskQueue.push({ bytes, mimeType, resolve, reject });
      this.processTasks();
    });
  }

  /** Hands queued tasks to idle workers until either runs out. */
  private processTasks() {
    while (this.idleWorkers.length > 0) {
      const task = this.taskQueue.shift();
      if (!task) return;

      const worker = this.idleWorkers.shift();
      if (!worker) {
        this.taskQueue.unshift(task);
        return;
      }

      // Remember which task this worker owns before posting, so the reply can
      // be routed back to the right promise.
      this.inFlight.set(worker, task);
      try {
        worker.postMessage({ bytes: task.bytes, mimeType: task.mimeType });
      } catch (error) {
        // Posting failed synchronously: fail this task and free the worker.
        this.inFlight.delete(worker);
        this.idleWorkers.push(worker);
        task.reject(error);
      }
    }
  }

  /** Detaches and returns the task owned by `worker`, marking the worker idle. */
  private release(worker: Worker): ExtractionTask | undefined {
    const task = this.inFlight.get(worker);
    if (!task) return undefined; // unsolicited message/error: nothing to settle
    this.inFlight.delete(worker);
    this.idleWorkers.push(worker);
    return task;
  }

  private handleWorkerMessage(worker: Worker, result: unknown) {
    const task = this.release(worker);
    if (task) {
      task.resolve(result);
      this.processTasks();
    }
  }

  private handleWorkerError(worker: Worker, error: unknown) {
    const task = this.release(worker);
    if (task) {
      task.reject(error);
      this.processTasks();
    }
  }

  /** Stops all workers and rejects every task that has not completed yet. */
  terminate() {
    this.terminated = true;
    this.workerPool.forEach((w) => w.terminate());

    const unfinished = [...this.inFlight.values(), ...this.taskQueue];
    this.inFlight.clear();
    this.taskQueue.length = 0;
    this.idleWorkers.length = 0;

    for (const task of unfinished) {
      task.reject(new Error("ExtractionWorker has been terminated"));
    }
  }
}
|
||||
20
docs/snippets/wasm/api/batch_extract_bytes_sync.md
Normal file
20
docs/snippets/wasm/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```typescript title="WASM"
|
||||
// WASM has no batch helper; await extractBytes for each input (in parallel via Promise.all).
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const urls = ["document1.pdf", "document2.pdf"];

// Download and extract all documents concurrently. Promise.all rejects as soon
// as any single download or extraction fails.
const results = await Promise.all(
  urls.map(async (url) => {
    const resp = await fetch(url);
    // fetch only rejects on network failure; without this check the body of an
    // HTTP error page would be handed to the extractor as if it were a PDF.
    if (!resp.ok) {
      throw new Error(`Failed to fetch ${url}: HTTP ${resp.status}`);
    }
    const bytes = new Uint8Array(await resp.arrayBuffer());
    return extractBytes(bytes, "application/pdf", undefined);
  }),
);

results.forEach((result, i) => {
  console.log(`Document ${i + 1}: ${result.content.length} characters`);
});
|
||||
```
|
||||
20
docs/snippets/wasm/api/batch_extract_files_sync.md
Normal file
20
docs/snippets/wasm/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```typescript title="WASM"
|
||||
// WASM has no batch helper; await extractBytes for each file (in parallel via Promise.all).
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const input = document.getElementById("files") as HTMLInputElement;
|
||||
const files = Array.from(input.files ?? []);
|
||||
|
||||
const results = await Promise.all(
|
||||
files.map(async (file) => {
|
||||
const bytes = new Uint8Array(await file.arrayBuffer());
|
||||
return extractBytes(bytes, file.type || "application/pdf", undefined);
|
||||
}),
|
||||
);
|
||||
|
||||
results.forEach((result, i) => {
|
||||
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
||||
});
|
||||
```
|
||||
28
docs/snippets/wasm/api/client_chunk_text.md
Normal file
28
docs/snippets/wasm/api/client_chunk_text.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```typescript title="WASM"
|
||||
// HTTP client approach for chunking text via the REST API
|
||||
// Useful in browsers where WASM extraction is called server-side
|
||||
|
||||
const text = "This is a long document that needs to be split into semantic chunks.";

const response = await fetch("http://localhost:8000/chunk", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    text,
    chunker_type: "text",
    config: {
      chunking: {
        strategy: "semantic",
        max_chunk_size: 512,
        overlap: 50,
      },
    },
  }),
});

// fetch resolves for 4xx/5xx responses too; fail here instead of trying to
// read chunks out of an error payload.
if (!response.ok) {
  throw new Error(`Chunking request failed: HTTP ${response.status}`);
}

const result = await response.json();
console.log(`Created ${result.chunks?.length ?? 0} chunks`);
// `result` is untyped JSON, so the callback parameter needs an explicit type.
result.chunks?.forEach((chunk: { content: string }) => {
  console.log(`Chunk: ${chunk.content.substring(0, 50)}...`);
});
|
||||
```
|
||||
22
docs/snippets/wasm/api/client_extract_single_file.md
Normal file
22
docs/snippets/wasm/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```typescript title="WASM"
|
||||
// HTTP client approach for file extraction via multipart upload
|
||||
// Use this when uploading files from a browser form
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
  // Send the file as multipart/form-data together with its MIME type.
  const formData = new FormData();
  formData.append("file", file);
  formData.append("mime_type", file.type || "application/octet-stream");

  const response = await fetch("http://localhost:8000/extract/file", {
    method: "POST",
    body: formData,
  });

  // fetch resolves for 4xx/5xx responses too; an error payload has no
  // `content`, so fail explicitly instead of crashing on `result.content.length`.
  if (!response.ok) {
    throw new Error(`Extraction request failed: HTTP ${response.status}`);
  }

  const result = await response.json();
  console.log(`Extracted ${result.content.length} characters`);
  console.log(`Title: ${result.metadata?.title ?? "Unknown"}`);
}
|
||||
```
|
||||
69
docs/snippets/wasm/api/combining_all_features.md
Normal file
69
docs/snippets/wasm/api/combining_all_features.md
Normal file
@@ -0,0 +1,69 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
|
||||
const file = fileInput.files?.[0];
|
||||
|
||||
if (file) {
|
||||
const bytes = new Uint8Array(await file.arrayBuffer());
|
||||
|
||||
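// Build a comprehensive extraction config.
// NOTE(review): the keys below are snake_case, whereas other WASM snippets in these docs pass camelCase keys (e.g. enableQualityProcessing, includeDocumentStructure) — confirm which casing the WASM binding accepts.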
|
||||
const config = {
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
output_format: "markdown",
|
||||
include_document_structure: true,
|
||||
|
||||
// Chunking configuration
|
||||
chunking: {
|
||||
strategy: "semantic",
|
||||
max_chunk_size: 1024,
|
||||
overlap: 100,
|
||||
},
|
||||
|
||||
// Image extraction configuration
|
||||
images: {
|
||||
extract_images: true,
|
||||
extract_base64: false,
|
||||
extract_raw_bytes: false,
|
||||
},
|
||||
|
||||
// OCR configuration
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
languages: ["eng"],
|
||||
enabled: true,
|
||||
},
|
||||
|
||||
// HTML-specific extraction options
|
||||
html_options: "article, main, .content",
|
||||
|
||||
// PDF-specific options
|
||||
pdf_options: {
|
||||
ocr_strategy: "auto",
|
||||
preserve_images: true,
|
||||
},
|
||||
|
||||
// Security limits
|
||||
security_limits: {
|
||||
max_archive_size: 524288000,
|
||||
max_file_count: 10000,
|
||||
max_compression_ratio: 100,
|
||||
},
|
||||
};
|
||||
|
||||
try {
|
||||
const result = await extractBytes(bytes, file.type || "application/octet-stream", config);
|
||||
console.log(`Content: ${result.content.substring(0, 100)}...`);
|
||||
console.log(`Language: ${result.metadata?.language ?? "Unknown"}`);
|
||||
console.log(`Chunks: ${result.chunks?.length ?? 0}`);
|
||||
if (result.images?.length) {
|
||||
console.log(`Images: ${result.images.length}`);
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("Extraction failed:", err);
|
||||
}
|
||||
}
|
||||
```
|
||||
19
docs/snippets/wasm/api/error_handling.md
Normal file
19
docs/snippets/wasm/api/error_handling.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
|
||||
const file = fileInput.files?.[0];
|
||||
|
||||
if (file) {
|
||||
try {
|
||||
const bytes = new Uint8Array(await file.arrayBuffer());
|
||||
const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
|
||||
console.log(`Extracted: ${result.content.length} characters`);
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
console.error("Extraction failed:", message);
|
||||
}
|
||||
}
|
||||
```
|
||||
40
docs/snippets/wasm/api/error_handling_extract.md
Normal file
40
docs/snippets/wasm/api/error_handling_extract.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Note: WASM has no native batch API; use Promise.all with per-item error handling
|
||||
const files = document.getElementById("files") as HTMLInputElement;
const fileList = Array.from(files.files ?? []);

// Extract multiple files concurrently (simulated batch). Each promise catches
// its own failure, so one bad file never rejects the whole batch.
const extractionPromises = fileList.map(async (file) => {
  try {
    const bytes = new Uint8Array(await file.arrayBuffer());
    const result = await extractBytes(bytes, file.type || "application/octet-stream", undefined);
    // `as const` keeps `success` as the literal `true`/`false`, which makes the
    // two return shapes a discriminated union: checking `item.success` then
    // narrows to the branch that actually has `result` (or `error`).
    return { file: file.name, success: true as const, result };
  } catch (err) {
    return {
      file: file.name,
      success: false as const,
      error: err instanceof Error ? err.message : String(err),
    };
  }
});

const results = await Promise.all(extractionPromises);

// Process results with per-item error handling
results.forEach((item) => {
  if (item.success) {
    console.log(`✓ ${item.file}: ${item.result.content.length} characters`);
  } else {
    console.error(`✗ ${item.file}: ${item.error}`);
  }
});

// Summary
const succeeded = results.filter((r) => r.success).length;
const failed = results.length - succeeded;
console.log(`Extracted ${succeeded}/${results.length} files (${failed} errors)`);
|
||||
```
|
||||
17
docs/snippets/wasm/api/error_handling_wasm.md
Normal file
17
docs/snippets/wasm/api/error_handling_wasm.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
try {
  // The download lives inside the try block so that network and HTTP failures
  // are reported through the same handler as extraction failures.
  const response = await fetch("document.pdf");
  if (!response.ok) {
    throw new Error(`Failed to fetch document.pdf: HTTP ${response.status}`);
  }
  const data = new Uint8Array(await response.arrayBuffer());

  const result = await extractBytes(data, "application/pdf", undefined);
  console.log(`Success: ${result.content.length} characters`);
} catch (error) {
  // Thrown values are not guaranteed to be Error instances; stringify anything
  // else rather than dropping it silently.
  const message = error instanceof Error ? error.message : String(error);
  console.error("Extraction error:", message);
}
|
||||
```
|
||||
12
docs/snippets/wasm/api/extract_bytes_async.md
Normal file
12
docs/snippets/wasm/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const response = await fetch("document.pdf");
|
||||
const data = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", undefined);
|
||||
console.log(`Extracted: ${result.content.length} characters`);
|
||||
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
|
||||
```
|
||||
12
docs/snippets/wasm/api/extract_bytes_sync.md
Normal file
12
docs/snippets/wasm/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```typescript title="WASM"
|
||||
// WASM exposes only async extractBytes; await it from any async context.
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const response = await fetch("document.pdf");
|
||||
const data = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", undefined);
|
||||
console.log(result.content);
|
||||
```
|
||||
14
docs/snippets/wasm/api/extract_file_async.md
Normal file
14
docs/snippets/wasm/api/extract_file_async.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
|
||||
const file = fileInput.files?.[0];
|
||||
if (file) {
|
||||
const bytes = new Uint8Array(await file.arrayBuffer());
|
||||
const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
|
||||
console.log(`Content length: ${result.content.length} characters`);
|
||||
console.log(`Tables: ${result.tables?.length ?? 0}`);
|
||||
}
|
||||
```
|
||||
15
docs/snippets/wasm/api/extract_file_sync.md
Normal file
15
docs/snippets/wasm/api/extract_file_sync.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="WASM"
|
||||
// WASM exposes only async extraction. Read the file as bytes and call extractBytes.
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
|
||||
const file = fileInput.files?.[0];
|
||||
if (file) {
|
||||
const bytes = new Uint8Array(await file.arrayBuffer());
|
||||
const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
|
||||
console.log(result.content);
|
||||
console.log(`Tables: ${result.tables?.length ?? 0}`);
|
||||
}
|
||||
```
|
||||
21
docs/snippets/wasm/api/tables.md
Normal file
21
docs/snippets/wasm/api/tables.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
|
||||
const file = fileInput.files?.[0];
|
||||
|
||||
if (file) {
|
||||
const bytes = new Uint8Array(await file.arrayBuffer());
|
||||
const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
|
||||
|
||||
result.tables?.forEach((table) => {
|
||||
console.log(`Table with ${table.cells?.length ?? 0} rows`);
|
||||
if (table.markdown) {
|
||||
console.log(table.markdown);
|
||||
}
|
||||
table.cells?.forEach((row) => console.log(row.join(" | ")));
|
||||
});
|
||||
}
|
||||
```
|
||||
30
docs/snippets/wasm/cache/ocr-cache.ts
vendored
Normal file
30
docs/snippets/wasm/cache/ocr-cache.ts
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
import { extractBytes, initWasm, TesseractWasmBackend } from "@kreuzberg/wasm";
|
||||
|
||||
/** Downloads an image and returns its bytes, failing on HTTP error statuses. */
async function fetchImageBytes(url: string): Promise<Uint8Array> {
  const response = await fetch(url);
  // Without this check the body of an error page would be OCR'd as an image.
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: HTTP ${response.status}`);
  }
  return new Uint8Array(await response.arrayBuffer());
}

/** Runs one OCR extraction on a PNG and reports its duration under `label`. */
async function timedOcr(label: string, imageBytes: Uint8Array): Promise<void> {
  console.time(label);
  await extractBytes(imageBytes, "image/png", {
    ocr: { backend: "tesseract-wasm", language: "eng" },
  });
  console.timeEnd(label);
}

/**
 * Initializes the Tesseract WASM backend, then runs OCR on two images and
 * prints how long each extraction took so the two timings can be compared.
 * Images are downloaded before the timer starts, so download time is excluded.
 */
async function demonstrateOcrCaching() {
  await initWasm();

  const backend = new TesseractWasmBackend();
  await backend.initialize();

  console.log("Tesseract WASM backend loaded - models cached");

  const imageBytes = await fetchImageBytes("page1.png");
  await timedOcr("First OCR (with model load)", imageBytes);

  console.log("Model cached in memory");

  const imageBytes2 = await fetchImageBytes("page2.png");
  await timedOcr("Second OCR (model cached)", imageBytes2);
}
|
||||
|
||||
demonstrateOcrCaching().catch(console.error);
|
||||
57
docs/snippets/wasm/cache/result-caching.ts
vendored
Normal file
57
docs/snippets/wasm/cache/result-caching.ts
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
import type { ExtractionResult } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * In-memory cache of extraction results keyed by the SHA-256 digest of a
 * file's bytes, so two files with identical content share one cached result.
 */
class ExtractionCache {
  /** Extraction results keyed by hex-encoded SHA-256 content hash. */
  private cache = new Map<string, ExtractionResult>();

  /**
   * Memoized content hash per File object. A WeakMap is used so the cache
   * never keeps a File alive after the caller has dropped it.
   */
  private fileHashes = new WeakMap<File, string>();

  /**
   * Returns the lowercase hex SHA-256 digest of the file's contents.
   * The digest is computed once per File object and memoized afterwards.
   */
  async getHash(file: File): Promise<string> {
    const known = this.fileHashes.get(file);
    if (known !== undefined) {
      return known;
    }

    const buffer = await file.arrayBuffer();
    const digest = await crypto.subtle.digest("SHA-256", buffer);
    const hashStr = Array.from(new Uint8Array(digest), (b) => b.toString(16).padStart(2, "0")).join("");

    this.fileHashes.set(file, hashStr);
    return hashStr;
  }

  /**
   * Returns the cached result for the file's content hash, or extracts the
   * file with `extractBytes` (using `file.type` as the MIME type), stores the
   * result under that hash and returns it.
   */
  async extract(file: File): Promise<ExtractionResult> {
    const hash = await this.getHash(file);

    const cached = this.cache.get(hash);
    if (cached !== undefined) {
      console.log("Cache hit for", file.name);
      return cached;
    }

    console.log("Cache miss for", file.name);
    const bytes = new Uint8Array(await file.arrayBuffer());
    const result = await extractBytes(bytes, file.type);

    this.cache.set(hash, result);
    return result;
  }

  /** Drops every cached result and every memoized hash. */
  clear() {
    this.cache.clear();
    // WeakMap has no clear(); replacing the instance discards all entries.
    this.fileHashes = new WeakMap<File, string>();
  }

  /** Returns the number of cached extraction results. */
  getSize() {
    return this.cache.size;
  }
}
|
||||
|
||||
/**
 * Runs one extraction through an ExtractionCache and logs how many results
 * the cache holds afterwards.
 */
async function demonstrateCaching() {
  await initWasm();

  const cache = new ExtractionCache();

  // The File constructor does not derive a MIME type from the file name, so
  // the type is given explicitly. Without it `file.type` is "" and that empty
  // string is what ExtractionCache.extract passes on to extractBytes.
  const file = new File([], "test.pdf", { type: "application/pdf" });

  await cache.extract(file);
  console.log("Cache size:", cache.getSize());
}
|
||||
|
||||
demonstrateCaching().catch(console.error);
|
||||
43
docs/snippets/wasm/cache/session-storage.ts
vendored
Normal file
43
docs/snippets/wasm/cache/session-storage.ts
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
import type { ExtractionResult } from "@kreuzberg/wasm";
|
||||
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts `file`, reusing a result stored in `sessionStorage` when one exists.
 *
 * The storage key is built from the file's name, size and last-modified
 * timestamp under the `extraction_` prefix, so an edited file that keeps its
 * name and size does not pick up a stale result. A stored entry that cannot
 * be parsed is removed and the file is extracted again. Failing to store a
 * fresh result is never fatal: the result is still returned.
 *
 * @param file - The file to extract.
 * @returns The cached or freshly extracted result.
 */
async function _cacheResultInSessionStorage(file: File): Promise<ExtractionResult> {
  await initWasm();

  const cacheKey = `extraction_${file.name}_${file.size}_${file.lastModified}`;

  const cached = sessionStorage.getItem(cacheKey);
  if (cached) {
    try {
      const parsed: unknown = JSON.parse(cached);
      if (typeof parsed === "object" && parsed !== null) {
        console.log("Loading from session storage");
        // Only the overall shape is checked here; the entry was written by
        // this function from an ExtractionResult.
        return parsed as ExtractionResult;
      }
    } catch {
      // Malformed JSON: handled below together with non-object entries.
    }
    console.warn("Discarding unreadable session storage entry");
    sessionStorage.removeItem(cacheKey);
  }

  console.log("Extracting and caching result");
  const result = await extractFromFile(file);

  try {
    sessionStorage.setItem(cacheKey, JSON.stringify(result));
  } catch (error) {
    if (error instanceof Error && error.name === "QuotaExceededError") {
      console.warn("Session storage full, skipping cache");
    } else {
      // Caching is best-effort, but other failures should not vanish silently.
      console.warn("Could not cache extraction result:", error);
    }
  }

  return result;
}
|
||||
|
||||
/**
 * Removes every `sessionStorage` entry whose key starts with `prefix` and
 * logs how many entries were removed.
 *
 * @param prefix - Key prefix identifying cached results. Defaults to
 *   `"extraction_"`.
 * @returns The number of entries removed.
 */
async function clearExtractionCache(prefix = "extraction_"): Promise<number> {
  // Snapshot the matching keys first so removals cannot disturb the iteration.
  const matching = Object.keys(sessionStorage).filter((key) => key.startsWith(prefix));

  for (const key of matching) {
    sessionStorage.removeItem(key);
  }

  console.log(`Cleared ${matching.length} cached results`);
  return matching.length;
}
|
||||
|
||||
clearExtractionCache();
|
||||
35
docs/snippets/wasm/config/advanced_config.md
Normal file
35
docs/snippets/wasm/config/advanced_config.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```typescript title="WASM"
|
||||
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract-wasm",
|
||||
language: "eng",
|
||||
},
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
chunkOverlap: 100,
|
||||
},
|
||||
enable_language_detection: true,
|
||||
enable_quality: true,
|
||||
};
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
|
||||
const file = fileInput.files?.[0];
|
||||
|
||||
if (file) {
|
||||
const result = await extractFromFile(file, file.type, config);
|
||||
|
||||
if (result.chunks) {
|
||||
for (const chunk of result.chunks) {
|
||||
console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
|
||||
}
|
||||
}
|
||||
|
||||
if (result.detectedLanguages) {
|
||||
console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
|
||||
}
|
||||
}
|
||||
```
|
||||
29
docs/snippets/wasm/config/basic-config.ts
Normal file
29
docs/snippets/wasm/config/basic-config.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
async function extractWithConfig() {
|
||||
await initWasm();
|
||||
|
||||
const bytes = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config: ExtractionConfig = {
|
||||
ocr: {
|
||||
backend: "tesseract-wasm",
|
||||
language: "eng",
|
||||
},
|
||||
images: {
|
||||
extractImages: true,
|
||||
targetDpi: 200,
|
||||
},
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
chunkOverlap: 100,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
console.log("Extraction complete");
|
||||
console.log("Content length:", result.content.length);
|
||||
}
|
||||
|
||||
extractWithConfig().catch(console.error);
|
||||
66
docs/snippets/wasm/config/chunking-config.ts
Normal file
66
docs/snippets/wasm/config/chunking-config.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts `book.pdf` with character-based chunking and prints, for the first
 * three chunks, the character range and the first 100 characters of content.
 *
 * The heading-context variant of this configuration is shown in
 * `extractWithPrependHeadingContext`.
 */
async function extractWithChunking() {
  await initWasm();

  const bytes = new Uint8Array(await fetch("book.pdf").then((r) => r.arrayBuffer()));

  const config: ExtractionConfig = {
    chunking: {
      maxChars: 800,
      chunkOverlap: 150,
      splitOnNewlines: true,
      splitOnSentences: true,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  if (result.chunks) {
    console.log(`Total chunks: ${result.chunks.length}`);

    result.chunks.slice(0, 3).forEach((chunk, i) => {
      console.log(`\nChunk ${i}:`);
      console.log(`Chars: ${chunk.metadata.charStart}-${chunk.metadata.charEnd}`);
      console.log(`Content: ${chunk.content.substring(0, 100)}...`);
    });
  }
}
|
||||
|
||||
/**
 * Extracts `document.md` with the markdown chunker and `prependHeadingContext`
 * enabled, then prints the first 100 characters of the first three chunks.
 */
async function extractWithPrependHeadingContext() {
  await initWasm();

  const markdownBuffer = await fetch("document.md").then((r) => r.arrayBuffer());
  const markdownBytes = new Uint8Array(markdownBuffer);

  const headingAwareConfig: ExtractionConfig = {
    chunking: { chunkerType: "markdown", maxChars: 800, prependHeadingContext: true },
  };

  const extraction = await extractBytes(markdownBytes, "text/markdown", headingAwareConfig);

  if (!extraction.chunks) {
    return;
  }

  console.log(`Total chunks: ${extraction.chunks.length}`);

  // Each chunk's content is prefixed with its heading breadcrumb
  const preview = extraction.chunks.slice(0, 3);
  for (let i = 0; i < preview.length; i++) {
    console.log(`\nChunk ${i}: ${preview[i].content.substring(0, 100)}...`);
  }
}
|
||||
|
||||
extractWithChunking().catch(console.error);
|
||||
extractWithPrependHeadingContext().catch(console.error);
|
||||
71
docs/snippets/wasm/config/chunking_config.md
Normal file
71
docs/snippets/wasm/config/chunking_config.md
Normal file
@@ -0,0 +1,71 @@
|
||||
```typescript title="WASM"
|
||||
import { initWasm, extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
chunkOverlap: 100,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
result.chunks?.forEach((chunk, idx) => {
|
||||
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
|
||||
console.log(`Tokens: ${chunk.metadata?.token_count}`);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Markdown with Heading Context"
|
||||
import { initWasm, extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "markdown",
|
||||
maxChars: 2000,
|
||||
// Note: Token-based sizing is not available in WASM builds.
|
||||
// Use character-based sizing instead.
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/markdown", config);
|
||||
|
||||
result.chunks?.forEach((chunk, idx) => {
|
||||
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
|
||||
|
||||
if (chunk.metadata?.headingContext?.headings) {
|
||||
console.log("Headings:");
|
||||
chunk.metadata.headingContext.headings.forEach((h) => {
|
||||
console.log(` Level ${h.level}: ${h.text}`);
|
||||
});
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Prepend Heading Context"
|
||||
import { initWasm, extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "markdown",
|
||||
maxChars: 2000,
|
||||
prependHeadingContext: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/markdown", config);
|
||||
|
||||
result.chunks?.forEach((chunk, idx) => {
|
||||
// Each chunk's content is prefixed with its heading breadcrumb
|
||||
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 80)}...`);
|
||||
});
|
||||
```
|
||||
34
docs/snippets/wasm/config/combined-config.ts
Normal file
34
docs/snippets/wasm/config/combined-config.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts `complex.pdf` with OCR, chunking and image extraction configured
 * together, then prints a summary of what the result contains.
 *
 * Every optional part of the result is read defensively so the summary never
 * throws and never prints "undefined".
 */
async function extractWithFullConfig() {
  await initWasm();

  const bytes = new Uint8Array(await fetch("complex.pdf").then((r) => r.arrayBuffer()));

  const config: ExtractionConfig = {
    ocr: {
      backend: "tesseract-wasm",
      language: "deu",
    },
    chunking: {
      maxChars: 1000,
      chunkOverlap: 200,
    },
    images: {
      extractImages: true,
      targetDpi: 200,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  console.log("=== Extraction Results ===");
  console.log(`Content: ${result.content.length} chars`);
  console.log(`Chunks: ${result.chunks?.length ?? 0}`);
  console.log(`Images: ${result.images?.length ?? 0}`);
  // Guarded like chunks and images above, matching the other snippets.
  console.log(`Tables: ${result.tables?.length ?? 0}`);
  console.log(`Languages: ${result.detectedLanguages?.join(", ") ?? "none detected"}`);
}
|
||||
|
||||
extractWithFullConfig().catch(console.error);
|
||||
34
docs/snippets/wasm/config/conditional-config.ts
Normal file
34
docs/snippets/wasm/config/conditional-config.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Builds an extraction config from `fileSize` (in bytes) and extracts
 * `document.pdf` with it.
 *
 * - above 10 MiB: chunking is configured (500 chars, 50 overlap)
 * - below 1 MiB: image extraction is configured (300 DPI)
 * - `ocr.enabled` is true only below 50 MiB
 *
 * @param fileSize - Size in bytes used to pick the configuration.
 * @returns The extraction result.
 */
async function extractWithConditionalConfig(fileSize: number) {
  await initWasm();

  const mebibyte = 1024 * 1024;
  const isLarge = fileSize > 10 * mebibyte;
  const isSmall = fileSize < 1 * mebibyte;

  // Spreads keep the same property order as assigning the fields one by one.
  const config: ExtractionConfig = {
    ...(isLarge ? { chunking: { maxChars: 500, chunkOverlap: 50 } } : {}),
    ...(isSmall ? { images: { extractImages: true, targetDpi: 300 } } : {}),
    ocr: { enabled: fileSize < 50 * mebibyte },
  };

  const pdfBuffer = await fetch("document.pdf").then((r) => r.arrayBuffer());

  return await extractBytes(new Uint8Array(pdfBuffer), "application/pdf", config);
}
|
||||
|
||||
// Report failures instead of leaving the rejection unhandled.
extractWithConditionalConfig(5 * 1024 * 1024)
  .then((_r) => console.log("Done"))
  .catch(console.error);
|
||||
19
docs/snippets/wasm/config/config_basic.md
Normal file
19
docs/snippets/wasm/config/config_basic.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```typescript title="WASM"
|
||||
import { initWasm, extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract-wasm",
|
||||
language: "eng",
|
||||
},
|
||||
images: {
|
||||
extractImages: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
console.log(result.content);
|
||||
```
|
||||
18
docs/snippets/wasm/config/config_discover.md
Normal file
18
docs/snippets/wasm/config/config_discover.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```typescript title="WASM"
|
||||
import { initWasm, extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
ocr: {
|
||||
backend: "tesseract-wasm",
|
||||
language: "eng",
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
console.log(result.content);
|
||||
```
|
||||
18
docs/snippets/wasm/config/config_ocr.md
Normal file
18
docs/snippets/wasm/config/config_ocr.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("scanned.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Content length: ${result.content.length}`);
|
||||
console.log(`Tables detected: ${result.tables?.length || 0}`);
|
||||
```
|
||||
26
docs/snippets/wasm/config/config_programmatic.md
Normal file
26
docs/snippets/wasm/config/config_programmatic.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
use_cache: true,
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+deu",
|
||||
tesseract_config: {
|
||||
psm: 6,
|
||||
},
|
||||
},
|
||||
chunking: {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
},
|
||||
enable_quality_processing: true,
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Content length: ${result.content.length}`);
|
||||
```
|
||||
15
docs/snippets/wasm/config/document_structure_config.md
Normal file
15
docs/snippets/wasm/config/document_structure_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="Document Structure Config (WASM)"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const config = {
  includeDocumentStructure: true,
};

// The call must be awaited: without `await`, `result` is a Promise and
// `result.document` is always undefined, so the loop below never runs.
const result = await extractBytes(fileBuffer, "application/pdf", config);

if (result.document) {
  for (const node of result.document.nodes) {
    console.log(`[${node.content.nodeType}]`);
  }
}
|
||||
```
|
||||
35
docs/snippets/wasm/config/element_based_output.md
Normal file
35
docs/snippets/wasm/config/element_based_output.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```typescript title="Element-Based Output (WASM)"
|
||||
import { extractFileSync, ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
// Configure element-based output
|
||||
const config: ExtractionConfig = {
|
||||
outputFormat: "element_based",
|
||||
};
|
||||
|
||||
// Extract document
|
||||
const result = extractFileSync(fileBuffer, "application/pdf", config);
|
||||
|
||||
// Access elements
|
||||
for (const element of result.elements) {
|
||||
console.log(`Type: ${element.elementType}`);
|
||||
console.log(`Text: ${element.text.slice(0, 100)}`);
|
||||
|
||||
if (element.metadata.pageNumber) {
|
||||
console.log(`Page: ${element.metadata.pageNumber}`);
|
||||
}
|
||||
|
||||
if (element.metadata.coordinates) {
|
||||
const coords = element.metadata.coordinates;
|
||||
console.log(`Coords: (${coords.left}, ${coords.top}) - (${coords.right}, ${coords.bottom})`);
|
||||
}
|
||||
|
||||
console.log("---");
|
||||
}
|
||||
|
||||
// Filter by element type
|
||||
const titles = result.elements.filter((e) => e.elementType === "title");
|
||||
for (const title of titles) {
|
||||
const level = title.metadata.additional?.level || "unknown";
|
||||
console.log(`[${level}] ${title.text}`);
|
||||
}
|
||||
```
|
||||
105
docs/snippets/wasm/config/hierarchy_config.ts
Normal file
105
docs/snippets/wasm/config/hierarchy_config.ts
Normal file
@@ -0,0 +1,105 @@
|
||||
import {
|
||||
type ExtractionConfig,
|
||||
type HierarchyConfig,
|
||||
Kreuzberg,
|
||||
type PdfConfig,
|
||||
} from "kreuzberg-wasm";
|
||||
|
||||
// Example 1: Basic hierarchy extraction
|
||||
// Enabled with default kClusters=6 for standard H1-H6 heading hierarchy.
|
||||
// Extract bounding box information for spatial layout awareness.
|
||||
const hierarchyConfigBasic: HierarchyConfig = {
|
||||
enabled: true,
|
||||
kClusters: 6, // Default: creates 6 font size clusters (H1-H6 structure)
|
||||
includeBbox: true, // Include bounding box coordinates
|
||||
ocrCoverageThreshold: undefined, // No OCR coverage threshold
|
||||
};
|
||||
|
||||
const pdfConfigBasic: PdfConfig = {
|
||||
hierarchy: hierarchyConfigBasic,
|
||||
};
|
||||
|
||||
const extractionConfigBasic: ExtractionConfig = {
|
||||
pdfOptions: pdfConfigBasic,
|
||||
};
|
||||
|
||||
// const kreuzberg = new Kreuzberg(extractionConfigBasic);
|
||||
// const result = await kreuzberg.extractFile("document.pdf");
|
||||
|
||||
// Example 2: Custom kClusters for minimal structure
|
||||
// Use 3 clusters for simpler hierarchy with minimal structure.
|
||||
// Useful when you only need major section divisions (Main, Subsection, Detail).
|
||||
const hierarchyConfigMinimal: HierarchyConfig = {
|
||||
enabled: true,
|
||||
kClusters: 3, // Minimal clustering: just 3 levels
|
||||
includeBbox: true,
|
||||
ocrCoverageThreshold: undefined,
|
||||
};
|
||||
|
||||
const pdfConfigMinimal: PdfConfig = {
|
||||
hierarchy: hierarchyConfigMinimal,
|
||||
};
|
||||
|
||||
const _extractionConfigMinimal: ExtractionConfig = {
|
||||
pdfOptions: pdfConfigMinimal,
|
||||
};
|
||||
|
||||
// const result = await kreuzberg.extractFile("document.pdf");
|
||||
|
||||
// Example 3: With OCR coverage threshold
|
||||
// Trigger OCR if less than 50% of text has font data.
|
||||
// Useful for documents with mixed digital and scanned content.
|
||||
const hierarchyConfigOcr: HierarchyConfig = {
|
||||
enabled: true,
|
||||
kClusters: 6,
|
||||
includeBbox: true,
|
||||
ocrCoverageThreshold: 0.5, // Trigger OCR if text coverage < 50%
|
||||
};
|
||||
|
||||
const pdfConfigOcr: PdfConfig = {
|
||||
hierarchy: hierarchyConfigOcr,
|
||||
};
|
||||
|
||||
const _extractionConfigOcr: ExtractionConfig = {
|
||||
pdfOptions: pdfConfigOcr,
|
||||
};
|
||||
|
||||
// const result = await kreuzberg.extractFile("document.pdf");
|
||||
|
||||
// Integration with Kreuzberg WASM instance
|
||||
async function _extractWithHierarchy(): Promise<void> {
|
||||
const config = extractionConfigBasic;
|
||||
const kreuzberg = new Kreuzberg(config);
|
||||
|
||||
try {
|
||||
// Extract from file (requires file input or fetch)
|
||||
const result = await kreuzberg.extractFile("document.pdf");
|
||||
console.log("Extraction complete:", result);
|
||||
} catch (error) {
|
||||
console.error("Extraction failed:", error);
|
||||
}
|
||||
}
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// enabled: boolean (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// kClusters: number (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// includeBbox: boolean (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// ocrCoverageThreshold: number | undefined (default: undefined)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
|
||||
// - undefined means no OCR coverage-based triggering
|
||||
//
|
||||
17
docs/snippets/wasm/config/html_output.md
Normal file
17
docs/snippets/wasm/config/html_output.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
output_format: "html",
|
||||
html_output: {
|
||||
theme: "github",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(result.content); // HTML with kb-* classes
|
||||
```
|
||||
31
docs/snippets/wasm/config/image-config.ts
Normal file
31
docs/snippets/wasm/config/image-config.ts
Normal file
@@ -0,0 +1,31 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts `document.pdf` with image extraction configured and prints the
 * dimensions, format and description of every extracted image.
 */
async function extractImagesWithConfig() {
  await initWasm();

  const bytes = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));

  const config: ExtractionConfig = {
    images: {
      extractImages: true,
      targetDpi: 300,
      maxDimension: 2048,
      preserveAspectRatio: true,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  if (result.images) {
    console.log(`Extracted ${result.images.length} images`);

    result.images.forEach((image) => {
      // The value printed last is the image description, so it is labelled as such.
      console.log(
        `Image: ${image.width}x${image.height}, Format: ${image.format}, Description: ${image.description}`,
      );
    });
  }
}
|
||||
|
||||
extractImagesWithConfig().catch(console.error);
|
||||
65
docs/snippets/wasm/config/keyword_config.ts
Normal file
65
docs/snippets/wasm/config/keyword_config.ts
Normal file
@@ -0,0 +1,65 @@
|
||||
import { type ExtractionConfig, extractFile } from "@kreuzberg/wasm";
|
||||
|
||||
// Example 1: Basic YAKE configuration
|
||||
// Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
async function basicYake(): Promise<void> {
|
||||
const config: ExtractionConfig = {
|
||||
keywords: {
|
||||
algorithm: "yake",
|
||||
maxKeywords: 10,
|
||||
minScore: 0.0,
|
||||
ngramRange: [1, 3],
|
||||
language: "en",
|
||||
yakeParams: null,
|
||||
rakeParams: null,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log("Keywords:", result.keywords);
|
||||
}
|
||||
|
||||
// Example 2: Advanced YAKE with custom parameters
|
||||
// Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
async function _advancedYake(): Promise<void> {
|
||||
const config: ExtractionConfig = {
|
||||
keywords: {
|
||||
algorithm: "yake",
|
||||
maxKeywords: 15,
|
||||
minScore: 0.1,
|
||||
ngramRange: [1, 2],
|
||||
language: "en",
|
||||
yakeParams: {
|
||||
windowSize: 1,
|
||||
},
|
||||
rakeParams: null,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log("Keywords:", result.keywords);
|
||||
}
|
||||
|
||||
// Example 3: RAKE configuration
|
||||
// Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
async function _rakeConfig(): Promise<void> {
|
||||
const config: ExtractionConfig = {
|
||||
keywords: {
|
||||
algorithm: "rake",
|
||||
maxKeywords: 10,
|
||||
minScore: 5.0,
|
||||
ngramRange: [1, 3],
|
||||
language: "en",
|
||||
yakeParams: null,
|
||||
rakeParams: {
|
||||
minWordLength: 1,
|
||||
maxWordsPerPhrase: 3,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("document.pdf", null, config);
|
||||
console.log("Keywords:", result.keywords);
|
||||
}
|
||||
|
||||
// Report failures instead of leaving the rejection unhandled.
basicYake().catch(console.error);
|
||||
20
docs/snippets/wasm/config/keyword_extraction_config.md
Normal file
20
docs/snippets/wasm/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
keywords: {
|
||||
algorithm: "yake",
|
||||
max_keywords: 10,
|
||||
min_score: 0.1,
|
||||
ngram_range: [1, 3],
|
||||
language: "en",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Keywords: ${JSON.stringify(result.keywords)}`);
|
||||
```
|
||||
19
docs/snippets/wasm/config/language_detection_config.md
Normal file
19
docs/snippets/wasm/config/language_detection_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
language_detection: {
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Detected language: ${result.language}`);
|
||||
console.log(`Confidence: ${result.language_confidence}`);
|
||||
```
|
||||
26
docs/snippets/wasm/config/ocr-config.ts
Normal file
26
docs/snippets/wasm/config/ocr-config.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts `scanned.pdf` with the `tesseract-wasm` OCR backend (language
 * `eng`), prints the extracted text and, when present, the detected languages.
 */
async function extractWithOcr() {
  await initWasm();

  const pdfBuffer = await fetch("scanned.pdf").then((r) => r.arrayBuffer());
  const pdfBytes = new Uint8Array(pdfBuffer);

  const ocrConfig: ExtractionConfig = {
    ocr: { backend: "tesseract-wasm", language: "eng" },
  };

  const extraction = await extractBytes(pdfBytes, "application/pdf", ocrConfig);

  console.log("Extracted text from scanned document:");
  console.log(extraction.content);

  if (!extraction.detectedLanguages) {
    return;
  }
  console.log("Detected languages:", extraction.detectedLanguages);
}
|
||||
|
||||
extractWithOcr().catch(console.error);
|
||||
21
docs/snippets/wasm/config/ocr_dpi_config.md
Normal file
21
docs/snippets/wasm/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
images: {
|
||||
extract_images: true,
|
||||
target_dpi: 300,
|
||||
max_image_dimension: 4096,
|
||||
auto_adjust_dpi: true,
|
||||
min_dpi: 150,
|
||||
max_dpi: 600,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Extracted images: ${result.images?.length || 0}`);
|
||||
```
|
||||
20
docs/snippets/wasm/config/pdf_config.md
Normal file
20
docs/snippets/wasm/config/pdf_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("encrypted.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
pdf_options: {
|
||||
extract_images: true,
|
||||
passwords: ["password123"],
|
||||
extract_metadata: true,
|
||||
hierarchy: {},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Title: ${result.metadata?.title}`);
|
||||
console.log(`Authors: ${result.metadata?.authors}`);
|
||||
```
|
||||
22
docs/snippets/wasm/config/pdf_hierarchy_config.md
Normal file
22
docs/snippets/wasm/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
pdf_options: {
|
||||
hierarchy: {
|
||||
enabled: true,
|
||||
detection_threshold: 0.75,
|
||||
ocr_coverage_threshold: 0.8,
|
||||
min_level: 1,
|
||||
max_level: 5,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Hierarchy levels: ${result.hierarchy?.length || 0}`);
|
||||
```
|
||||
17
docs/snippets/wasm/config/postprocessor_config.md
Normal file
17
docs/snippets/wasm/config/postprocessor_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
postprocessor: {
|
||||
enabled: true,
|
||||
enabled_processors: ["whitespace_normalizer", "unicode_normalizer"],
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Processed content: ${result.content}`);
|
||||
```
|
||||
16
docs/snippets/wasm/config/quality_processing_config.md
Normal file
16
docs/snippets/wasm/config/quality_processing_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
enable_quality_processing: true,
|
||||
use_cache: true,
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Quality score: ${result.quality_score}`);
|
||||
console.log(`Processing time: ${result.processing_time}`);
|
||||
```
|
||||
21
docs/snippets/wasm/config/tesseract_config.md
Normal file
21
docs/snippets/wasm/config/tesseract_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("scanned.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+deu",
|
||||
tesseract_config: {
|
||||
psm: 6,
|
||||
oem: 3,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`OCR text: ${result.content}`);
|
||||
```
|
||||
18
docs/snippets/wasm/config/token_reduction_config.md
Normal file
18
docs/snippets/wasm/config/token_reduction_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
token_reduction: {
|
||||
mode: "moderate",
|
||||
preserve_important_words: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Original tokens: ${result.token_count}`);
|
||||
console.log(`Reduced content: ${result.content}`);
|
||||
```
|
||||
26
docs/snippets/wasm/getting-started/async-extraction.ts
Normal file
26
docs/snippets/wasm/getting-started/async-extraction.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
import { extractBytes, getWasmCapabilities, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts several in-memory documents concurrently.
 *
 * @param files - Raw bytes of each document.
 * @param mimeTypes - MIME type of each document; `mimeTypes[i]` describes `files[i]`.
 * @returns One `{ content, pageCount }` summary per input file, in input order.
 * @throws RangeError if `files` and `mimeTypes` differ in length.
 * @throws Error if the runtime reports no WebAssembly support.
 */
async function extractDocuments(files: Uint8Array[], mimeTypes: string[]) {
  // The two arrays are paired by index, so a length mismatch would silently
  // hand `undefined` to the extractor as a MIME type.
  if (files.length !== mimeTypes.length) {
    throw new RangeError(
      `Expected one MIME type per file, got ${files.length} file(s) and ${mimeTypes.length} MIME type(s)`,
    );
  }

  const caps = getWasmCapabilities();
  if (!caps.hasWasm) {
    throw new Error("WebAssembly not supported");
  }

  await initWasm();

  // Promise.all keeps the results in the same order as `files`.
  const results = await Promise.all(
    files.map((bytes, index) => extractBytes(bytes, mimeTypes[index])),
  );

  return results.map((r) => ({
    content: r.content,
    pageCount: r.metadata?.pageCount,
  }));
}
|
||||
|
||||
const fileBytes = [new Uint8Array([1, 2, 3])];
|
||||
const mimes = ["application/pdf"];
|
||||
|
||||
extractDocuments(fileBytes, mimes)
|
||||
.then((results) => console.log(results))
|
||||
.catch(console.error);
|
||||
17
docs/snippets/wasm/getting-started/basic-extract.ts
Normal file
17
docs/snippets/wasm/getting-started/basic-extract.ts
Normal file
@@ -0,0 +1,17 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Downloads `document.pdf`, extracts it, and logs the extracted content,
 * the reported MIME type, and the document metadata.
 */
async function main() {
  await initWasm();

  const response = await fetch("document.pdf");
  const pdfBytes = new Uint8Array(await response.arrayBuffer());

  const extraction = await extractBytes(pdfBytes, "application/pdf");

  console.log("Extracted content:");
  console.log(extraction.content);
  console.log("MIME type:", extraction.mimeType);
  console.log("Metadata:", extraction.metadata);
}
|
||||
|
||||
main().catch(console.error);
|
||||
12
docs/snippets/wasm/getting-started/basic_usage.md
Normal file
12
docs/snippets/wasm/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const response = await fetch("document.pdf");
|
||||
const data = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", undefined);
|
||||
console.log(result.content);
|
||||
console.log(`MIME Type: ${result.mime_type}`);
|
||||
```
|
||||
33
docs/snippets/wasm/getting-started/batch-processing.ts
Normal file
33
docs/snippets/wasm/getting-started/batch-processing.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/** One unit of work for the batch processor below. */
interface DocumentJob {
|
||||
// Identifier for the document; used as the key in the batch results and in error logs.
name: string;
|
||||
// Raw document bytes handed to `extractBytes`.
bytes: Uint8Array;
|
||||
// MIME type passed to `extractBytes` together with `bytes`.
mimeType: string;
|
||||
}
|
||||
|
||||
/**
 * Extracts a batch of documents with a bounded number of concurrent workers.
 *
 * Each worker repeatedly takes the next job from a shared queue until the queue
 * is empty. A failed extraction is logged and skipped; it does not abort the batch.
 *
 * @param documents - Jobs to process. The input array is not mutated.
 * @param concurrency - Maximum number of extractions in flight. Values that are
 *   not a positive integer are floored and clamped to at least 1.
 * @returns Extracted content keyed by `DocumentJob.name`. Jobs that failed are
 *   absent; jobs sharing a name overwrite each other.
 */
async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
  await initWasm();

  const results: Record<string, string> = {};
  const queue = [...documents];

  // `Array(concurrency)` throws for fractional, negative or infinite values and
  // creates no workers for 0, so normalize the count instead. There is also no
  // point in starting more workers than there are jobs.
  const requested = Math.max(1, Math.floor(concurrency) || 1);
  const workerCount = Math.min(queue.length, requested);

  const workers = Array.from({ length: workerCount }, async () => {
    for (let doc = queue.shift(); doc !== undefined; doc = queue.shift()) {
      try {
        const result = await extractBytes(doc.bytes, doc.mimeType);
        results[doc.name] = result.content;
      } catch (error) {
        console.error(`Failed to process ${doc.name}:`, error);
      }
    }
  });

  await Promise.all(workers);
  return results;
}
|
||||
@@ -0,0 +1,14 @@
|
||||
```typescript title="WASM"
|
||||
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const fileInputs = document.getElementById("files") as HTMLInputElement;
|
||||
const files = Array.from(fileInputs.files || []);
|
||||
|
||||
const results = await Promise.all(files.map((file) => extractFromFile(file)));
|
||||
|
||||
results.forEach((result, i) => {
|
||||
console.log(`File ${i + 1}: ${result.content.length} characters`);
|
||||
});
|
||||
```
|
||||
29
docs/snippets/wasm/getting-started/browser-file-input.ts
Normal file
29
docs/snippets/wasm/getting-started/browser-file-input.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Initializes the WASM module and wires the `#file-input` element so that the
 * first selected file is extracted and displayed whenever the selection changes.
 *
 * @throws Error if the page has no `<input id="file-input">` element.
 */
async function setupFileInput() {
  await initWasm();

  // getElementById may return null or a non-input element; check at runtime
  // rather than asserting the type.
  const fileInput = document.getElementById("file-input");
  if (!(fileInput instanceof HTMLInputElement)) {
    throw new Error('Expected an <input id="file-input"> element in the document');
  }

  fileInput.addEventListener("change", async () => {
    const file = fileInput.files?.[0];
    if (!file) return;

    try {
      const result = await extractFromFile(file);
      console.log("Extracted text:", result.content);
      displayResults(result);
    } catch (error) {
      console.error("Extraction failed:", error);
    }
  });
}
|
||||
|
||||
/**
 * Shows a preview of the extracted content in the `#output` element.
 *
 * At most the first 500 characters are shown; an ellipsis is appended only
 * when the content was actually truncated. Does nothing if `#output` is absent.
 *
 * @param result - Any object carrying the extracted text in `content`.
 */
function displayResults(result: { content: string }) {
  const output = document.getElementById("output");
  if (!output) return;

  const previewLength = 500;
  const { content } = result;

  output.textContent =
    content.length > previewLength ? `${content.substring(0, previewLength)}...` : content;
}
|
||||
|
||||
setupFileInput().catch(console.error);
|
||||
12
docs/snippets/wasm/getting-started/extract_bytes_sync.md
Normal file
12
docs/snippets/wasm/getting-started/extract_bytes_sync.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```typescript title="WASM"
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const response = await fetch("document.pdf");
|
||||
const buffer = await response.arrayBuffer();
|
||||
const data = new Uint8Array(buffer);
|
||||
|
||||
const result = await extractBytes(data, "application/pdf");
|
||||
console.log(result.content);
|
||||
```
|
||||
10
docs/snippets/wasm/getting-started/extract_file.md
Normal file
10
docs/snippets/wasm/getting-started/extract_file.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractFile } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const result = await extractFile("document.pdf", undefined, undefined);
|
||||
console.log(`Extracted content: ${result.content}`);
|
||||
console.log(`Tables found: ${result.tables?.length ?? 0}`);
|
||||
console.log(`Format: ${result.metadata?.format ?? "unknown"}`);
|
||||
```
|
||||
17
docs/snippets/wasm/getting-started/extract_file_async.md
Normal file
17
docs/snippets/wasm/getting-started/extract_file_async.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
|
||||
const file = fileInput.files?.[0];
|
||||
|
||||
if (file) {
|
||||
const result = await extractFromFile(file);
|
||||
const content = result.content;
|
||||
const tableCount = result.tables.length;
|
||||
|
||||
console.log(`Content length: ${content.length} characters`);
|
||||
console.log(`Tables: ${tableCount}`);
|
||||
}
|
||||
```
|
||||
15
docs/snippets/wasm/getting-started/extract_file_sync.md
Normal file
15
docs/snippets/wasm/getting-started/extract_file_sync.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="WASM"
|
||||
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
|
||||
const file = fileInput.files?.[0];
|
||||
|
||||
if (file) {
|
||||
const result = await extractFromFile(file);
|
||||
console.log(result.content);
|
||||
console.log(`Tables: ${result.tables.length}`);
|
||||
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
|
||||
}
|
||||
```
|
||||
20
docs/snippets/wasm/getting-started/extract_with_ocr.md
Normal file
20
docs/snippets/wasm/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const response = await fetch("scanned.pdf");
|
||||
const data = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
const config = {
|
||||
force_ocr: true,
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(result.content);
|
||||
console.log(`Detected languages: ${result.detected_languages?.join(", ") ?? "unknown"}`);
|
||||
```
|
||||
9
docs/snippets/wasm/getting-started/hello_world.md
Normal file
9
docs/snippets/wasm/getting-started/hello_world.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array([0x25, 0x50, 0x44, 0x46]); // PDF magic bytes
|
||||
const result = await extractBytes(data, "application/pdf", undefined);
|
||||
console.log(result.content);
|
||||
```
|
||||
25
docs/snippets/wasm/getting-started/initialization.ts
Normal file
25
docs/snippets/wasm/getting-started/initialization.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
import { getVersion, getWasmCapabilities, initWasm, isInitialized } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Initializes the WASM module once (if the runtime supports WebAssembly) and
 * logs the library version plus the detected worker / SharedArrayBuffer support.
 * Failures are reported via `console.error` instead of being thrown.
 */
async function initializeKreuzberg() {
  const capabilities = getWasmCapabilities();

  if (!capabilities.hasWasm) {
    console.error("WebAssembly not supported");
    return;
  }

  try {
    // Skip initialization when the module has already been loaded.
    const alreadyLoaded = isInitialized();
    if (!alreadyLoaded) {
      await initWasm();
    }

    console.log(`Kreuzberg ${getVersion()} initialized successfully`);
    console.log("Workers available:", capabilities.hasWorkers);
    console.log("SharedArrayBuffer available:", capabilities.hasSharedArrayBuffer);
  } catch (error) {
    console.error("Initialization failed:", error);
  }
}
|
||||
|
||||
initializeKreuzberg();
|
||||
6
docs/snippets/wasm/getting-started/install_verify.md
Normal file
6
docs/snippets/wasm/getting-started/install_verify.md
Normal file
@@ -0,0 +1,6 @@
|
||||
```typescript title="WASM"
|
||||
import init from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
console.log("Kreuzberg WASM loaded successfully");
|
||||
```
|
||||
26
docs/snippets/wasm/getting-started/read_content.md
Normal file
26
docs/snippets/wasm/getting-started/read_content.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const response = await fetch("document.pdf");
|
||||
const data = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", undefined);
|
||||
|
||||
console.log(`Content: ${result.content}`);
|
||||
console.log(`Success: true`);
|
||||
console.log(`Content length: ${result.content.length} characters`);
|
||||
|
||||
if (result.tables && result.tables.length > 0) {
|
||||
result.tables.forEach((table, i) => {
|
||||
console.log(`Table ${i}: ${table.rows?.length ?? 0} rows`);
|
||||
});
|
||||
}
|
||||
|
||||
// Report the size of each chunk. The chunk text is read from `content`, the
// same field the other chunking snippets for this package use; reading a
// non-existent field through `?.` would silently report 0 characters.
if (result.chunks && result.chunks.length > 0) {
  result.chunks.forEach((chunk, i) => {
    console.log(`Chunk ${i}: ${chunk.content?.length ?? 0} characters`);
  });
}
|
||||
```
|
||||
32
docs/snippets/wasm/getting-started/runtime-detection.ts
Normal file
32
docs/snippets/wasm/getting-started/runtime-detection.ts
Normal file
@@ -0,0 +1,32 @@
|
||||
import {
|
||||
detectRuntime,
|
||||
getWasmCapabilities,
|
||||
initWasm,
|
||||
isBrowser,
|
||||
isBun,
|
||||
isDeno,
|
||||
isNode,
|
||||
} from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Logs the detected runtime and its capabilities, prints a message for the
 * first matching environment (browser, Node.js, Deno, Bun), then initializes
 * the WASM module.
 */
async function setupForRuntime() {
  const runtime = detectRuntime();
  const capabilities = getWasmCapabilities();

  console.log(`Running in ${runtime} environment`);
  console.log(`Workers: ${capabilities.hasWorkers}`);
  console.log(`SharedArrayBuffer: ${capabilities.hasSharedArrayBuffer}`);

  // Probes are evaluated lazily and in order; only the first match is reported.
  const environments: ReadonlyArray<[() => boolean, string]> = [
    [isBrowser, "Browser features available"],
    [isNode, "Node.js features available"],
    [isDeno, "Deno features available"],
    [isBun, "Bun features available"],
  ];
  const detected = environments.find(([probe]) => probe());
  if (detected) {
    console.log(detected[1]);
  }

  await initWasm();
}
|
||||
|
||||
setupForRuntime().catch(console.error);
|
||||
29
docs/snippets/wasm/llm/structured_extraction.md
Normal file
29
docs/snippets/wasm/llm/structured_extraction.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractFile } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
structuredExtraction: {
|
||||
schema: {
|
||||
type: "object",
|
||||
properties: {
|
||||
title: { type: "string" },
|
||||
authors: { type: "array", items: { type: "string" } },
|
||||
date: { type: "string" },
|
||||
},
|
||||
required: ["title", "authors", "date"],
|
||||
additionalProperties: false,
|
||||
},
|
||||
llm: {
|
||||
model: "openai/gpt-4o-mini",
|
||||
},
|
||||
strict: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractFile("paper.pdf", undefined, config);
|
||||
console.log(result.structuredOutput);
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> Requires network access to the configured LLM provider and a valid API key in the host environment. The WASM crate accepts `structuredExtraction` configuration; the LLM call is dispatched through liter-llm's `wasm-http` transport.
|
||||
40
docs/snippets/wasm/mcp/mcp_custom_client.md
Normal file
40
docs/snippets/wasm/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,40 @@
|
||||
```typescript title="WASM"
|
||||
// The WASM crate has no MCP client. To integrate with an MCP server,
|
||||
// drive the kreuzberg CLI from a Node.js host that uses kreuzberg-wasm
|
||||
// for in-process extraction.
|
||||
import { spawn } from "node:child_process";
|
||||
import * as readline from "node:readline";
|
||||
|
||||
const mcpProcess = spawn("kreuzberg", ["mcp"]);
|
||||
|
||||
const rl = readline.createInterface({
|
||||
input: mcpProcess.stdout,
|
||||
output: mcpProcess.stdin,
|
||||
terminal: false,
|
||||
});
|
||||
|
||||
// MCP messages are JSON-RPC 2.0 requests: besides `method` and `params` they
// need the protocol marker and an `id` so the response can be matched.
// NOTE(review): an MCP server normally expects an `initialize` exchange before
// `tools/call` — confirm against the server in use.
const request = {
  jsonrpc: "2.0",
  id: 1,
  method: "tools/call",
  params: {
    name: "extract_file",
    arguments: {
      path: "document.pdf",
      async: true,
    },
  },
};
|
||||
|
||||
mcpProcess.stdin.write(`${JSON.stringify(request)}\n`);
|
||||
|
||||
rl.on("line", (line) => {
|
||||
const response = JSON.parse(line);
|
||||
console.log(response);
|
||||
mcpProcess.kill();
|
||||
});
|
||||
|
||||
mcpProcess.on("error", (err) => {
|
||||
console.error("Failed to start MCP process:", err);
|
||||
});
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> MCP transport is not exported by the WASM crate; this snippet drives the MCP CLI from the same Node host that loads kreuzberg-wasm.
|
||||
22
docs/snippets/wasm/mcp/mcp_server_start.md
Normal file
22
docs/snippets/wasm/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```typescript title="WASM"
|
||||
// MCP server is provided by the kreuzberg CLI (Rust binary). The WASM build
|
||||
// targets browser/Node.js extraction and does not embed a server process.
|
||||
// Spawn the CLI from a Node.js host that consumes the WASM module separately.
|
||||
import { spawn } from "node:child_process";
|
||||
|
||||
const mcpProcess = spawn("kreuzberg", ["mcp"]);
|
||||
|
||||
mcpProcess.stdout.on("data", (data) => {
|
||||
console.log(`MCP Server: ${data}`);
|
||||
});
|
||||
|
||||
mcpProcess.stderr.on("data", (data) => {
|
||||
console.error(`MCP Error: ${data}`);
|
||||
});
|
||||
|
||||
mcpProcess.on("error", (err) => {
|
||||
console.error(`Failed to start MCP server: ${err.message}`);
|
||||
});
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> The MCP server is a CLI feature; the WASM crate does not export an MCP server entry point. This snippet shows how a Node host that uses kreuzberg-wasm for extraction can also drive the standalone MCP CLI.
|
||||
25
docs/snippets/wasm/metadata/extract-metadata.ts
Normal file
25
docs/snippets/wasm/metadata/extract-metadata.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts `document.pdf`, logs its common metadata fields, and returns the
 * metadata object of the extraction result.
 */
async function getDocumentMetadata() {
  await initWasm();

  const response = await fetch("document.pdf");
  const pdfBytes = new Uint8Array(await response.arrayBuffer());

  const { metadata } = await extractBytes(pdfBytes, "application/pdf");

  console.log("Document Metadata:");

  // Label/value pairs, printed in a fixed order.
  const fields: Array<[string, unknown]> = [
    ["Title:", metadata.title],
    ["Author:", metadata.author],
    ["Creator:", metadata.creator],
    ["Subject:", metadata.subject],
    ["Keywords:", metadata.keywords],
    ["Pages:", metadata.pageCount],
    ["Created:", metadata.createdAt],
    ["Modified:", metadata.modifiedAt],
  ];
  for (const [label, value] of fields) {
    console.log(label, value);
  }

  return metadata;
}
|
||||
|
||||
getDocumentMetadata().catch(console.error);
|
||||
35
docs/snippets/wasm/metadata/filter-metadata.ts
Normal file
35
docs/snippets/wasm/metadata/filter-metadata.ts
Normal file
@@ -0,0 +1,35 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/** Condensed per-document metadata produced by `filterAndSummarizeMetadata`. */
interface DocumentSummary {
|
||||
// The file name (URL) the document was fetched from.
fileName: string;
|
||||
// Copied from the extraction result's `metadata.title`.
title: string | undefined;
|
||||
// Copied from the extraction result's `metadata.author`.
author: string | undefined;
|
||||
// Copied from the extraction result's `metadata.pageCount`.
pageCount: number | undefined;
|
||||
// First detected language, or "unknown" when none was reported.
language: string;
|
||||
}
|
||||
|
||||
/**
 * Fetches and extracts each file in turn (one at a time) and condenses the
 * result into a `DocumentSummary`.
 *
 * @param files - Names/URLs of the documents to fetch; each is extracted as a PDF.
 * @returns One summary per file, in the order given.
 */
async function filterAndSummarizeMetadata(files: string[]): Promise<DocumentSummary[]> {
  await initWasm();

  const collected: DocumentSummary[] = [];

  for (const fileName of files) {
    const response = await fetch(fileName);
    const pdfBytes = new Uint8Array(await response.arrayBuffer());

    const extraction = await extractBytes(pdfBytes, "application/pdf");
    const meta = extraction.metadata;

    const summary: DocumentSummary = {
      fileName,
      title: meta.title,
      author: meta.author,
      pageCount: meta.pageCount,
      language: extraction.detectedLanguages?.[0] ?? "unknown",
    };
    collected.push(summary);
  }

  return collected;
}
|
||||
|
||||
filterAndSummarizeMetadata(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
|
||||
.then((summaries) => console.table(summaries))
|
||||
.catch(console.error);
|
||||
34
docs/snippets/wasm/metadata/image-metadata.ts
Normal file
34
docs/snippets/wasm/metadata/image-metadata.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts `document.pdf` with image extraction enabled (target 150 DPI) and
 * logs a descriptor for every image in the result.
 */
async function extractImageMetadata() {
  await initWasm();

  const response = await fetch("document.pdf");
  const pdfBytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    images: { extractImages: true, targetDpi: 150 },
  };

  const extraction = await extractBytes(pdfBytes, "application/pdf", config);

  // Nothing to report when the result carries no image list.
  if (!extraction.images) {
    return;
  }

  extraction.images.forEach((image, index) => {
    const descriptor = {
      format: image.format,
      width: image.width,
      height: image.height,
      pageNumber: image.pageNumber,
      colorspace: image.colorspace,
      bitsPerComponent: image.bitsPerComponent,
      isMask: image.isMask,
      dataSize: image.data.byteLength,
    };
    console.log(`Image ${index}:`, descriptor);
  });
}
|
||||
|
||||
extractImageMetadata().catch(console.error);
|
||||
26
docs/snippets/wasm/metadata/language_detection.md
Normal file
26
docs/snippets/wasm/metadata/language_detection.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
import { LanguageDetectionConfig, ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const fileBuffer = new Uint8Array(/* your file bytes */);
|
||||
const mimeType = "text/plain";
|
||||
|
||||
const config = new ExtractionConfig({
|
||||
language_detection: new LanguageDetectionConfig({
|
||||
enable_detection: true,
|
||||
target_languages: ["en", "de", "fr"],
|
||||
}),
|
||||
});
|
||||
|
||||
const result = await extractBytes(fileBuffer, mimeType, config);
|
||||
|
||||
if (result.detected_languages) {
|
||||
console.log("Detected languages:", result.detected_languages);
|
||||
|
||||
for (const language of result.detected_languages) {
|
||||
console.log(`Language: ${language}`);
|
||||
}
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,35 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
import { LanguageDetectionConfig, ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const fileBuffer = new Uint8Array(/* your file bytes */);
|
||||
const mimeType = "text/plain";
|
||||
|
||||
const config = new ExtractionConfig({
|
||||
language_detection: new LanguageDetectionConfig({
|
||||
enable_detection: true,
|
||||
target_languages: ["en", "de", "fr", "es", "it", "ja", "zh"],
|
||||
confidence_threshold: 0.5,
|
||||
}),
|
||||
});
|
||||
|
||||
const result = await extractBytes(fileBuffer, mimeType, config);
|
||||
|
||||
if (result.detected_languages && result.detected_languages.length > 0) {
|
||||
console.log("Document languages:", result.detected_languages.join(", "));
|
||||
|
||||
// Process multi-language content
|
||||
result.detected_languages.forEach((lang) => {
|
||||
console.log(`Language detected: ${lang}`);
|
||||
});
|
||||
|
||||
// Access metadata for language info
|
||||
if (result.metadata && result.metadata.language) {
|
||||
console.log(`Primary metadata language: ${result.metadata.language}`);
|
||||
}
|
||||
} else {
|
||||
console.log("No languages detected");
|
||||
}
|
||||
```
|
||||
33
docs/snippets/wasm/metadata/metadata-with-chunks.ts
Normal file
33
docs/snippets/wasm/metadata/metadata-with-chunks.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts `document.pdf` with chunking enabled (500 characters per chunk,
 * 50 characters of overlap) and logs the document metadata followed by the
 * position and size metadata of every chunk.
 */
async function extractWithChunkMetadata() {
  await initWasm();

  const response = await fetch("document.pdf");
  const pdfBytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    chunking: { maxChars: 500, chunkOverlap: 50 },
  };

  const extraction = await extractBytes(pdfBytes, "application/pdf", config);

  console.log("Document Metadata:", extraction.metadata);

  if (!extraction.chunks) {
    return;
  }

  for (const chunk of extraction.chunks) {
    const { charStart, charEnd, chunkIndex, totalChunks, tokenCount } = chunk.metadata;
    console.log("Chunk Metadata:", {
      charStart,
      charEnd,
      index: chunkIndex,
      total: totalChunks,
      tokens: tokenCount,
    });
  }
}
|
||||
|
||||
extractWithChunkMetadata().catch(console.error);
|
||||
85
docs/snippets/wasm/metadata/metadata.md
Normal file
85
docs/snippets/wasm/metadata/metadata.md
Normal file
@@ -0,0 +1,85 @@
|
||||
```typescript title="WASM"
|
||||
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
|
||||
const file = fileInput.files?.[0];
|
||||
|
||||
if (file) {
|
||||
const result = await extractFromFile(file);
|
||||
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
|
||||
|
||||
// Access common metadata fields
|
||||
if (result.metadata.title) {
|
||||
console.log(`Title: ${result.metadata.title}`);
|
||||
}
|
||||
|
||||
// Access format-specific metadata
|
||||
const metadata = result.metadata;
|
||||
|
||||
// For HTML files
|
||||
if (metadata.html) {
|
||||
const htmlMeta = metadata.html;
|
||||
console.log(`HTML Title: ${htmlMeta.title}`);
|
||||
console.log(`Description: ${htmlMeta.description}`);
|
||||
|
||||
// Access keywords as array
|
||||
if (htmlMeta.keywords && htmlMeta.keywords.length > 0) {
|
||||
console.log(`Keywords: ${htmlMeta.keywords.join(", ")}`);
|
||||
}
|
||||
|
||||
// Access canonical URL
|
||||
if (htmlMeta.canonical_url) {
|
||||
console.log(`Canonical URL: ${htmlMeta.canonical_url}`);
|
||||
}
|
||||
|
||||
// Access Open Graph fields
|
||||
if (htmlMeta.open_graph) {
|
||||
if (htmlMeta.open_graph["title"]) {
|
||||
console.log(`OG Title: ${htmlMeta.open_graph["title"]}`);
|
||||
}
|
||||
if (htmlMeta.open_graph["image"]) {
|
||||
console.log(`OG Image: ${htmlMeta.open_graph["image"]}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Access Twitter Card fields
|
||||
if (htmlMeta.twitter_card && htmlMeta.twitter_card["card"]) {
|
||||
console.log(`Twitter Card Type: ${htmlMeta.twitter_card["card"]}`);
|
||||
}
|
||||
|
||||
// Access headers
|
||||
if (htmlMeta.headers && htmlMeta.headers.length > 0) {
|
||||
console.log(`Headers: ${htmlMeta.headers.map((h: any) => h.text).join(", ")}`);
|
||||
}
|
||||
|
||||
// Access links
|
||||
if (htmlMeta.links && htmlMeta.links.length > 0) {
|
||||
htmlMeta.links.forEach((link: any) => {
|
||||
console.log(`Link: ${link.href} (${link.text})`);
|
||||
});
|
||||
}
|
||||
|
||||
// Access images
|
||||
if (htmlMeta.images && htmlMeta.images.length > 0) {
|
||||
htmlMeta.images.forEach((image: any) => {
|
||||
console.log(`Image: ${image.src}`);
|
||||
});
|
||||
}
|
||||
|
||||
// Access structured data
|
||||
if (htmlMeta.structured_data && htmlMeta.structured_data.length > 0) {
|
||||
console.log(`Structured data items: ${htmlMeta.structured_data.length}`);
|
||||
}
|
||||
}
|
||||
|
||||
// PDF-specific fields are at the top level of metadata
|
||||
if (metadata.pageCount) {
|
||||
console.log(`Pages: ${metadata.pageCount}`);
|
||||
}
|
||||
if (metadata.authors && metadata.authors.length > 0) {
|
||||
console.log(`Authors: ${metadata.authors.join(", ")}`);
|
||||
}
|
||||
}
|
||||
```
|
||||
36
docs/snippets/wasm/metadata/page_boundaries.md
Normal file
36
docs/snippets/wasm/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
import { PageConfig, ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const fileBuffer = new Uint8Array(/* your file bytes */);
|
||||
const mimeType = "application/pdf";
|
||||
|
||||
const config = new ExtractionConfig({
|
||||
pages: new PageConfig({
|
||||
extract_pages: true,
|
||||
}),
|
||||
});
|
||||
|
||||
const result = await extractBytes(fileBuffer, mimeType, config);
|
||||
|
||||
if (result.metadata && result.metadata.pages) {
  const pageStructure = result.metadata.pages;
  console.log(`Total pages: ${pageStructure.total_count}`);

  if (pageStructure.boundaries) {
    // `byte_start` / `byte_end` are byte offsets, whereas JavaScript string
    // indices count UTF-16 code units. Slice the encoded bytes instead of the
    // string so previews stay correct for non-ASCII content.
    // NOTE(review): assumes the offsets refer to the UTF-8 encoding of
    // `result.content` — confirm against the library documentation.
    const contentBytes = new TextEncoder().encode(result.content);
    const decoder = new TextDecoder();

    // Iterate through page boundaries to map content to pages
    pageStructure.boundaries.forEach((boundary) => {
      // Preview at most the first 100 bytes of the page. A cut in the middle
      // of a multi-byte character decodes to U+FFFD at the end of the preview.
      const previewEnd = Math.min(boundary.byte_end, boundary.byte_start + 100);
      const pageText = decoder.decode(contentBytes.subarray(boundary.byte_start, previewEnd));

      console.log(`Page ${boundary.page_number}:`);
      console.log(` Byte range: ${boundary.byte_start}-${boundary.byte_end}`);
      console.log(` Preview: ${pageText}...`);
    });
  }
}
|
||||
```
|
||||
38
docs/snippets/wasm/metadata/page_tracking_basic.md
Normal file
38
docs/snippets/wasm/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
import { PageConfig, ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const fileBuffer = new Uint8Array(/* your file bytes */);
|
||||
const mimeType = "application/pdf";
|
||||
|
||||
const config = new ExtractionConfig({
|
||||
pages: new PageConfig({
|
||||
extract_pages: true,
|
||||
}),
|
||||
});
|
||||
|
||||
const result = await extractBytes(fileBuffer, mimeType, config);
|
||||
|
||||
if (result.pages) {
|
||||
console.log(`Total pages extracted: ${result.pages.length}`);
|
||||
|
||||
result.pages.forEach((page) => {
|
||||
console.log(`Page ${page.pageNumber}:`);
|
||||
console.log(` Content length: ${page.content.length} chars`);
|
||||
console.log(` Tables: ${page.tables.length}`);
|
||||
console.log(` Images: ${page.images.length}`);
|
||||
|
||||
// Check if page is blank
|
||||
if (page.isBlank) {
|
||||
console.log(" This page is blank");
|
||||
}
|
||||
|
||||
// Access page hierarchy if available
|
||||
if (page.hierarchy) {
|
||||
console.log(` Hierarchy level: ${page.hierarchy}`);
|
||||
}
|
||||
});
|
||||
}
|
||||
```
|
||||
43
docs/snippets/wasm/metadata/tables.md
Normal file
43
docs/snippets/wasm/metadata/tables.md
Normal file
@@ -0,0 +1,43 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
import { ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const fileBuffer = new Uint8Array(/* your file bytes */);
|
||||
const mimeType = "application/pdf";
|
||||
|
||||
const config = new ExtractionConfig({});
|
||||
|
||||
const result = await extractBytes(fileBuffer, mimeType, config);
|
||||
|
||||
// Print every extracted table: page, Markdown rendering, cell rows and, when
// present, the bounding box.
if (result.tables && result.tables.length > 0) {
  console.log(`Found ${result.tables.length} tables`);

  result.tables.forEach((table, index) => {
    console.log(`\nTable ${index + 1}:`);
    console.log(` Page: ${table.pageNumber}`);
    console.log(` Markdown representation:`);
    console.log(table.markdown);

    // Access cell data
    const cells = table.cells;
    if (cells) {
      console.log(` Total cells: ${Object.keys(cells).length}`);

      // Iterate through cells (structure depends on how cells are serialized)
      for (const rowKey of Object.keys(cells)) {
        const row = cells[rowKey];
        console.log(` Row ${rowKey}: ${JSON.stringify(row)}`);
      }
    }

    // Access bounding box if available. Serialize it explicitly: interpolating
    // an object into a template literal prints "[object Object]".
    if (table.boundingBox) {
      console.log(` Bounding box: ${JSON.stringify(table.boundingBox)}`);
    }
  });
} else {
  console.log("No tables found in document");
}
|
||||
```
|
||||
22
docs/snippets/wasm/ocr/cloud_ocr_backend.md
Normal file
22
docs/snippets/wasm/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,22 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Cloud OCR backends are not supported in WASM directly.
|
||||
// WASM runs in a sandboxed environment without direct network access.
|
||||
// To use cloud OCR services, implement a wrapper on your server
|
||||
// or use a cloud platform with built-in OCR integration.
|
||||
|
||||
// Illustrative only: shows the shape of an OCR config; nothing in this snippet passes it to an extraction call.
const cloudOcrConfig = {
|
||||
ocr: {
|
||||
backend: "custom", // Custom backends must be registered via native runtime
|
||||
language: "eng",
|
||||
},
|
||||
};
|
||||
|
||||
// This example shows the configuration structure.
|
||||
// In production, route cloud OCR requests through your backend service.
|
||||
```
|
||||
27
docs/snippets/wasm/ocr/enable-ocr.ts
Normal file
27
docs/snippets/wasm/ocr/enable-ocr.ts
Normal file
@@ -0,0 +1,27 @@
|
||||
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
// Enables OCR and prints the text recognized in "scanned-page.png".
|
||||
// Stops early (after logging the error) when OCR cannot be enabled.
|
||||
async function extractWithOcr() {
|
||||
  await initWasm();
|
||||
|
||||
  try {
|
||||
    await enableOcr();
|
||||
    console.log("OCR enabled successfully");
|
||||
  } catch (ocrSetupError) {
|
||||
    console.error("Failed to enable OCR:", ocrSetupError);
|
||||
    return;
|
||||
  }
|
||||
|
||||
  // Download the image and hand its raw bytes to the extractor.
|
||||
  const response = await fetch("scanned-page.png");
|
||||
  const imageBytes = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
  const extraction = await extractBytes(imageBytes, "image/png", {
|
||||
    ocr: {
|
||||
      backend: "tesseract-wasm",
|
||||
      language: "eng",
|
||||
    },
|
||||
  });
|
||||
|
||||
  console.log("Extracted text:");
|
||||
  console.log(extraction.content);
|
||||
}
|
||||
|
||||
extractWithOcr().catch(console.error);
|
||||
15
docs/snippets/wasm/ocr/image_extraction.md
Normal file
15
docs/snippets/wasm/ocr/image_extraction.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// extractBytes is given a Uint8Array in the other WASM snippets, so wrap the
|
||||
// fetched ArrayBuffer rather than passing it through raw.
|
||||
const response = await fetch("document.pdf");
|
||||
const imageData = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
const result = await extractBytes(imageData, "application/pdf", {
|
||||
  images: {
|
||||
    extract_images: true,
|
||||
  },
|
||||
});
|
||||
|
||||
console.log(result.images);
|
||||
```
|
||||
17
docs/snippets/wasm/ocr/image_preprocessing.md
Normal file
17
docs/snippets/wasm/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// extractBytes is given a Uint8Array in the other WASM snippets, so wrap the
|
||||
// fetched ArrayBuffer rather than passing it through raw.
|
||||
const response = await fetch("document.pdf");
|
||||
const documentData = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
const result = await extractBytes(documentData, "application/pdf", {
|
||||
  images: {
|
||||
    extract_images: true,
|
||||
    target_dpi: 300,
|
||||
    max_image_dimension: 2000,
|
||||
  },
|
||||
});
|
||||
|
||||
console.log(result.content);
|
||||
```
|
||||
29
docs/snippets/wasm/ocr/multi-language-ocr.ts
Normal file
29
docs/snippets/wasm/ocr/multi-language-ocr.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
// Runs OCR over several PNG images, one after another, using the language
|
||||
// code listed next to each file, and prints the recognized text per file.
|
||||
async function extractMultilingualDocument() {
|
||||
  await initWasm();
|
||||
  await enableOcr();
|
||||
|
||||
  const documents = [
|
||||
    { name: "english.png", lang: "eng" },
|
||||
    { name: "german.png", lang: "deu" },
|
||||
    { name: "spanish.png", lang: "spa" },
|
||||
  ];
|
||||
|
||||
  for (const { name, lang } of documents) {
|
||||
    const response = await fetch(name);
|
||||
    const imageBytes = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
    const extraction = await extractBytes(imageBytes, "image/png", {
|
||||
      ocr: {
|
||||
        backend: "tesseract-wasm",
|
||||
        language: lang,
|
||||
      },
|
||||
    });
|
||||
|
||||
    console.log(`${name} (${lang}):`);
|
||||
    console.log(extraction.content);
|
||||
    console.log("---");
|
||||
  }
|
||||
}
|
||||
|
||||
extractMultilingualDocument().catch(console.error);
|
||||
30
docs/snippets/wasm/ocr/ocr-backend-registration.ts
Normal file
30
docs/snippets/wasm/ocr/ocr-backend-registration.ts
Normal file
@@ -0,0 +1,30 @@
|
||||
import {
|
||||
initWasm,
|
||||
listOcrBackends,
|
||||
registerOcrBackend,
|
||||
TesseractWasmBackend,
|
||||
unregisterOcrBackend,
|
||||
} from "@kreuzberg/wasm";
|
||||
|
||||
// Walks through the OCR backend registry calls: register a TesseractWasmBackend,
|
||||
// list the registered backends, unregister "tesseract-wasm", then list again.
|
||||
async function manageOcrBackends() {
|
||||
  await initWasm();
|
||||
|
||||
  const tesseract = new TesseractWasmBackend();
|
||||
  await tesseract.initialize();
|
||||
|
||||
  registerOcrBackend(tesseract);
|
||||
|
||||
  const registered = listOcrBackends();
|
||||
  console.log("Available OCR backends:", registered);
|
||||
|
||||
  const hasTesseract = registered.includes("tesseract-wasm");
|
||||
  if (hasTesseract) {
|
||||
    console.log("Tesseract WASM backend is registered");
|
||||
  }
|
||||
|
||||
  unregisterOcrBackend("tesseract-wasm");
|
||||
|
||||
  const remaining = listOcrBackends();
|
||||
  console.log("Backends after unregister:", remaining);
|
||||
}
|
||||
|
||||
manageOcrBackends().catch(console.error);
|
||||
37
docs/snippets/wasm/ocr/ocr-error-handling.ts
Normal file
37
docs/snippets/wasm/ocr/ocr-error-handling.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
import { enableOcr, extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
// Extracts text from "document.png", handling each failure point separately:
|
||||
// WASM initialization, OCR setup, and the extraction itself.
|
||||
// When OCR is unavailable the extraction still runs, but without an OCR
|
||||
// config, so the call matches the "proceeding without OCR" warning instead
|
||||
// of requesting a backend that was never enabled.
|
||||
async function extractWithErrorHandling() {
|
||||
  try {
|
||||
    await initWasm();
|
||||
  } catch (error) {
|
||||
    console.error("Failed to initialize WASM:", error);
|
||||
    return;
|
||||
  }
|
||||
|
||||
  // Records whether enableOcr() succeeded so the OCR config is only sent when usable.
|
||||
  let ocrEnabled = false;
|
||||
|
||||
  try {
|
||||
    await enableOcr();
|
||||
    ocrEnabled = true;
|
||||
  } catch (error) {
|
||||
    if (error instanceof Error && error.message.includes("browser")) {
|
||||
      console.warn("OCR not available in this environment, proceeding without OCR");
|
||||
    } else {
|
||||
      // Unexpected failure: rethrow so the caller's .catch() reports it.
|
||||
      throw error;
|
||||
    }
|
||||
  }
|
||||
|
||||
  try {
|
||||
    const response = await fetch("document.png");
|
||||
    const bytes = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
    const result = await extractBytes(
|
||||
      bytes,
|
||||
      "image/png",
|
||||
      ocrEnabled
|
||||
        ? {
|
||||
            ocr: {
|
||||
              backend: "tesseract-wasm",
|
||||
              language: "eng",
|
||||
            },
|
||||
          }
|
||||
        : {},
|
||||
    );
|
||||
|
||||
    console.log("Extraction successful:", result.content.length, "chars");
|
||||
  } catch (error) {
|
||||
    console.error("Extraction failed:", error);
|
||||
  }
|
||||
}
|
||||
|
||||
extractWithErrorHandling().catch(console.error);
|
||||
20
docs/snippets/wasm/ocr/ocr_easyocr.md
Normal file
20
docs/snippets/wasm/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// EasyOCR backend is not supported in WASM.
|
||||
// EasyOCR requires PyTorch and Python runtime, which are unavailable in browser/WASM.
|
||||
// Use the Tesseract-WASM backend instead, or route requests through a backend service.
|
||||
|
||||
// Reference-only config: nothing in this snippet passes it to an extraction call.
const easyOcrConfig = {
|
||||
ocr: {
|
||||
backend: "easyocr", // Not supported in WASM
|
||||
language: "en",
|
||||
},
|
||||
};
|
||||
|
||||
// This example shows the configuration structure for reference only.
|
||||
```
|
||||
24
docs/snippets/wasm/ocr/ocr_elements.md
Normal file
24
docs/snippets/wasm/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// extractBytes is given a Uint8Array in the other WASM snippets, so wrap the
|
||||
// fetched ArrayBuffer rather than passing it through raw.
|
||||
const response = await fetch("scanned.pdf");
|
||||
const documentData = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
const result = await extractBytes(documentData, "application/pdf", {
|
||||
  ocr: {
|
||||
    backend: "tesseract",
|
||||
    language: "eng",
|
||||
    element_config: {
|
||||
      include_elements: true,
|
||||
    },
|
||||
  },
|
||||
});
|
||||
|
||||
// Print the text and confidence of every OCR element that was returned.
|
||||
if (result.ocr_elements) {
|
||||
  for (const element of result.ocr_elements) {
|
||||
    console.log("Text:", element.text);
|
||||
    console.log("Confidence:", element.confidence);
|
||||
  }
|
||||
}
|
||||
```
|
||||
34
docs/snippets/wasm/ocr/ocr_extraction.md
Normal file
34
docs/snippets/wasm/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```typescript title="WASM (Browser)"
|
||||
import { enableOcr, extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
await enableOcr();
|
||||
|
||||
// getElementById returns null when no element has the id "file"; typing the
|
||||
// lookup as nullable and using optional chaining avoids a TypeError there.
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement | null;
|
||||
const file = fileInput?.files?.[0];
|
||||
|
||||
// Only run extraction once the user has actually selected a file.
|
||||
if (file) {
|
||||
  const result = await extractFromFile(file, file.type, {
|
||||
    ocr: {
|
||||
      backend: "kreuzberg-tesseract",
|
||||
      language: "eng",
|
||||
    },
|
||||
  });
|
||||
  console.log(result.content);
|
||||
}
|
||||
```
|
||||
|
||||
```typescript title="WASM (Node.js / Deno / Bun)"
|
||||
import { enableOcr, extractFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
await enableOcr(); // Uses native kreuzberg-tesseract backend
|
||||
|
||||
// Extracts a local PNG by path, selecting the OCR backend and language through the `ocr` option.
const result = await extractFile("./scanned_document.png", "image/png", {
|
||||
ocr: {
|
||||
backend: "kreuzberg-tesseract",
|
||||
language: "eng",
|
||||
},
|
||||
});
|
||||
console.log(result.content);
|
||||
```
|
||||
17
docs/snippets/wasm/ocr/ocr_force_all_pages.md
Normal file
17
docs/snippets/wasm/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// extractBytes is given a Uint8Array in the other WASM snippets, so wrap the
|
||||
// fetched ArrayBuffer rather than passing it through raw.
|
||||
const response = await fetch("document.pdf");
|
||||
const documentData = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
const result = await extractBytes(documentData, "application/pdf", {
|
||||
  force_ocr: true,
|
||||
  ocr: {
|
||||
    backend: "tesseract",
|
||||
    language: "eng",
|
||||
  },
|
||||
});
|
||||
|
||||
console.log(result.content);
|
||||
```
|
||||
16
docs/snippets/wasm/ocr/ocr_multi_language.md
Normal file
16
docs/snippets/wasm/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// extractBytes is given a Uint8Array in the other WASM snippets, so wrap the
|
||||
// fetched ArrayBuffer rather than passing it through raw.
|
||||
const response = await fetch("multilingual.pdf");
|
||||
const multilingualData = new Uint8Array(await response.arrayBuffer());
|
||||
|
||||
const result = await extractBytes(multilingualData, "application/pdf", {
|
||||
  ocr: {
|
||||
    backend: "tesseract",
|
||||
    // Several language codes are joined with "+" in a single string.
|
||||
    language: "eng+deu+fra",
|
||||
  },
|
||||
});
|
||||
|
||||
console.log(result.content);
|
||||
```
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user