This commit is contained in:
44
docs/snippets/wasm/advanced/chunk_page_mapping.md
Normal file
44
docs/snippets/wasm/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 1500,
|
||||
chunkOverlap: 300,
|
||||
},
|
||||
includeDocumentStructure: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Map chunks back to pages for source attribution
|
||||
const chunkPageMap = new Map<number, number[]>();
|
||||
|
||||
result.chunks?.forEach((chunk, chunkIndex) => {
|
||||
const firstPage = chunk.metadata?.firstPage;
|
||||
const lastPage = chunk.metadata?.lastPage;
|
||||
|
||||
if (firstPage !== undefined && lastPage !== undefined) {
|
||||
for (let page = firstPage; page <= lastPage; page++) {
|
||||
if (!chunkPageMap.has(page)) {
|
||||
chunkPageMap.set(page, []);
|
||||
}
|
||||
chunkPageMap.get(page)!.push(chunkIndex);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Use the mapping for source attribution
|
||||
chunkPageMap.forEach((chunkIndices, pageNum) => {
|
||||
console.log(`Page ${pageNum}: Chunks ${chunkIndices.join(", ")}`);
|
||||
chunkIndices.forEach((idx) => {
|
||||
const chunk = result.chunks![idx];
|
||||
console.log(` Content: "${chunk.content.substring(0, 60)}..."`);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> - Requires document structure parsing during extraction.
|
||||
70
docs/snippets/wasm/advanced/chunking_config.md
Normal file
70
docs/snippets/wasm/advanced/chunking_config.md
Normal file
@@ -0,0 +1,70 @@
|
||||
```typescript title="WASM - Fixed-Size Chunks"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 2000,
|
||||
chunkOverlap: 400,
|
||||
trim: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
result.chunks?.forEach((chunk, idx) => {
|
||||
console.log(`Chunk ${chunk.metadata?.chunkIndex}/${chunk.metadata?.totalChunks}`);
|
||||
console.log(` Position: ${chunk.metadata?.byteStart}-${chunk.metadata?.byteEnd}`);
|
||||
console.log(` Content: "${chunk.content.substring(0, 50)}..."`);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Markdown-Aware Chunking"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "markdown",
|
||||
maxChars: 1500,
|
||||
prependHeadingContext: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/markdown", config);
|
||||
|
||||
result.chunks?.forEach((chunk) => {
|
||||
// Content already includes heading context prepended
|
||||
console.log(chunk.content.substring(0, 80));
|
||||
console.log(
|
||||
` Heading path: ${chunk.metadata?.headingContext?.headings?.map((h) => `${"#".repeat(h.level)} ${h.text}`).join(" > ")}`,
|
||||
);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Semantic Chunking with Topic Threshold"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "semantic",
|
||||
maxChars: 1000,
|
||||
topicThreshold: 0.5, // Boundary detection at 50% topic change
|
||||
chunkOverlap: 100,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/markdown", config);
|
||||
|
||||
console.log(`Generated ${result.chunks?.length} semantic chunks`);
|
||||
result.chunks?.forEach((chunk) => {
|
||||
console.log(`Chunk ${chunk.metadata?.chunkIndex}: ${chunk.content.length} chars`);
|
||||
});
|
||||
```
|
||||
80
docs/snippets/wasm/advanced/chunking_rag.md
Normal file
80
docs/snippets/wasm/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,80 @@
|
||||
```typescript title="WASM - Chunking for RAG Pipeline"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 512, // Smaller chunks for vector DB efficiency
|
||||
chunkOverlap: 100,
|
||||
trim: true,
|
||||
},
|
||||
includeDocumentStructure: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Prepare chunks for vector database ingestion
|
||||
const ragChunks =
|
||||
result.chunks?.map((chunk, idx) => ({
|
||||
id: `${result.metadata?.filename || "doc"}_chunk_${idx}`,
|
||||
text: chunk.content,
|
||||
metadata: {
|
||||
source: result.metadata?.filename,
|
||||
chunk_index: chunk.metadata?.chunkIndex,
|
||||
total_chunks: chunk.metadata?.totalChunks,
|
||||
first_page: chunk.metadata?.firstPage,
|
||||
last_page: chunk.metadata?.lastPage,
|
||||
byte_position: `${chunk.metadata?.byteStart}-${chunk.metadata?.byteEnd}`,
|
||||
},
|
||||
// Embedding would be added by vector DB embedding model
|
||||
})) || [];
|
||||
|
||||
console.log(`Prepared ${ragChunks.length} chunks for RAG ingestion`);
|
||||
ragChunks.slice(0, 3).forEach((c) => {
|
||||
console.log(`Chunk ${c.id}: ${c.text.substring(0, 50)}...`);
|
||||
});
|
||||
|
||||
// Example: Send to vector database (e.g., Pinecone, Weaviate, Milvus)
|
||||
// const vectorResults = await vectorDb.upsert(ragChunks.map(c => ({
|
||||
// id: c.id,
|
||||
// values: await embedModel.embed(c.text),
|
||||
// metadata: c.metadata,
|
||||
// })));
|
||||
```
|
||||
|
||||
```typescript title="WASM - RAG with Retrieval Context"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 768,
|
||||
chunkOverlap: 200,
|
||||
prependHeadingContext: true, // For markdown/docs
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/markdown", config);
|
||||
|
||||
// Build retrieval augmented context
|
||||
interface RagDocument {
|
||||
id: string;
|
||||
query_text: string;
|
||||
context: string;
|
||||
page: number | null;
|
||||
}
|
||||
|
||||
const ragDocs: RagDocument[] =
|
||||
result.chunks?.map((chunk, idx) => ({
|
||||
id: `chunk_${idx}`,
|
||||
query_text: chunk.content, // Text to embed and search
|
||||
context: chunk.content, // Full context (includes heading)
|
||||
page: chunk.metadata?.firstPage ?? null,
|
||||
})) || [];
|
||||
|
||||
console.log(`Built RAG documents: ${ragDocs.length}`);
|
||||
```
|
||||
50
docs/snippets/wasm/advanced/custom-pipeline.ts
Normal file
50
docs/snippets/wasm/advanced/custom-pipeline.ts
Normal file
@@ -0,0 +1,50 @@
|
||||
import type { ExtractionResult } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
interface ProcessingStep {
|
||||
name: string;
|
||||
process: (result: ExtractionResult) => Promise<ExtractionResult>;
|
||||
}
|
||||
|
||||
async function createExtractionPipeline(
|
||||
steps: ProcessingStep[],
|
||||
bytes: Uint8Array,
|
||||
mimeType: string,
|
||||
) {
|
||||
await initWasm();
|
||||
|
||||
let result = await extractBytes(bytes, mimeType);
|
||||
|
||||
for (const step of steps) {
|
||||
console.log(`Executing step: ${step.name}`);
|
||||
result = await step.process(result);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const pipeline: ProcessingStep[] = [
|
||||
{
|
||||
name: "Text Normalization",
|
||||
process: async (result) => ({
|
||||
...result,
|
||||
content: result.content.replace(/\s+/g, " ").trim(),
|
||||
}),
|
||||
},
|
||||
{
|
||||
name: "Language Detection",
|
||||
process: async (result) => result,
|
||||
},
|
||||
{
|
||||
name: "Chunking",
|
||||
process: async (result) => result,
|
||||
},
|
||||
];
|
||||
|
||||
createExtractionPipeline(
|
||||
pipeline,
|
||||
new Uint8Array(await fetch("doc.pdf").then((r) => r.arrayBuffer())),
|
||||
"application/pdf",
|
||||
)
|
||||
.then((r) => console.log("Pipeline complete:", r))
|
||||
.catch(console.error);
|
||||
34
docs/snippets/wasm/advanced/error-recovery.ts
Normal file
34
docs/snippets/wasm/advanced/error-recovery.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
async function extractWithRetry(bytes: Uint8Array, mimeType: string, maxRetries: number = 3) {
|
||||
await initWasm();
|
||||
|
||||
let lastError: Error | null = null;
|
||||
|
||||
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
||||
try {
|
||||
console.log(`Extraction attempt ${attempt}/${maxRetries}`);
|
||||
const result = await extractBytes(bytes, mimeType);
|
||||
console.log("Extraction successful");
|
||||
return result;
|
||||
} catch (error) {
|
||||
lastError = error instanceof Error ? error : new Error(String(error));
|
||||
console.warn(`Attempt ${attempt} failed:`, lastError.message);
|
||||
|
||||
if (attempt < maxRetries) {
|
||||
const delay = 2 ** attempt * 100;
|
||||
console.log(`Retrying in ${delay}ms...`);
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
throw new Error(`Extraction failed after ${maxRetries} attempts: ${lastError?.message}`);
|
||||
}
|
||||
|
||||
extractWithRetry(
|
||||
new Uint8Array(await fetch("doc.pdf").then((r) => r.arrayBuffer())),
|
||||
"application/pdf",
|
||||
)
|
||||
.then((r) => console.log("Final result:", r))
|
||||
.catch(console.error);
|
||||
105
docs/snippets/wasm/advanced/keyword_extraction_config.md
Normal file
105
docs/snippets/wasm/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,105 @@
|
||||
```typescript title="WASM - Keyword Extraction Setup"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Note: Keyword extraction requires the 'keywords' feature,
|
||||
// which may not be available in all WASM builds.
|
||||
// This example shows the configuration structure.
|
||||
|
||||
const config = {
|
||||
// Extraction configuration
|
||||
outputFormat: "markdown",
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Keyword extraction would be performed on the extracted text
|
||||
// using external libraries or post-processing
|
||||
console.log(`Extracted text: ${result.content.substring(0, 100)}...`);
|
||||
|
||||
// Example post-processing to extract keywords
|
||||
// (requires external keyword extraction library)
|
||||
const keywords = new Set<string>();
|
||||
const words = result.content
|
||||
.toLowerCase()
|
||||
.split(/\s+/)
|
||||
.filter((w) => w.length > 4); // Simple heuristic: words > 4 chars
|
||||
|
||||
words.forEach((word) => {
|
||||
keywords.add(word);
|
||||
});
|
||||
|
||||
console.log(`Extracted keywords: ${Array.from(keywords).slice(0, 10).join(", ")}`);
|
||||
```
|
||||
|
||||
```typescript title="WASM - Keyword Filtering"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
outputFormat: "markdown",
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Common stopwords to exclude
|
||||
const stopwords = new Set([
|
||||
"the",
|
||||
"a",
|
||||
"an",
|
||||
"and",
|
||||
"or",
|
||||
"but",
|
||||
"in",
|
||||
"on",
|
||||
"at",
|
||||
"to",
|
||||
"for",
|
||||
"of",
|
||||
"with",
|
||||
"from",
|
||||
"by",
|
||||
"is",
|
||||
"are",
|
||||
"was",
|
||||
"were",
|
||||
"be",
|
||||
"been",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"could",
|
||||
"should",
|
||||
]);
|
||||
|
||||
// Extract and filter keywords
|
||||
const text = result.content.toLowerCase();
|
||||
const words = text.split(/\s+/);
|
||||
const keywordCounts = new Map<string, number>();
|
||||
|
||||
words.forEach((word) => {
|
||||
const cleaned = word.replace(/[^\w]/g, "");
|
||||
if (cleaned.length > 4 && !stopwords.has(cleaned)) {
|
||||
keywordCounts.set(cleaned, (keywordCounts.get(cleaned) || 0) + 1);
|
||||
}
|
||||
});
|
||||
|
||||
// Get top keywords by frequency
|
||||
const topKeywords = Array.from(keywordCounts.entries())
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.slice(0, 10)
|
||||
.map(([word, count]) => `${word} (${count})`);
|
||||
|
||||
console.log(`Top keywords: ${topKeywords.join(", ")}`);
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> - Native keyword extraction requires the `keywords` feature which may not be compiled into WASM builds.
|
||||
110
docs/snippets/wasm/advanced/keyword_extraction_example.md
Normal file
110
docs/snippets/wasm/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,110 @@
|
||||
```typescript title="WASM - Extract and Score Keywords"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
outputFormat: "markdown",
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// TF-IDF style keyword extraction (simplified)
|
||||
interface Keyword {
|
||||
term: string;
|
||||
frequency: number;
|
||||
uniquePositions: number[];
|
||||
score: number;
|
||||
}
|
||||
|
||||
const text = result.content.toLowerCase();
|
||||
const words = text.split(/[\s\n\t]+/);
|
||||
const tokenMap = new Map<string, number[]>();
|
||||
|
||||
// Record word positions
|
||||
words.forEach((word, idx) => {
|
||||
const cleaned = word.replace(/[^\w]/g, "");
|
||||
if (cleaned.length > 3) {
|
||||
if (!tokenMap.has(cleaned)) {
|
||||
tokenMap.set(cleaned, []);
|
||||
}
|
||||
tokenMap.get(cleaned)!.push(idx);
|
||||
}
|
||||
});
|
||||
|
||||
// Calculate keyword scores
|
||||
const keywords: Keyword[] = Array.from(tokenMap.entries()).map(([term, positions]) => ({
|
||||
term,
|
||||
frequency: positions.length,
|
||||
uniquePositions: positions,
|
||||
score: positions.length * Math.log(words.length / positions.length),
|
||||
}));
|
||||
|
||||
// Sort by score (TF-IDF approximation)
|
||||
keywords.sort((a, b) => b.score - a.score);
|
||||
|
||||
// Top 15 keywords
|
||||
const topKeywords = keywords.slice(0, 15);
|
||||
console.log("Top Keywords:");
|
||||
topKeywords.forEach((kw) => {
|
||||
console.log(` ${kw.term}: ${kw.frequency} occurrences (score: ${kw.score.toFixed(2)})`);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Keyword Context Window"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
outputFormat: "markdown",
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
chunkOverlap: 200,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
interface KeywordContext {
|
||||
keyword: string;
|
||||
contexts: string[];
|
||||
}
|
||||
|
||||
// Find keyword occurrences with surrounding context
|
||||
/**
 * Collects a snippet of surrounding text for every case-insensitive
 * occurrence of `keyword` in `text`.
 *
 * The keyword is matched literally: regular-expression metacharacters are
 * escaped, so terms such as "c++" or "node.js" neither throw nor match
 * unintended text.
 *
 * @param text - Text to search.
 * @param keyword - Literal term to look for; an empty keyword yields no contexts.
 * @param contextWindow - Number of characters kept on each side of a match.
 * @returns One snippet per occurrence, in the order the matches appear.
 */
function extractKeywordContexts(
  text: string,
  keyword: string,
  contextWindow: number = 50,
): string[] {
  // An empty pattern matches at every position without consuming input,
  // which would keep the exec() loop below from ever terminating.
  if (keyword.length === 0) {
    return [];
  }

  // Escape regex metacharacters so the keyword is treated as plain text.
  const escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const regex = new RegExp(escapedKeyword, "gi");
  const contexts: string[] = [];

  let match: RegExpExecArray | null;
  while ((match = regex.exec(text)) !== null) {
    const start = Math.max(0, match.index - contextWindow);
    // Measure the text that actually matched rather than the search term.
    const end = Math.min(text.length, match.index + match[0].length + contextWindow);
    contexts.push(text.substring(start, end));
  }

  return contexts;
}
|
||||
|
||||
// Extract context for top keywords
|
||||
const topKeywords = ["document", "analysis", "results"];
|
||||
const keywordContexts: KeywordContext[] = topKeywords.map((kw) => ({
|
||||
keyword: kw,
|
||||
contexts: extractKeywordContexts(result.content, kw, 40),
|
||||
}));
|
||||
|
||||
keywordContexts.forEach((kc) => {
|
||||
console.log(`\n"${kc.keyword}" appears ${kc.contexts.length} times:`);
|
||||
kc.contexts.slice(0, 2).forEach((ctx, idx) => {
|
||||
console.log(` [${idx + 1}] ...${ctx}...`);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
<!-- snippet:syntax-only --> - Keyword extraction without native YAKE/RAKE requires manual text processing.
|
||||
84
docs/snippets/wasm/advanced/language_detection_config.md
Normal file
84
docs/snippets/wasm/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,84 @@
|
||||
```typescript title="WASM - Basic Language Detection"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.75,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
console.log(`Primary language: ${result.metadata?.language}`);
|
||||
console.log(`Language confidence: ${result.metadata?.languageConfidence}`);
|
||||
console.log(`Detected languages: ${result.detectedLanguages?.join(", ")}`);
|
||||
```
|
||||
|
||||
```typescript title="WASM - Multi-Language Detection"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.6,
|
||||
detectMultiple: true, // Enable detection of multiple languages
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
console.log(`Primary language: ${result.metadata?.language}`);
|
||||
console.log(`Confidence score: ${result.metadata?.languageConfidence?.toFixed(3)}`);
|
||||
console.log(`All detected languages: ${result.detectedLanguages?.join(", ")}`);
|
||||
|
||||
// Use detected language for downstream processing
|
||||
if (result.detectedLanguages && result.detectedLanguages.length > 1) {
|
||||
console.log("Document contains multiple languages - enable multilingual NLP processing");
|
||||
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Language-Specific Extraction"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Extraction options for a single-language document: detect one primary
// language with a high confidence bar, and turn on quality processing.
const config = {
  languageDetection: {
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false,
  },
  // Quality processing is set as a top-level flag, matching how the
  // quality-processing snippets enable it (not nested under `quality`).
  enableQualityProcessing: true,
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/plain", config);
|
||||
|
||||
// Process result based on detected language
|
||||
const language = result.metadata?.language;
|
||||
console.log(`Processing document in language: ${language}`);
|
||||
|
||||
// Example: Apply language-specific rules
|
||||
const languageConfig: Record<string, { cleanWhitespace: boolean; normalizeText: boolean }> = {
|
||||
en: { cleanWhitespace: true, normalizeText: true },
|
||||
zh: { cleanWhitespace: false, normalizeText: true }, // Chinese: preserve whitespace patterns
|
||||
ja: { cleanWhitespace: false, normalizeText: false }, // Japanese: preserve as-is
|
||||
ar: { cleanWhitespace: true, normalizeText: true }, // Arabic
|
||||
};
|
||||
|
||||
const langConfig = languageConfig[language as string] || {
|
||||
cleanWhitespace: true,
|
||||
normalizeText: true,
|
||||
};
|
||||
console.log(`Language config: ${JSON.stringify(langConfig)}`);
|
||||
```
|
||||
171
docs/snippets/wasm/advanced/language_detection_multilingual.md
Normal file
171
docs/snippets/wasm/advanced/language_detection_multilingual.md
Normal file
@@ -0,0 +1,171 @@
|
||||
```typescript title="WASM - Detect and Process Multilingual Content"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.7,
|
||||
detectMultiple: true,
|
||||
},
|
||||
chunking: {
|
||||
maxChars: 800,
|
||||
chunkOverlap: 200,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
interface MultilingualChunk {
|
||||
index: number;
|
||||
text: string;
|
||||
language: string;
|
||||
confidence: number;
|
||||
}
|
||||
|
||||
// Detect language for each chunk
|
||||
// Tag every chunk with a language: start from the document-level detection
// and override it when the chunk contains characters from a specific script.
const multilingualChunks: MultilingualChunk[] =
  result.chunks?.map((chunk, idx) => {
    const text = chunk.content;
    let detectedLang = result.metadata?.language || "en";
    // `??` keeps a reported confidence of 0 instead of replacing it with 0.5.
    const confidence = result.metadata?.languageConfidence ?? 0.5;

    // Script ranges are spelled with \u escapes so both endpoints survive any
    // encoding round-trip. A class that loses its first endpoint starts with
    // a literal "-" and then matches every plain hyphen in the text.
    if (/[\u4e00-\u9fff]/.test(text)) detectedLang = "zh"; // Chinese (CJK ideographs)
    if (/[\u3040-\u309f\u30a0-\u30ff]/.test(text)) detectedLang = "ja"; // Japanese (hiragana + katakana)
    if (/[\uac00-\ud7af]/.test(text)) detectedLang = "ko"; // Korean (Hangul syllables)
    if (/[\u0600-\u06ff]/.test(text)) detectedLang = "ar"; // Arabic
    if (/[\u0400-\u04ff]/.test(text)) detectedLang = "ru"; // Russian (Cyrillic)

    return {
      index: idx,
      text: text.substring(0, 50), // short preview only
      language: detectedLang,
      confidence,
    };
  }) ?? [];
|
||||
|
||||
// Group chunks by language
|
||||
const chunksByLanguage = new Map<string, MultilingualChunk[]>();
|
||||
multilingualChunks.forEach((chunk) => {
|
||||
if (!chunksByLanguage.has(chunk.language)) {
|
||||
chunksByLanguage.set(chunk.language, []);
|
||||
}
|
||||
chunksByLanguage.get(chunk.language)!.push(chunk);
|
||||
});
|
||||
|
||||
console.log("Chunks by detected language:");
|
||||
chunksByLanguage.forEach((chunks, lang) => {
|
||||
console.log(` ${lang}: ${chunks.length} chunks`);
|
||||
chunks.slice(0, 2).forEach((c) => {
|
||||
console.log(` Chunk ${c.index}: "${c.text}..."`);
|
||||
});
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Language-Specific Text Processing"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
detectMultiple: true,
|
||||
},
|
||||
outputFormat: "markdown",
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/plain", config);
|
||||
|
||||
// Language-specific text normalization
|
||||
interface LanguageProcessor {
|
||||
normalize: (text: string) => string;
|
||||
splitSentences: (text: string) => string[];
|
||||
}
|
||||
|
||||
const processors: Record<string, LanguageProcessor> = {
|
||||
en: {
|
||||
normalize: (t) => t.replace(/\s+/g, " ").trim(),
|
||||
splitSentences: (t) => t.split(/[.!?]+/).filter((s) => s.length > 0),
|
||||
},
|
||||
zh: {
|
||||
normalize: (t) => t.replace(/\s+/g, "").trim(), // CJK: no word spacing
|
||||
splitSentences: (t) => t.split(/[。!?]+/).filter((s) => s.length > 0),
|
||||
},
|
||||
ja: {
|
||||
normalize: (t) => t.replace(/\s+/g, "").trim(),
|
||||
splitSentences: (t) => t.split(/[。!?]+/).filter((s) => s.length > 0),
|
||||
},
|
||||
ar: {
|
||||
normalize: (t) => t.replace(/\s+/g, " ").trim(),
|
||||
splitSentences: (t) => t.split(/[.!?،؟]+/).filter((s) => s.length > 0),
|
||||
},
|
||||
};
|
||||
|
||||
const language = result.metadata?.language || "en";
|
||||
const processor = processors[language] || processors["en"];
|
||||
|
||||
const normalized = processor.normalize(result.content);
|
||||
const sentences = processor.splitSentences(result.content);
|
||||
|
||||
console.log(`Language: ${language}`);
|
||||
console.log(`Normalized length: ${normalized.length}`);
|
||||
console.log(`Detected sentences: ${sentences.length}`);
|
||||
sentences.slice(0, 3).forEach((sent, idx) => {
|
||||
console.log(` [${idx + 1}] ${sent.substring(0, 60)}...`);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Multilingual Chunking Strategy"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
interface LanguageChunkConfig {
|
||||
maxChars: number;
|
||||
overlap: number;
|
||||
}
|
||||
|
||||
// Different chunking strategies per language
|
||||
const chunkConfigByLanguage: Record<string, LanguageChunkConfig> = {
|
||||
en: { maxChars: 512, overlap: 128 }, // English: word-based chunking
|
||||
zh: { maxChars: 256, overlap: 64 }, // Chinese: smaller chunks due to character density
|
||||
ja: { maxChars: 300, overlap: 75 }, // Japanese: medium chunks
|
||||
ar: { maxChars: 400, overlap: 100 }, // Arabic: larger chunks for context
|
||||
default: { maxChars: 512, overlap: 128 },
|
||||
};
|
||||
|
||||
// Detect language first
|
||||
const languageDetectConfig = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
minConfidence: 0.8,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const langResult = await extractBytes(bytes, "text/plain", languageDetectConfig);
|
||||
const detectedLang = langResult.metadata?.language || "en";
|
||||
|
||||
// Re-extract with language-specific chunking
|
||||
const chunkConfig = chunkConfigByLanguage[detectedLang] || chunkConfigByLanguage["default"];
|
||||
const finalConfig = {
|
||||
languageDetection: {
|
||||
enabled: true,
|
||||
},
|
||||
chunking: {
|
||||
maxChars: chunkConfig.maxChars,
|
||||
chunkOverlap: chunkConfig.overlap,
|
||||
},
|
||||
};
|
||||
|
||||
const finalResult = await extractBytes(bytes, "text/plain", finalConfig);
|
||||
console.log(`Language: ${detectedLang}`);
|
||||
console.log(`Chunking strategy: maxChars=${chunkConfig.maxChars}, overlap=${chunkConfig.overlap}`);
|
||||
console.log(`Generated ${finalResult.chunks?.length} chunks`);
|
||||
```
|
||||
37
docs/snippets/wasm/advanced/memory-management.ts
Normal file
37
docs/snippets/wasm/advanced/memory-management.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Extracts a list of files in fixed-size batches so that only a handful of
 * documents are held in flight at any time.
 *
 * Files within a batch are extracted concurrently; batches run one after
 * another, with an optional garbage-collection pass and a short pause
 * between them.
 *
 * @param files - Files to extract; each file's `type` is used as its MIME type.
 * @returns Extraction results in the same order as `files`.
 */
async function processLargeDocumentSet(files: File[]) {
  await initWasm();

  const BATCH_SIZE = 5;
  const results: Awaited<ReturnType<typeof extractBytes>>[] = [];

  // `gc` is only present when the engine exposes it (for example Node started
  // with --expose-gc). Reading it from `globalThis` also works in browsers and
  // workers, where the Node-only `global` identifier throws a ReferenceError.
  const { gc } = globalThis as { gc?: () => void };

  for (let i = 0; i < files.length; i += BATCH_SIZE) {
    const batch = files.slice(i, i + BATCH_SIZE);

    console.log(`Processing batch ${Math.floor(i / BATCH_SIZE) + 1}`);

    const batchResults = await Promise.all(
      batch.map(async (file) => {
        const arrayBuffer = await file.arrayBuffer();
        const bytes = new Uint8Array(arrayBuffer);
        return extractBytes(bytes, file.type);
      }),
    );

    results.push(...batchResults);

    if (typeof gc === "function") {
      console.log("Running garbage collection");
      gc();
    }

    // Yield to the event loop between batches.
    await new Promise((resolve) => setTimeout(resolve, 100));
  }

  return results;
}
|
||||
|
||||
processLargeDocumentSet([])
|
||||
.then(() => console.log("Done"))
|
||||
.catch(console.error);
|
||||
43
docs/snippets/wasm/advanced/parallel-extraction.ts
Normal file
43
docs/snippets/wasm/advanced/parallel-extraction.ts
Normal file
@@ -0,0 +1,43 @@
|
||||
import { detectRuntime, extractBytes, hasWorkers, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
async function extractInParallel(documents: Uint8Array[], mimeTypes: string[]) {
|
||||
await initWasm();
|
||||
|
||||
const _runtime = detectRuntime();
|
||||
const canUseWorkers = hasWorkers();
|
||||
|
||||
if (isBrowser() && canUseWorkers) {
|
||||
return extractWithWebWorkers(documents, mimeTypes);
|
||||
}
|
||||
|
||||
return Promise.all(documents.map((bytes, index) => extractBytes(bytes, mimeTypes[index])));
|
||||
}
|
||||
|
||||
/**
 * Extracts documents on a pool of Web Workers.
 *
 * Each worker handles one document at a time and pulls the next pending
 * document when it finishes. Keeping a single request in flight per worker
 * means every `message` event can be attributed to exactly one document, so
 * handlers are never overwritten while a reply is still outstanding.
 *
 * @param documents - Raw document bytes, one entry per document.
 * @param mimeTypes - MIME type for the document at the same index.
 * @returns The workers' reply payloads, in the same order as `documents`.
 *          Rejects with the first worker error.
 */
function extractWithWebWorkers(documents: Uint8Array[], mimeTypes: string[]) {
  if (documents.length === 0) {
    return Promise.resolve<unknown[]>([]);
  }

  // Never start more workers than there are documents to process.
  const poolSize = Math.max(1, Math.min(navigator.hardwareConcurrency ?? 2, documents.length));
  const workers = Array.from({ length: poolSize }, () => new Worker("extraction-worker.js"));

  const results: unknown[] = new Array(documents.length);
  let nextIndex = 0;

  // Sends one document to `worker` and resolves with its reply.
  const runJob = (worker: Worker, index: number) =>
    new Promise<unknown>((resolve, reject) => {
      worker.onmessage = (e) => resolve(e.data);
      worker.onerror = reject;
      worker.postMessage({ bytes: documents[index], mimeType: mimeTypes[index] });
    });

  // Keeps `worker` busy until no unclaimed documents remain.
  const drain = async (worker: Worker) => {
    while (nextIndex < documents.length) {
      const index = nextIndex++;
      results[index] = await runJob(worker, index);
    }
  };

  return Promise.all(workers.map(drain))
    .then(() => results)
    // Release the worker threads whether extraction succeeded or failed.
    .finally(() => workers.forEach((worker) => worker.terminate()));
}
|
||||
|
||||
function isBrowser() {
|
||||
return typeof window !== "undefined";
|
||||
}
|
||||
|
||||
extractInParallel([new Uint8Array([1, 2, 3])], ["application/pdf"])
|
||||
.then((results) => console.log(results))
|
||||
.catch(console.error);
|
||||
103
docs/snippets/wasm/advanced/quality_processing_config.md
Normal file
103
docs/snippets/wasm/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,103 @@
|
||||
```typescript title="WASM - Enable Quality Processing"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
console.log(`Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
|
||||
console.log(`Content: ${result.content.substring(0, 100)}...`);
|
||||
|
||||
// Quality score indicates text extraction quality (0.0-1.0)
|
||||
// Check for presence explicitly: a score of exactly 0 is the lowest possible
// quality and must still trigger the warning, which a truthiness test skips.
if (result.qualityScore != null && result.qualityScore < 0.5) {
  console.warn("Low quality extraction detected - consider OCR or alternative processing");
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Quality Monitoring"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
interface ExtractionQuality {
|
||||
contentLength: number;
|
||||
qualityScore: number | null;
|
||||
assessedAs: string;
|
||||
}
|
||||
|
||||
const config = {
|
||||
enableQualityProcessing: true,
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Normalise a missing score to null without discarding a genuine score of 0
// (`||` would turn 0 into null and the assessment into "unknown").
const reportedScore = result.qualityScore ?? null;

// Summary of the extraction: content size, raw score, and a coarse rating.
const quality: ExtractionQuality = {
  contentLength: result.content.length,
  qualityScore: reportedScore,
  assessedAs:
    reportedScore === null
      ? "unknown"
      : reportedScore > 0.8
        ? "high"
        : reportedScore > 0.5
          ? "medium"
          : "low",
};
|
||||
|
||||
console.log("Extraction Quality Report:");
|
||||
console.log(` Content size: ${quality.contentLength} bytes`);
|
||||
console.log(` Quality score: ${quality.qualityScore?.toFixed(3) || "N/A"}`);
|
||||
console.log(` Assessment: ${quality.assessedAs}`);
|
||||
|
||||
if (quality.assessedAs === "low") {
|
||||
console.log(
|
||||
" Recommendation: Review raw text for encoding issues or consider alternative extraction",
|
||||
);
|
||||
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Quality with OCR Fallback"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
/**
 * Extracts text with quality processing enabled and labels the outcome by score.
 *
 * No OCR or other fallback is performed here: a low score only produces a warning
 * and a different `method` label, the extracted content is returned either way.
 *
 * @param bytes - Raw document bytes.
 * @param mimeType - MIME type handed to `extractBytes`.
 * @returns The extracted content, the score used for the decision, and
 *   "degraded-text-extraction" when that score is below 0.5, else "text-extraction".
 */
async function extractWithQualityCheck(
  bytes: Uint8Array,
  mimeType: string,
): Promise<{ content: string; quality: number | null; method: string }> {
  const extraction = await extractBytes(bytes, mimeType, { enableQualityProcessing: true });

  // A missing score falls back to 0 and is therefore handled like a low score.
  const score = extraction.qualityScore || 0;
  const isDegraded = score < 0.5;

  if (isDegraded) {
    console.warn("Low quality text extraction - alternative processing recommended");
  }

  return {
    content: extraction.content,
    quality: score,
    method: isDegraded ? "degraded-text-extraction" : "text-extraction",
  };
}
|
||||
|
||||
// Run the quality-checked extraction on the document bytes and print a short report.
const bytes = new Uint8Array(buffer);
const extracted = await extractWithQualityCheck(bytes, "application/pdf");

const reportLines = [
  `Extraction method: ${extracted.method}`,
  `Quality score: ${extracted.quality?.toFixed(3)}`,
  `Content preview: ${extracted.content.substring(0, 80)}...`,
];
for (const line of reportLines) {
  console.log(line);
}
|
||||
```
|
||||
161
docs/snippets/wasm/advanced/quality_processing_example.md
Normal file
161
docs/snippets/wasm/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,161 @@
|
||||
```typescript title="WASM - Assess Text Quality"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
/** Heuristic, purely text-based indicators computed by `assessTextQuality`. */
interface TextQualityMetrics {
  /** `String.length` of the content (UTF-16 code units). */
  contentLength: number;
  /** Number of lines, counting blank ones. */
  lineCount: number;
  /** Mean length of the lines that contain non-whitespace text; 0 when there are none. */
  averageLineLength: number;
  /** Share of lines that are empty or whitespace-only. */
  emptyLineRatio: number;
  /** Share of characters that are neither letters, digits, whitespace nor basic punctuation. */
  specialCharRatio: number;
  /** Language tags guessed from the scripts present ("en", "zh", "ja", "ko"). */
  estimatedLanguages: string[];
}

/**
 * Computes simple quality heuristics for extracted text.
 *
 * @param content - Extracted text; may be empty.
 * @returns Line statistics, special-character ratio and a rough script-based language guess.
 */
function assessTextQuality(content: string): TextQualityMetrics {
  // Split on every single line break. Splitting on /\n+/ merges runs of newlines,
  // which makes blank lines disappear and keeps emptyLineRatio near zero.
  const lines = content.split(/\r?\n/);
  const nonEmptyLines = lines.filter((line) => line.trim().length > 0);

  // Unicode-aware classes: ASCII-only \w would count every CJK or accented letter
  // as a "special" character, contradicting the language detection below.
  const specialChars = (content.match(/[^\p{L}\p{M}\p{N}_\s.,:;!?]/gu) ?? []).length;
  // The `u` regex counts code points, so the denominator is counted the same way.
  const codePointCount = Array.from(content).length;

  // Simple language detection by script. Ranges are written as escapes so they
  // survive copy/paste: an endpoint lost in transit turns "-" into a literal hyphen.
  const scriptProbes: Array<[string, RegExp]> = [
    ["en", /[a-zA-Z]/],
    ["zh", /[\u4E00-\u9FFF]/], // CJK unified ideographs (also present in Japanese text)
    ["ja", /[\u3040-\u309F\u30A0-\u30FF]/], // hiragana + katakana
    ["ko", /[\uAC00-\uD7AF]/], // hangul syllables
  ];
  const detectedLangs = scriptProbes
    .filter(([, probe]) => probe.test(content))
    .map(([lang]) => lang);

  const nonEmptyLength = nonEmptyLines.reduce((sum, line) => sum + line.length, 0);

  return {
    contentLength: content.length,
    lineCount: lines.length,
    averageLineLength: nonEmptyLines.length > 0 ? nonEmptyLength / nonEmptyLines.length : 0,
    emptyLineRatio: (lines.length - nonEmptyLines.length) / lines.length,
    // Guard the empty string, which would otherwise yield 0 / 0 = NaN.
    specialCharRatio: codePointCount > 0 ? specialChars / codePointCount : 0,
    estimatedLanguages: detectedLangs,
  };
}
|
||||
|
||||
// Extract with quality processing enabled, then run the local text heuristics.
const config = { enableQualityProcessing: true };

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

const metrics = assessTextQuality(result.content);

// Pre-format the numeric fields so the report below is plain string assembly.
const averageLine = metrics.averageLineLength.toFixed(1);
const emptyLinePercent = (metrics.emptyLineRatio * 100).toFixed(1);
const specialCharPercent = (metrics.specialCharRatio * 100).toFixed(2);
const languageList = metrics.estimatedLanguages.join(", ") || "unknown";
const extractorScore = result.qualityScore?.toFixed(3) || "N/A";

const qualityReport = [
  "Text Quality Assessment:",
  ` Length: ${metrics.contentLength} characters`,
  ` Lines: ${metrics.lineCount} total, avg ${averageLine} chars/line`,
  ` Empty lines: ${emptyLinePercent}%`,
  ` Special chars: ${specialCharPercent}%`,
  ` Languages: ${languageList}`,
  ` Kreuzberg quality score: ${extractorScore}`,
];
for (const line of qualityReport) {
  console.log(line);
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Quality-Based Content Filtering"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const config = {
  enableQualityProcessing: true,
  chunking: {
    maxChars: 1000,
    chunkOverlap: 200,
    trim: true,
  },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

/** Heuristic verdict for a single chunk. */
interface QualityFilteredChunk {
  /** Position of the chunk in `result.chunks`. */
  index: number;
  /** Preview: the first 50 characters of the chunk text. */
  content: string;
  /** Heuristic score from `scoreChunkContent`. */
  quality: number;
  /** Whether the score reached `qualityThreshold`. */
  kept: boolean;
}

// Minimum heuristic score a chunk needs to be kept. This is compared against the
// combined score below, not against the raw non-whitespace ratio.
const qualityThreshold = 0.3;

/**
 * Scores chunk text from simple content characteristics.
 *
 * @param text - Chunk text; may be empty.
 * @returns (non-whitespace ratio + 0.2 if it has digits + 0.1 if it has punctuation) / 2,
 *   or 0 for an empty chunk.
 */
function scoreChunkContent(text: string): number {
  // An empty chunk would divide 0 by 0 and yield NaN; score it as 0 instead.
  if (text.length === 0) return 0;

  const nonWhitespaceRatio = text.replace(/\s/g, "").length / text.length;
  const hasNumbers = /\d/.test(text);
  // No `g` flag: it adds lastIndex state that a one-off `.test()` does not need.
  const hasPunctuation = /[.!?,;:]/.test(text);

  return (nonWhitespaceRatio + (hasNumbers ? 0.2 : 0) + (hasPunctuation ? 0.1 : 0)) / 2;
}

// Filter chunks based on quality heuristics. `??` only falls back when chunks are absent.
const filteredChunks: QualityFilteredChunk[] = (result.chunks ?? []).map((chunk, idx) => {
  const contentQuality = scoreChunkContent(chunk.content);

  return {
    index: idx,
    content: chunk.content.substring(0, 50),
    quality: contentQuality,
    kept: contentQuality >= qualityThreshold,
  };
});

const keptChunks = filteredChunks.filter((c) => c.kept);
console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`);

// Show at most the first three chunks that passed the filter.
keptChunks.slice(0, 3).forEach((c) => {
  console.log(` Chunk ${c.index}: quality=${c.quality.toFixed(2)}, "${c.content}..."`);
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Content Encoding Validation"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Extract the document with quality processing switched on.
const config = { enableQualityProcessing: true };

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
/** Result of scanning extracted text for signs of encoding damage. */
interface EncodingValidation {
  /** True when at least one U+FFFD replacement character was found. */
  hasInvalidChars: boolean;
  /** Number of U+FFFD replacement characters in the text. */
  invalidCharCount: number;
  /** `invalidCharCount` divided by the text length; 0 when none were found. */
  replacementRatio: number;
  /** "UTF-8" when any non-ASCII character is present, otherwise "ASCII". */
  estimatedEncoding: string;
}

/**
 * Validates text encoding quality by counting Unicode replacement characters.
 *
 * @param content - Extracted text; may be empty.
 * @returns Replacement-character statistics and a coarse encoding guess.
 */
function validateEncoding(content: string): EncodingValidation {
  // U+FFFD is written as an escape: a literal replacement character does not survive
  // re-encoding of this file and would leave a pattern that matches ordinary text.
  const replacementChars = (content.match(/\uFFFD/g) ?? []).length;
  const hasInvalidChars = replacementChars > 0;
  // content.length is at least 1 whenever a match exists, so this cannot divide by zero.
  const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0;

  // Guess encoding based on content patterns: anything outside 7-bit ASCII counts as UTF-8.
  const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII";

  return {
    hasInvalidChars,
    invalidCharCount: replacementChars,
    replacementRatio,
    estimatedEncoding,
  };
}
|
||||
|
||||
// Scan the extracted text and print the findings next to the extractor's own score.
const validation = validateEncoding(result.content);

const encodingStatus = validation.hasInvalidChars ? "DEGRADED - encoding issues detected" : "OK";
const replacementPercent = (validation.replacementRatio * 100).toFixed(4);

const encodingReport = [
  "Content Encoding Validation:",
  ` Estimated encoding: ${validation.estimatedEncoding}`,
  ` Invalid characters: ${validation.invalidCharCount}`,
  ` Replacement ratio: ${replacementPercent}%`,
  ` Status: ${encodingStatus}`,
  ` Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`,
];
for (const line of encodingReport) {
  console.log(line);
}
|
||||
```
|
||||
39
docs/snippets/wasm/advanced/streaming-extraction.ts
Normal file
39
docs/snippets/wasm/advanced/streaming-extraction.ts
Normal file
@@ -0,0 +1,39 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Downloads a document while logging progress, then extracts it in one call.
 *
 * Only the download is streamed: the chunks are concatenated into a single buffer
 * before `extractBytes` runs, so the whole document is held in memory.
 *
 * @param url - Location of the document to fetch.
 * @param mimeType - MIME type handed to `extractBytes`. Defaults to "application/pdf",
 *   the value that used to be hard-coded.
 * @returns The result of `extractBytes` for the downloaded bytes.
 * @throws Error when the HTTP status is not OK or the response has no body.
 */
async function extractStreamingDocument(url: string, mimeType: string = "application/pdf") {
  await initWasm();

  const response = await fetch(url);
  if (!response.ok) throw new Error(`HTTP ${response.status}`);

  const reader = response.body?.getReader();
  if (!reader) throw new Error("No response body");

  const chunks: Uint8Array[] = [];
  let totalSize = 0;

  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;

      chunks.push(value);
      totalSize += value.length;
      console.log(`Received ${totalSize} bytes`);
    }
  } finally {
    // Release the stream lock even when a read fails part-way through.
    reader.releaseLock();
  }

  // Concatenate the received chunks into one contiguous buffer.
  const fullBuffer = new Uint8Array(totalSize);
  let offset = 0;
  for (const chunk of chunks) {
    fullBuffer.set(chunk, offset);
    offset += chunk.length;
  }

  console.log("Document fully received, extracting...");
  const result = await extractBytes(fullBuffer, mimeType);

  return result;
}
|
||||
|
||||
// Demo invocation: log the extraction result, or the error if any step fails.
void (async () => {
  try {
    const extraction = await extractStreamingDocument("https://example.com/document.pdf");
    console.log(extraction);
  } catch (error) {
    console.error(error);
  }
})();
|
||||
75
docs/snippets/wasm/advanced/token_reduction_config.md
Normal file
75
docs/snippets/wasm/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,75 @@
|
||||
```typescript title="WASM - Token Reduction Configuration"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Extraction options requesting token reduction.
const config = {
  tokenReduction: { mode: "aggressive", preserveImportantWords: true },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

const text = result.content;
console.log(`Original content: ${text.length} characters`);
console.log(`Preview: ${text.substring(0, 100)}...`);

// Token reduction modes as described by this snippet:
// - "aggressive": maximum reduction
// - "balanced": moderate reduction
// - "conservative": minimal reduction
// NOTE(review): confirm these mode names against the Kreuzberg token reduction
// reference before relying on them; nothing here validates them.
|
||||
```
|
||||
|
||||
```typescript title="WASM - Token Reduction Modes"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
/** Outcome of extracting one document with a given token reduction mode. */
interface TokenReductionResult {
  /** Token reduction mode that was requested. */
  mode: string;
  /** Character count of the text extracted without token reduction. */
  originalSize: number;
  /** Character count of the text extracted with this mode. */
  reducedSize: number;
  /** `1 - reducedSize / originalSize`; 0 when the baseline text is empty. */
  reductionRatio: number;
  /** First 80 characters of the text extracted with this mode. */
  preview: string;
}

/**
 * Extracts the same document once per token reduction mode and compares text sizes.
 *
 * A baseline extraction without `tokenReduction` supplies `originalSize`, so both
 * sizes are measured in characters and the ratio compares like with like.
 * Assumes token reduction is reflected in `result.content` — TODO confirm.
 *
 * @param bytes - Raw PDF bytes.
 * @returns One entry per mode, in the order conservative, balanced, aggressive.
 */
async function compareTokenReductionModes(bytes: Uint8Array): Promise<TokenReductionResult[]> {
  // NOTE(review): confirm these mode names against the Kreuzberg token reduction reference.
  const modes = ["conservative", "balanced", "aggressive"];
  const results: TokenReductionResult[] = [];

  // Baseline: the same document with no token reduction configured.
  const baseline = await extractBytes(bytes, "application/pdf");
  const originalSize = baseline.content.length;

  // Extractions run one after another rather than concurrently.
  for (const mode of modes) {
    const config = {
      tokenReduction: {
        mode,
        preserveImportantWords: true,
      },
    };

    const reduced = await extractBytes(bytes, "application/pdf", config);
    const reducedSize = reduced.content.length;

    results.push({
      mode,
      originalSize,
      reducedSize,
      // Guard an empty baseline, which would otherwise divide by zero.
      reductionRatio: originalSize > 0 ? 1 - reducedSize / originalSize : 0,
      preview: reduced.content.substring(0, 80),
    });
  }

  return results;
}
|
||||
|
||||
// Compare every mode on the same document and print one summary per mode.
const bytes = new Uint8Array(buffer);
const modeComparison = await compareTokenReductionModes(bytes);

console.log("Token Reduction Mode Comparison:");
for (const entry of modeComparison) {
  const reductionPercent = (entry.reductionRatio * 100).toFixed(1);
  console.log(` ${entry.mode}:`);
  console.log(` Original: ${entry.originalSize} chars`);
  console.log(` Reduction: ${reductionPercent}%`);
}
|
||||
```
|
||||
149
docs/snippets/wasm/advanced/token_reduction_example.md
Normal file
149
docs/snippets/wasm/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,149 @@
|
||||
```typescript title="WASM - Token Counting and Cost Estimation"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Extraction options requesting token reduction.
// NOTE(review): confirm the mode name against the Kreuzberg token reduction reference.
const config = {
  tokenReduction: { mode: "balanced", preserveImportantWords: true },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
/**
 * Simple token counting by character length (approximation: 1 token ≈ 4 chars).
 *
 * @param text - Text to measure; may be empty.
 * @param charsPerToken - Assumed average characters per token. Defaults to 4.
 * @returns `text.length / charsPerToken`, rounded up.
 */
function estimateTokenCount(text: string, charsPerToken: number = 4): number {
  return Math.ceil(text.length / charsPerToken);
}
|
||||
|
||||
/** Cost projection for sending the extracted text to an LLM (example: GPT-4 Turbo pricing). */
interface PricingEstimate {
  tokenCount: number;
  inputCost: number;
  outputCostEstimate: number;
  totalEstimate: number;
}

const tokenCount = estimateTokenCount(result.content);
const inputPricePerToken = 0.00001; // $0.01/1K tokens
const outputPricePerToken = 0.00003; // $0.03/1K tokens

// The output side assumes the response is ~50% of the input size.
const inputCost = tokenCount * inputPricePerToken;
const outputCostEstimate = tokenCount * outputPricePerToken * 0.5;

const costEstimate: PricingEstimate = {
  tokenCount,
  inputCost,
  outputCostEstimate,
  totalEstimate: inputCost + outputCostEstimate,
};

const costReport = [
  "Token and Cost Analysis:",
  ` Estimated tokens: ${costEstimate.tokenCount}`,
  ` Input cost: $${costEstimate.inputCost.toFixed(6)}`,
  ` Output cost (est.): $${costEstimate.outputCostEstimate.toFixed(6)}`,
  ` Total cost (est.): $${costEstimate.totalEstimate.toFixed(6)}`,
];
for (const line of costReport) {
  console.log(line);
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Token Reduction for Context Windows"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
/** How an extracted text relates to a model context window. */
interface ContextWindowFit {
  /** `String.length` of the content. */
  contentLength: number;
  /** Token estimate: content length divided by 4, rounded up. */
  estimatedTokens: number;
  /** True when the estimate does not exceed the window size. */
  fitsInWindow: boolean;
  /** `estimatedTokens / contextWindowSize`; 1 means exactly full. */
  utilization: number;
}

/**
 * Estimates whether text fits into a context window of the given size.
 *
 * @param content - Text to measure; may be empty.
 * @param contextWindowSize - Window size in tokens. Defaults to 4096.
 * @returns Length, token estimate, fit verdict and utilisation.
 */
function checkContextWindowFit(
  content: string,
  contextWindowSize: number = 4096,
): ContextWindowFit {
  const estimatedTokens = Math.ceil(content.length / 4);
  // Inclusive comparison: text that exactly fills the window (utilisation 1.0) fits.
  const fitsInWindow = estimatedTokens <= contextWindowSize;
  const utilization = estimatedTokens / contextWindowSize;

  return {
    contentLength: content.length,
    estimatedTokens,
    fitsInWindow,
    utilization,
  };
}
|
||||
|
||||
// Use aggressive mode for large documents.
const config = {
  tokenReduction: { mode: "aggressive", preserveImportantWords: true },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Check the extracted text against a 4096-token window.
const contextFit = checkContextWindowFit(result.content, 4096);

const fitLabel = contextFit.fitsInWindow ? "YES" : "NO";
const utilizationPercent = (contextFit.utilization * 100).toFixed(1);

const contextReport = [
  "Context Window Analysis:",
  ` Content: ${contextFit.contentLength} characters`,
  ` Tokens (est.): ${contextFit.estimatedTokens}`,
  ` Fits in 4K context: ${fitLabel}`,
  ` Utilization: ${utilizationPercent}%`,
];
for (const line of contextReport) {
  console.log(line);
}

if (!contextFit.fitsInWindow) {
  console.log(" Note: Consider chunking or more aggressive token reduction");
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Selective Token Preservation"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Extraction options requesting token reduction that preserves important words.
// NOTE(review): confirm the mode name against the Kreuzberg token reduction reference.
const config = {
  tokenReduction: { mode: "balanced", preserveImportantWords: true },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
/** A frequently occurring long word and its heuristic weight. */
interface ImportantTerm {
  /** Lower-cased word with punctuation removed. */
  term: string;
  /** Number of occurrences in the content. */
  frequency: number;
  /** `ln(frequency) * (term length / 10)`. */
  importance: number;
}

/**
 * Extracts important terms manually: long words that occur often.
 *
 * @param content - Text to analyse; may be empty.
 * @param threshold - Minimum number of occurrences for a word to qualify. Defaults to 3.
 * @returns Up to 20 terms, sorted by descending importance.
 */
function extractImportantTerms(content: string, threshold: number = 3): ImportantTerm[] {
  const minTermLength = 6; // only consider longer words (more than 5 characters)
  const maxTerms = 20;

  const frequencyMap = new Map<string, number>();

  for (const word of content.toLowerCase().split(/\s+/)) {
    // Strip punctuation with Unicode-aware classes. ASCII-only \w would also strip
    // accented letters and turn "résumés" into "rsums".
    const cleaned = word.replace(/[^\p{L}\p{M}\p{N}_]/gu, "");
    if (cleaned.length >= minTermLength) {
      frequencyMap.set(cleaned, (frequencyMap.get(cleaned) ?? 0) + 1);
    }
  }

  return Array.from(frequencyMap.entries())
    .filter(([, freq]) => freq >= threshold)
    .map(([term, freq]) => ({
      term,
      frequency: freq,
      importance: Math.log(freq) * (term.length / 10),
    }))
    .sort((a, b) => b.importance - a.importance)
    .slice(0, maxTerms);
}
|
||||
|
||||
// Rank the long, frequent words of the extracted text and list them.
const importantTerms = extractImportantTerms(result.content);

console.log("Important Terms (likely preserved by token reduction):");
for (const entry of importantTerms) {
  const weight = entry.importance.toFixed(2);
  console.log(` "${entry.term}": ${entry.frequency} occurrences (importance: ${weight})`);
}
|
||||
```
|
||||
57
docs/snippets/wasm/advanced/worker-extraction.ts
Normal file
57
docs/snippets/wasm/advanced/worker-extraction.ts
Normal file
@@ -0,0 +1,57 @@
|
||||
/** A queued extraction request plus the callbacks that settle the promise from `extract`. */
type ExtractionTask = {
  bytes: Uint8Array;
  mimeType: string;
  resolve: (value: unknown) => void;
  reject: (reason: unknown) => void;
};

/**
 * Pool of Web Workers that run extractions off the calling thread.
 *
 * Each worker handles one task at a time. A dispatched task stays associated with
 * its worker until that worker answers, so every reply settles the promise of the
 * request that produced it.
 * Assumes "extraction-worker.js" posts exactly one message per request — TODO confirm
 * against the worker script.
 */
class ExtractionWorker {
  /** Every worker owned by this pool. */
  private readonly workerPool: Worker[] = [];
  /** Workers that currently have no task assigned. */
  private readonly idleWorkers: Worker[] = [];
  /** Task currently being processed by each busy worker. */
  private readonly inFlight = new Map<Worker, ExtractionTask>();
  /** Tasks waiting for a free worker, oldest first. */
  private readonly taskQueue: ExtractionTask[] = [];
  private terminated = false;

  /**
   * @param workerCount - Number of workers to start. Defaults to
   *   `navigator.hardwareConcurrency` when available, otherwise 2.
   */
  constructor(workerCount?: number) {
    const count =
      workerCount ?? (typeof navigator !== "undefined" ? (navigator.hardwareConcurrency ?? 2) : 2);
    for (let i = 0; i < count; i++) {
      const worker = new Worker("extraction-worker.js");
      worker.onmessage = (e) => this.handleWorkerMessage(worker, e.data);
      worker.onerror = (e) => this.handleWorkerError(worker, e);
      this.workerPool.push(worker);
      this.idleWorkers.push(worker);
    }
  }

  /**
   * Queues an extraction and resolves with whatever the worker posts back.
   *
   * @param bytes - Raw document bytes, sent to the worker as-is.
   * @param mimeType - MIME type sent along with the bytes.
   * @returns Promise settled with the worker's reply, or rejected on a worker error
   *   or when the pool is terminated first.
   */
  extract(bytes: Uint8Array, mimeType: string): Promise<unknown> {
    return new Promise<unknown>((resolve, reject) => {
      if (this.terminated) {
        reject(new Error("ExtractionWorker has been terminated"));
        return;
      }
      this.taskQueue.push({ bytes, mimeType, resolve, reject });
      this.processTasks();
    });
  }

  /** Hands queued tasks to idle workers until either runs out. */
  private processTasks() {
    while (this.taskQueue.length > 0 && this.idleWorkers.length > 0) {
      const worker = this.idleWorkers.shift();
      const task = this.taskQueue.shift();
      if (!worker || !task) return; // unreachable given the loop condition

      // Remember which task this worker is running so its reply can be matched.
      this.inFlight.set(worker, task);
      try {
        worker.postMessage({ bytes: task.bytes, mimeType: task.mimeType });
      } catch (error) {
        // The message could not be sent: free the worker and fail only this task.
        this.inFlight.delete(worker);
        this.idleWorkers.push(worker);
        task.reject(error);
      }
    }
  }

  /** Takes the task assigned to `worker`, marks the worker idle and returns the task. */
  private release(worker: Worker): ExtractionTask | undefined {
    const task = this.inFlight.get(worker);
    if (!task) return undefined; // unsolicited message, or the pool was terminated
    this.inFlight.delete(worker);
    this.idleWorkers.push(worker);
    return task;
  }

  private handleWorkerMessage(worker: Worker, result: unknown) {
    const task = this.release(worker);
    if (task) {
      task.resolve(result);
      this.processTasks();
    }
  }

  private handleWorkerError(worker: Worker, error: unknown) {
    const task = this.release(worker);
    if (task) {
      task.reject(error);
      this.processTasks();
    }
  }

  /** Stops all workers and rejects every task that has not completed. */
  terminate() {
    this.terminated = true;
    this.workerPool.forEach((w) => w.terminate());

    const reason = new Error("ExtractionWorker terminated before the task completed");
    for (const task of this.inFlight.values()) task.reject(reason);
    for (const task of this.taskQueue) task.reject(reason);

    this.inFlight.clear();
    this.taskQueue.length = 0;
    this.idleWorkers.length = 0;
  }
}
|
||||
Reference in New Issue
Block a user