Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
```typescript title="WASM"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Chunk the text and request document structure alongside it.
const config = {
  chunking: { maxChars: 1500, chunkOverlap: 300 },
  includeDocumentStructure: true,
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Build a page -> chunk-index lookup for source attribution.
const chunkPageMap = new Map<number, number[]>();
(result.chunks ?? []).forEach((chunk, chunkIndex) => {
  const firstPage = chunk.metadata?.firstPage;
  const lastPage = chunk.metadata?.lastPage;
  // A chunk without a page range cannot be attributed to any page.
  if (firstPage === undefined || lastPage === undefined) {
    return;
  }
  for (let page = firstPage; page <= lastPage; page++) {
    const indices = chunkPageMap.get(page);
    if (indices) {
      indices.push(chunkIndex);
    } else {
      chunkPageMap.set(page, [chunkIndex]);
    }
  }
});

// Print, per page, the chunks that touch it plus a short preview of each.
for (const [pageNum, chunkIndices] of chunkPageMap) {
  console.log(`Page ${pageNum}: Chunks ${chunkIndices.join(", ")}`);
  for (const idx of chunkIndices) {
    const chunk = result.chunks![idx];
    console.log(` Content: "${chunk.content.substring(0, 60)}..."`);
  }
}
```
<!-- snippet:syntax-only --> - Requires document structure parsing during extraction.

View File

@@ -0,0 +1,70 @@
```typescript title="WASM - Fixed-Size Chunks"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  chunking: { maxChars: 2000, chunkOverlap: 400, trim: true },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Report each chunk's ordinal, byte range and a 50-character preview.
for (const chunk of result.chunks ?? []) {
  const meta = chunk.metadata;
  console.log(`Chunk ${meta?.chunkIndex}/${meta?.totalChunks}`);
  console.log(` Position: ${meta?.byteStart}-${meta?.byteEnd}`);
  console.log(` Content: "${chunk.content.substring(0, 50)}..."`);
}
```
```typescript title="WASM - Markdown-Aware Chunking"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  chunking: {
    chunkerType: "markdown",
    maxChars: 1500,
    prependHeadingContext: true,
  },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);

for (const chunk of result.chunks ?? []) {
  // Content already includes heading context prepended
  console.log(chunk.content.substring(0, 80));

  // Render the heading trail as "# Title > ## Section"; stays undefined when
  // the chunk carries no heading context.
  const headingPath = chunk.metadata?.headingContext?.headings
    ?.map((h) => `${"#".repeat(h.level)} ${h.text}`)
    .join(" > ");
  console.log(` Heading path: ${headingPath}`);
}
```
```typescript title="WASM - Semantic Chunking with Topic Threshold"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  chunking: {
    chunkerType: "semantic",
    maxChars: 1000,
    topicThreshold: 0.5, // Boundary detection at 50% topic change
    chunkOverlap: 100,
  },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);

// Summarise how many chunks were produced and how long each one is.
const semanticChunks = result.chunks;
console.log(`Generated ${semanticChunks?.length} semantic chunks`);
for (const chunk of semanticChunks ?? []) {
  console.log(`Chunk ${chunk.metadata?.chunkIndex}: ${chunk.content.length} chars`);
}
```

View File

@@ -0,0 +1,80 @@
```typescript title="WASM - Chunking for RAG Pipeline"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  chunking: {
    maxChars: 512, // Smaller chunks for vector DB efficiency
    chunkOverlap: 100,
    trim: true,
  },
  includeDocumentStructure: true,
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Shape every chunk into an { id, text, metadata } record for ingestion.
const ragChunks = (result.chunks ?? []).map((chunk, idx) => {
  const chunkMeta = chunk.metadata;
  return {
    id: `${result.metadata?.filename || "doc"}_chunk_${idx}`,
    text: chunk.content,
    metadata: {
      source: result.metadata?.filename,
      chunk_index: chunkMeta?.chunkIndex,
      total_chunks: chunkMeta?.totalChunks,
      first_page: chunkMeta?.firstPage,
      last_page: chunkMeta?.lastPage,
      byte_position: `${chunkMeta?.byteStart}-${chunkMeta?.byteEnd}`,
    },
    // Embedding would be added by vector DB embedding model
  };
});

console.log(`Prepared ${ragChunks.length} chunks for RAG ingestion`);
for (const c of ragChunks.slice(0, 3)) {
  console.log(`Chunk ${c.id}: ${c.text.substring(0, 50)}...`);
}

// Example: Send to vector database (e.g., Pinecone, Weaviate, Milvus)
// const vectors = await Promise.all(
//   ragChunks.map(async (c) => ({
//     id: c.id,
//     values: await embedModel.embed(c.text),
//     metadata: c.metadata,
//   })),
// );
// const vectorResults = await vectorDb.upsert(vectors);
```
```typescript title="WASM - RAG with Retrieval Context"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  chunking: {
    maxChars: 768,
    chunkOverlap: 200,
    prependHeadingContext: true, // For markdown/docs
  },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/markdown", config);

/** One retrievable unit built from a single chunk. */
interface RagDocument {
  id: string;
  query_text: string;
  context: string;
  page: number | null;
}

// Build retrieval augmented context
const ragDocs: RagDocument[] = [];
(result.chunks ?? []).forEach((chunk, idx) => {
  ragDocs.push({
    id: `chunk_${idx}`,
    query_text: chunk.content, // Text to embed and search
    context: chunk.content, // Full context (includes heading)
    page: chunk.metadata?.firstPage ?? null,
  });
});

console.log(`Built RAG documents: ${ragDocs.length}`);
```

View File

@@ -0,0 +1,50 @@
import type { ExtractionResult } from "@kreuzberg/wasm";
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/** A named, asynchronous transformation applied to an extraction result. */
interface ProcessingStep {
/** Human-readable label for the step. */
name: string;
/** Receives an extraction result and resolves to the (possibly modified) result. */
process: (result: ExtractionResult) => Promise<ExtractionResult>;
}
/**
 * Extracts a document and threads the result through `steps` in array order.
 *
 * @param steps - Post-processing stages; each receives the previous stage's output.
 * @param bytes - Raw document bytes.
 * @param mimeType - MIME type passed to `extractBytes`.
 * @returns The result produced by the last step (or the raw extraction when `steps` is empty).
 */
async function createExtractionPipeline(
  steps: ProcessingStep[],
  bytes: Uint8Array,
  mimeType: string,
) {
  await initWasm();

  let current = await extractBytes(bytes, mimeType);
  for (let i = 0; i < steps.length; i++) {
    const step = steps[i];
    console.log(`Executing step: ${step.name}`);
    // Steps run strictly one after another; each sees the previous output.
    current = await step.process(current);
  }
  return current;
}
// Example pipeline: one real transformation followed by two pass-through
// placeholders that return the result untouched.
const pipeline: ProcessingStep[] = [
  {
    name: "Text Normalization",
    process: async (result) => {
      // Collapse every whitespace run to a single space and trim the ends.
      const content = result.content.replace(/\s+/g, " ").trim();
      return { ...result, content };
    },
  },
  {
    name: "Language Detection",
    process: async (result) => {
      return result;
    },
  },
  {
    name: "Chunking",
    process: async (result) => {
      return result;
    },
  },
];
// Fetch the sample document first, then run it through the pipeline.
const pipelineResponse = await fetch("doc.pdf");
const pipelineBytes = new Uint8Array(await pipelineResponse.arrayBuffer());

createExtractionPipeline(pipeline, pipelineBytes, "application/pdf")
  .then((r) => {
    console.log("Pipeline complete:", r);
  })
  .catch(console.error);

View File

@@ -0,0 +1,34 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Runs `extractBytes`, retrying with exponential backoff when it throws.
 *
 * @param bytes - Raw document bytes.
 * @param mimeType - MIME type passed to `extractBytes`.
 * @param maxRetries - Total number of attempts (must be at least 1).
 * @returns The first successful extraction result.
 * @throws RangeError when `maxRetries` is below 1 or not a number.
 * @throws Error after the last failed attempt; the underlying error is
 *   attached as the `cause` property.
 */
async function extractWithRetry(bytes: Uint8Array, mimeType: string, maxRetries: number = 3) {
  // Without this guard the loop body never runs and the caller gets a
  // misleading "failed after 0 attempts: undefined" error.
  if (!(maxRetries >= 1)) {
    throw new RangeError(`maxRetries must be at least 1, got ${maxRetries}`);
  }

  await initWasm();

  let lastError: Error | null = null;
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    try {
      console.log(`Extraction attempt ${attempt}/${maxRetries}`);
      const result = await extractBytes(bytes, mimeType);
      console.log("Extraction successful");
      return result;
    } catch (error) {
      lastError = error instanceof Error ? error : new Error(String(error));
      console.warn(`Attempt ${attempt} failed:`, lastError.message);
      if (attempt < maxRetries) {
        // Backoff doubles per attempt: 200ms, 400ms, 800ms, ...
        const delay = 2 ** attempt * 100;
        console.log(`Retrying in ${delay}ms...`);
        await new Promise((resolve) => setTimeout(resolve, delay));
      }
    }
  }

  const failure = new Error(
    `Extraction failed after ${maxRetries} attempts: ${lastError?.message}`,
  );
  // Keep the original error (and its stack) reachable for callers.
  Object.assign(failure, { cause: lastError });
  throw failure;
}
// Fetch the sample document, then extract it with the default retry budget.
const retryResponse = await fetch("doc.pdf");
const retryBytes = new Uint8Array(await retryResponse.arrayBuffer());

extractWithRetry(retryBytes, "application/pdf")
  .then((r) => {
    console.log("Final result:", r);
  })
  .catch(console.error);

View File

@@ -0,0 +1,105 @@
```typescript title="WASM - Keyword Extraction Setup"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Note: Keyword extraction requires the 'keywords' feature,
// which may not be available in all WASM builds.
// This example shows the configuration structure.
const config = {
  // Extraction configuration
  outputFormat: "markdown",
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Keyword extraction would be performed on the extracted text
// using external libraries or post-processing
console.log(`Extracted text: ${result.content.substring(0, 100)}...`);

// Example post-processing to extract keywords
// (requires external keyword extraction library)
const lowered = result.content.toLowerCase();
const words = lowered.split(/\s+/).filter((w) => w.length > 4); // Simple heuristic: words > 4 chars

// A Set keeps the first occurrence of each word, in encounter order.
const keywords = new Set<string>();
for (const word of words) {
  keywords.add(word);
}

const firstTen = Array.from(keywords).slice(0, 10);
console.log(`Extracted keywords: ${firstTen.join(", ")}`);
```
```typescript title="WASM - Keyword Filtering"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  outputFormat: "markdown",
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Common stopwords to exclude
const stopwords = new Set([
  "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
  "for", "of", "with", "from", "by", "is", "are", "was", "were", "be",
  "been", "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "should",
]);

// Extract and filter keywords
const text = result.content.toLowerCase();
const words = text.split(/\s+/);
const keywordCounts = new Map<string, number>();
for (const word of words) {
  // Strip everything that is not a word character before counting.
  const cleaned = word.replace(/[^\w]/g, "");
  if (cleaned.length <= 4 || stopwords.has(cleaned)) {
    continue;
  }
  keywordCounts.set(cleaned, (keywordCounts.get(cleaned) || 0) + 1);
}

// Get top keywords by frequency
const rankedEntries = Array.from(keywordCounts.entries());
rankedEntries.sort((a, b) => b[1] - a[1]);
const topKeywords = rankedEntries.slice(0, 10).map(([word, count]) => `${word} (${count})`);

console.log(`Top keywords: ${topKeywords.join(", ")}`);
```
<!-- snippet:syntax-only --> - Native keyword extraction requires the `keywords` feature which may not be compiled into WASM builds.

View File

@@ -0,0 +1,110 @@
```typescript title="WASM - Extract and Score Keywords"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  outputFormat: "markdown",
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// TF-IDF style keyword extraction (simplified)
interface Keyword {
  term: string;
  frequency: number;
  uniquePositions: number[];
  score: number;
}

const text = result.content.toLowerCase();
const words = text.split(/[\s\n\t]+/);

// Record word positions
const tokenMap = new Map<string, number[]>();
for (const [idx, word] of words.entries()) {
  const cleaned = word.replace(/[^\w]/g, "");
  if (cleaned.length <= 3) {
    continue;
  }
  const positions = tokenMap.get(cleaned);
  if (positions) {
    positions.push(idx);
  } else {
    tokenMap.set(cleaned, [idx]);
  }
}

// Calculate keyword scores
const keywords: Keyword[] = [];
for (const [term, positions] of tokenMap) {
  const frequency = positions.length;
  keywords.push({
    term,
    frequency,
    uniquePositions: positions,
    // Frequency weighted by how rare the term is within this document.
    score: positions.length * Math.log(words.length / positions.length),
  });
}

// Sort by score (TF-IDF approximation)
keywords.sort((a, b) => b.score - a.score);

// Top 15 keywords
const topKeywords = keywords.slice(0, 15);
console.log("Top Keywords:");
for (const kw of topKeywords) {
  console.log(` ${kw.term}: ${kw.frequency} occurrences (score: ${kw.score.toFixed(2)})`);
}
```
```typescript title="WASM - Keyword Context Window"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  outputFormat: "markdown",
  chunking: { maxChars: 1000, chunkOverlap: 200 },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

/** A keyword together with the text windows in which it was found. */
interface KeywordContext {
  keyword: string;
  contexts: string[];
}
/**
 * Finds every case-insensitive occurrence of `keyword` in `text` and returns
 * each one together with up to `contextWindow` characters on either side.
 *
 * The keyword is matched literally: regular-expression metacharacters in it
 * (for example `.`, `+`, `$`) have no special meaning.
 *
 * @param text - Text to search.
 * @param keyword - Literal phrase to look for; an empty keyword yields no matches.
 * @param contextWindow - Number of characters to include before and after a match.
 * @returns One context string per occurrence, in order of appearance.
 */
function extractKeywordContexts(
  text: string,
  keyword: string,
  contextWindow: number = 50,
): string[] {
  // An empty pattern matches at every position without advancing, which
  // would loop forever.
  if (keyword.length === 0) {
    return [];
  }

  // Escape metacharacters so the keyword is matched as plain text.
  const escaped = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const regex = new RegExp(escaped, "gi");

  const contexts: string[] = [];
  let match: RegExpExecArray | null;
  while ((match = regex.exec(text)) !== null) {
    const start = Math.max(0, match.index - contextWindow);
    // Use the matched text's own length rather than the keyword's.
    const end = Math.min(text.length, match.index + match[0].length + contextWindow);
    contexts.push(text.substring(start, end));
  }
  return contexts;
}
// Extract context for top keywords
const topKeywords = ["document", "analysis", "results"];
const keywordContexts: KeywordContext[] = [];
for (const kw of topKeywords) {
  keywordContexts.push({
    keyword: kw,
    contexts: extractKeywordContexts(result.content, kw, 40),
  });
}

// Show the hit count and the first two context windows per keyword.
for (const kc of keywordContexts) {
  console.log(`\n"${kc.keyword}" appears ${kc.contexts.length} times:`);
  const samples = kc.contexts.slice(0, 2);
  for (const [idx, ctx] of samples.entries()) {
    console.log(` [${idx + 1}] ...${ctx}...`);
  }
}
```
<!-- snippet:syntax-only --> - Keyword extraction without native YAKE/RAKE requires manual text processing.

View File

@@ -0,0 +1,84 @@
```typescript title="WASM - Basic Language Detection"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  languageDetection: { enabled: true, minConfidence: 0.75 },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Report the language fields found on the result.
const meta = result.metadata;
console.log(`Primary language: ${meta?.language}`);
console.log(`Language confidence: ${meta?.languageConfidence}`);
console.log(`Detected languages: ${result.detectedLanguages?.join(", ")}`);
```
```typescript title="WASM - Multi-Language Detection"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  languageDetection: {
    enabled: true,
    minConfidence: 0.6,
    detectMultiple: true, // Enable detection of multiple languages
  },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

const detected = result.detectedLanguages;
console.log(`Primary language: ${result.metadata?.language}`);
console.log(`Confidence score: ${result.metadata?.languageConfidence?.toFixed(3)}`);
console.log(`All detected languages: ${detected?.join(", ")}`);

// Use detected language for downstream processing
const isMultilingual = Boolean(detected && detected.length > 1);
if (isMultilingual) {
  console.log("Document contains multiple languages - enable multilingual NLP processing");
}
```
```typescript title="WASM - Language-Specific Extraction"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  languageDetection: {
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false,
  },
  // Adjust extraction parameters based on detected language
  quality: {
    enableQualityProcessing: true,
  },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/plain", config);

// Process result based on detected language
const language = result.metadata?.language;
console.log(`Processing document in language: ${language}`);

// Example: Apply language-specific rules
type LanguageRules = { cleanWhitespace: boolean; normalizeText: boolean };
const languageConfig: Record<string, LanguageRules> = {
  en: { cleanWhitespace: true, normalizeText: true },
  zh: { cleanWhitespace: false, normalizeText: true }, // Chinese: preserve whitespace patterns
  ja: { cleanWhitespace: false, normalizeText: false }, // Japanese: preserve as-is
  ar: { cleanWhitespace: true, normalizeText: true }, // Arabic
};

// Languages without an entry fall back to the default rules.
const fallbackRules: LanguageRules = { cleanWhitespace: true, normalizeText: true };
const langConfig = languageConfig[language as string] || fallbackRules;

console.log(`Language config: ${JSON.stringify(langConfig)}`);
```

View File

@@ -0,0 +1,171 @@
```typescript title="WASM - Detect and Process Multilingual Content"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  languageDetection: {
    enabled: true,
    minConfidence: 0.7,
    detectMultiple: true,
  },
  chunking: {
    maxChars: 800,
    chunkOverlap: 200,
  },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

interface MultilingualChunk {
  index: number;
  text: string;
  language: string;
  confidence: number;
}

// Script patterns checked in order; when several match, the LAST one wins.
const scriptHints: Array<[RegExp, string]> = [
  [/[一-鿿]/, "zh"], // Chinese
  [/[぀-ゟ゠-ヿ]/, "ja"], // Japanese
  [/[가-힯]/, "ko"], // Korean
  [/[؀-ۿ]/, "ar"], // Arabic
  [/[Ѐ-ӿ]/, "ru"], // Russian
];

// Detect language for each chunk
const multilingualChunks: MultilingualChunk[] = (result.chunks ?? []).map((chunk, idx) => {
  // Simple language detection based on character ranges
  const text = chunk.content;
  let detectedLang = result.metadata?.language || "en";
  const confidence = result.metadata?.languageConfidence || 0.5;

  // Check for specific character patterns
  for (const [pattern, lang] of scriptHints) {
    if (pattern.test(text)) {
      detectedLang = lang;
    }
  }

  return {
    index: idx,
    text: text.substring(0, 50),
    language: detectedLang,
    confidence: confidence,
  };
});

// Group chunks by language
const chunksByLanguage = new Map<string, MultilingualChunk[]>();
for (const chunk of multilingualChunks) {
  const bucket = chunksByLanguage.get(chunk.language);
  if (bucket) {
    bucket.push(chunk);
  } else {
    chunksByLanguage.set(chunk.language, [chunk]);
  }
}

console.log("Chunks by detected language:");
for (const [lang, chunks] of chunksByLanguage) {
  console.log(` ${lang}: ${chunks.length} chunks`);
  for (const c of chunks.slice(0, 2)) {
    console.log(` Chunk ${c.index}: "${c.text}..."`);
  }
}
```
```typescript title="WASM - Language-Specific Text Processing"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  languageDetection: {
    enabled: true,
    detectMultiple: true,
  },
  outputFormat: "markdown",
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/plain", config);

// Language-specific text normalization
interface LanguageProcessor {
  /** Returns the text with whitespace normalized for the language. */
  normalize: (text: string) => string;
  /** Splits text on the language's sentence-ending punctuation, dropping empty pieces. */
  splitSentences: (text: string) => string[];
}

// CJK sentence terminators: ideographic full stop (U+3002) plus the
// full-width exclamation (U+FF01) and question (U+FF1F) marks that Chinese
// and Japanese text uses, in addition to their ASCII forms. Written as
// escapes so the pattern survives re-encoding of this file.
const cjkSentenceEnd = /[\u3002\uFF01\uFF1F!?]+/;

const processors: Record<string, LanguageProcessor> = {
  en: {
    normalize: (t) => t.replace(/\s+/g, " ").trim(),
    splitSentences: (t) => t.split(/[.!?]+/).filter((s) => s.length > 0),
  },
  zh: {
    normalize: (t) => t.replace(/\s+/g, "").trim(), // CJK: no word spacing
    splitSentences: (t) => t.split(cjkSentenceEnd).filter((s) => s.length > 0),
  },
  ja: {
    normalize: (t) => t.replace(/\s+/g, "").trim(),
    splitSentences: (t) => t.split(cjkSentenceEnd).filter((s) => s.length > 0),
  },
  ar: {
    normalize: (t) => t.replace(/\s+/g, " ").trim(),
    splitSentences: (t) => t.split(/[.!?،؟]+/).filter((s) => s.length > 0),
  },
};

// Languages without a dedicated processor fall back to the English rules.
const language = result.metadata?.language || "en";
const processor = processors[language] || processors["en"];
const normalized = processor.normalize(result.content);
const sentences = processor.splitSentences(result.content);

console.log(`Language: ${language}`);
console.log(`Normalized length: ${normalized.length}`);
console.log(`Detected sentences: ${sentences.length}`);
sentences.slice(0, 3).forEach((sent, idx) => {
  console.log(` [${idx + 1}] ${sent.substring(0, 60)}...`);
});
```
```typescript title="WASM - Multilingual Chunking Strategy"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

interface LanguageChunkConfig {
  maxChars: number;
  overlap: number;
}

// Different chunking strategies per language
const chunkConfigByLanguage: Record<string, LanguageChunkConfig> = {
  en: { maxChars: 512, overlap: 128 }, // English: word-based chunking
  zh: { maxChars: 256, overlap: 64 }, // Chinese: smaller chunks due to character density
  ja: { maxChars: 300, overlap: 75 }, // Japanese: medium chunks
  ar: { maxChars: 400, overlap: 100 }, // Arabic: larger chunks for context
  default: { maxChars: 512, overlap: 128 },
};

const bytes = new Uint8Array(buffer);

// Pass 1: detect language first
const languageDetectConfig = {
  languageDetection: { enabled: true, minConfidence: 0.8 },
};
const langResult = await extractBytes(bytes, "text/plain", languageDetectConfig);
const detectedLang = langResult.metadata?.language || "en";

// Pass 2: re-extract with language-specific chunking
const chunkConfig = chunkConfigByLanguage[detectedLang] || chunkConfigByLanguage["default"];
const { maxChars, overlap } = chunkConfig;
const finalConfig = {
  languageDetection: { enabled: true },
  chunking: { maxChars, chunkOverlap: overlap },
};
const finalResult = await extractBytes(bytes, "text/plain", finalConfig);

console.log(`Language: ${detectedLang}`);
console.log(`Chunking strategy: maxChars=${chunkConfig.maxChars}, overlap=${chunkConfig.overlap}`);
console.log(`Generated ${finalResult.chunks?.length} chunks`);
```

View File

@@ -0,0 +1,37 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts a list of files in fixed-size batches. Files within a batch are
 * extracted concurrently; batches run one after another with a short pause
 * in between.
 *
 * @param files - Files to extract; each file's own `type` is used as its MIME type.
 * @returns Extraction results in the same order as `files`.
 */
async function processLargeDocumentSet(files: File[]) {
  await initWasm();

  const BATCH_SIZE = 5;
  const results: Array<Awaited<ReturnType<typeof extractBytes>>> = [];

  for (let i = 0; i < files.length; i += BATCH_SIZE) {
    const batch = files.slice(i, i + BATCH_SIZE);
    console.log(`Processing batch ${Math.floor(i / BATCH_SIZE) + 1}`);

    const batchResults = await Promise.all(
      batch.map(async (file) => {
        const arrayBuffer = await file.arrayBuffer();
        const bytes = new Uint8Array(arrayBuffer);
        return extractBytes(bytes, file.type);
      }),
    );
    results.push(...batchResults);

    // `gc` is only present when the host exposes it. Looking it up on
    // `globalThis` avoids a ReferenceError in browsers, where the Node-only
    // `global` binding does not exist.
    const collectGarbage: unknown = Reflect.get(globalThis, "gc");
    if (typeof collectGarbage === "function") {
      console.log("Running garbage collection");
      collectGarbage();
    }

    // Brief pause between batches.
    await new Promise((resolve) => setTimeout(resolve, 100));
  }

  return results;
}
// Example invocation with an empty file list.
processLargeDocumentSet([])
  .then(() => {
    console.log("Done");
  })
  .catch(console.error);

View File

@@ -0,0 +1,43 @@
import { detectRuntime, extractBytes, hasWorkers, initWasm } from "@kreuzberg/wasm";
/**
 * Extracts several documents concurrently.
 *
 * In a browser that reports worker support the work goes to
 * `extractWithWebWorkers`; otherwise every document is extracted on the
 * current thread via `Promise.all`.
 *
 * @param documents - Raw bytes of each document.
 * @param mimeTypes - MIME type for the document at the same index.
 */
async function extractInParallel(documents: Uint8Array[], mimeTypes: string[]) {
  await initWasm();

  // The detected runtime is not consulted below.
  const _runtime = detectRuntime();
  const workersSupported = hasWorkers();
  const useWorkerPool = isBrowser() && workersSupported;

  if (!useWorkerPool) {
    const pending = documents.map((bytes, index) => extractBytes(bytes, mimeTypes[index]));
    return Promise.all(pending);
  }
  return extractWithWebWorkers(documents, mimeTypes);
}
/**
 * Distributes documents round-robin over a pool of `extraction-worker.js`
 * workers and resolves with their replies in document order.
 *
 * Each worker handles its documents one at a time, so a reply can always be
 * attributed to the message that triggered it. The pool is terminated once
 * every document has settled or any of them has failed.
 *
 * @param documents - Raw bytes of each document.
 * @param mimeTypes - MIME type for the document at the same index.
 * @returns Promise of the `data` payload of each worker reply, in input order.
 */
function extractWithWebWorkers(documents: Uint8Array[], mimeTypes: string[]) {
  if (documents.length === 0) {
    return Promise.resolve<unknown[]>([]);
  }

  // `||` on purpose: a missing, zero or NaN core count falls back to 2.
  const hardwareThreads = navigator.hardwareConcurrency || 2;
  // Never spawn more workers than there are documents.
  const workerCount = Math.max(1, Math.min(hardwareThreads, documents.length));
  const workers: Worker[] = Array.from(
    { length: workerCount },
    () => new Worker("extraction-worker.js"),
  );

  // Tail of each worker's queue: the next job for that worker starts only
  // after the previous one has settled. Sharing a worker between in-flight
  // jobs would overwrite `onmessage` and leave earlier jobs pending forever.
  const queueTails: Promise<unknown>[] = workers.map(() => Promise.resolve());

  const jobs = documents.map((bytes, index) => {
    const slot = index % workers.length;
    const worker = workers[slot];
    const runJob = () =>
      new Promise<unknown>((resolve, reject) => {
        worker.onmessage = (e) => resolve(e.data);
        worker.onerror = reject;
        worker.postMessage({ bytes, mimeType: mimeTypes[index] });
      });
    const job = queueTails[slot].then(runJob);
    // The queue itself must keep going even if this job fails; the failure
    // is still reported through `job`.
    queueTails[slot] = job.catch(() => undefined);
    return job;
  });

  return Promise.all(jobs).finally(() => {
    // Release the worker threads; nothing outside this function holds them.
    for (const worker of workers) {
      worker.terminate();
    }
  });
}
/** Reports whether a global `window` binding exists. */
function isBrowser() {
  const windowKind = typeof window;
  return !(windowKind === "undefined");
}
// Example invocation with a single three-byte placeholder document.
const sampleDocuments = [new Uint8Array([1, 2, 3])];
extractInParallel(sampleDocuments, ["application/pdf"])
  .then((results) => {
    console.log(results);
  })
  .catch(console.error);

View File

@@ -0,0 +1,103 @@
```typescript title="WASM - Enable Quality Processing"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  enableQualityProcessing: true,
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

console.log(`Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
console.log(`Content: ${result.content.substring(0, 100)}...`);

// Quality score indicates text extraction quality (0.0-1.0)
// Compare against null/undefined explicitly: a truthiness test would treat
// a score of exactly 0 as "no score" and skip the warning.
const qualityScore = result.qualityScore;
if (qualityScore != null && qualityScore < 0.5) {
  console.warn("Low quality extraction detected - consider OCR or alternative processing");
}
```
```typescript title="WASM - Quality Monitoring"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

/** Summary of one extraction's size and quality rating. */
interface ExtractionQuality {
  contentLength: number;
  /** Score reported on the result, or null when none was reported. */
  qualityScore: number | null;
  /** "high" (> 0.8), "medium" (> 0.5), "low", or "unknown" when there is no score. */
  assessedAs: string;
}

const config = {
  enableQualityProcessing: true,
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// `??` rather than `||`: a score of exactly 0 is a real (very low) score and
// must not be reported as missing.
const qualityScore = result.qualityScore ?? null;

let assessedAs = "unknown";
if (qualityScore !== null) {
  assessedAs = qualityScore > 0.8 ? "high" : qualityScore > 0.5 ? "medium" : "low";
}

const quality: ExtractionQuality = {
  contentLength: result.content.length,
  qualityScore,
  assessedAs,
};

console.log("Extraction Quality Report:");
console.log(` Content size: ${quality.contentLength} bytes`);
console.log(` Quality score: ${quality.qualityScore?.toFixed(3) ?? "N/A"}`);
console.log(` Assessment: ${quality.assessedAs}`);
if (quality.assessedAs === "low") {
  console.log(
    " Recommendation: Review raw text for encoding issues or consider alternative extraction",
  );
}
```
```typescript title="WASM - Quality with OCR Fallback"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
/**
 * Extracts a document with quality processing enabled and labels the result
 * by its quality score.
 *
 * @param bytes - Raw document bytes.
 * @param mimeType - MIME type passed to `extractBytes`.
 * @returns The extracted content, the reported quality score (`null` when the
 *   result carries none) and a method label: "degraded-text-extraction" for
 *   scores below 0.5, "text-extraction" otherwise.
 */
async function extractWithQualityCheck(
  bytes: Uint8Array,
  mimeType: string,
): Promise<{ content: string; quality: number | null; method: string }> {
  const config = {
    enableQualityProcessing: true,
  };
  const result = await extractBytes(bytes, mimeType, config);

  // Keep "no score reported" distinct from "score is 0": coercing a missing
  // score to 0 would flag the extraction as degraded without any evidence.
  const qualityScore = result.qualityScore ?? null;
  const isLowQuality = qualityScore !== null && qualityScore < 0.5;

  // If quality is low, consider text extraction failed or use OCR
  if (isLowQuality) {
    console.warn("Low quality text extraction - alternative processing recommended");
  }

  return {
    content: result.content,
    quality: qualityScore,
    method: isLowQuality ? "degraded-text-extraction" : "text-extraction",
  };
}
// Run the quality-checked extraction and print a short report.
const bytes = new Uint8Array(buffer);
const extracted = await extractWithQualityCheck(bytes, "application/pdf");

const { method, quality, content } = extracted;
console.log(`Extraction method: ${method}`);
console.log(`Quality score: ${quality?.toFixed(3)}`);
console.log(`Content preview: ${content.substring(0, 80)}...`);
```

View File

@@ -0,0 +1,161 @@
```typescript title="WASM - Assess Text Quality"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
/** Heuristic statistics describing a piece of extracted text. */
interface TextQualityMetrics {
/** Length of the content string. */
contentLength: number;
/** Number of line segments the content was split into. */
lineCount: number;
/** Mean length of the lines that are not blank; 0 when every line is blank. */
averageLineLength: number;
/** Fraction of the split lines that are blank after trimming. */
emptyLineRatio: number;
/** Share of the content made up of characters counted as "special". */
specialCharRatio: number;
/** Language codes guessed from the character ranges present in the content. */
estimatedLanguages: string[];
}
/**
 * Computes simple, heuristic quality statistics for extracted text.
 *
 * @param content - Extracted text to analyse (may be empty).
 * @returns Line statistics, the share of "special" characters (anything that
 *   is not a letter, digit, underscore, whitespace or basic punctuation) and
 *   language codes guessed from the scripts present. Every ratio is a finite
 *   number, including for empty input.
 */
function assessTextQuality(content: string): TextQualityMetrics {
  // Split on single line breaks: splitting on runs of "\n" would merge
  // consecutive breaks and hide the blank lines emptyLineRatio is meant to count.
  const lines = content.split(/\r?\n/);
  const nonEmptyLines = lines.filter((l) => l.trim().length > 0);
  const totalChars = content.length;

  // Unicode-aware letter/digit classes, so CJK and accented text is not
  // counted as "special". Match lengths are summed so the count stays in the
  // same UTF-16 units as `content.length`.
  const specialMatches = content.match(/[^\p{L}\p{N}_\s.,:;!?]/gu) ?? [];
  const specialChars = specialMatches.reduce((sum, m) => sum + m.length, 0);

  // Simple language detection by character patterns. Ranges are written as
  // escapes so they survive re-encoding of this file.
  const detectedLangs: string[] = [];
  if (/[a-zA-Z]/.test(content)) detectedLangs.push("en");
  if (/[\u4E00-\u9FFF]/.test(content)) detectedLangs.push("zh"); // CJK ideographs
  if (/[\u3040-\u309F\u30A0-\u30FF]/.test(content)) detectedLangs.push("ja"); // kana
  if (/[\uAC00-\uD7AF]/.test(content)) detectedLangs.push("ko"); // Hangul syllables

  const nonEmptyChars = nonEmptyLines.reduce((sum, l) => sum + l.length, 0);

  return {
    contentLength: totalChars,
    lineCount: lines.length,
    averageLineLength: nonEmptyLines.length > 0 ? nonEmptyChars / nonEmptyLines.length : 0,
    // `split` always yields at least one element, so this never divides by zero.
    emptyLineRatio: (lines.length - nonEmptyLines.length) / lines.length,
    // Empty content would otherwise produce 0 / 0 = NaN.
    specialCharRatio: totalChars > 0 ? specialChars / totalChars : 0,
    estimatedLanguages: detectedLangs,
  };
}
const config = {
  enableQualityProcessing: true,
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
const metrics = assessTextQuality(result.content);

// Print the heuristic metrics next to the score reported on the result.
const percent = (ratio: number, digits: number) => (ratio * 100).toFixed(digits);
const languageList = metrics.estimatedLanguages.join(", ") || "unknown";

console.log("Text Quality Assessment:");
console.log(` Length: ${metrics.contentLength} characters`);
console.log(
  ` Lines: ${metrics.lineCount} total, avg ${metrics.averageLineLength.toFixed(1)} chars/line`,
);
console.log(` Empty lines: ${percent(metrics.emptyLineRatio, 1)}%`);
console.log(` Special chars: ${percent(metrics.specialCharRatio, 2)}%`);
console.log(` Languages: ${languageList}`);
console.log(` Kreuzberg quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
```
```typescript title="WASM - Quality-Based Content Filtering"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = {
  enableQualityProcessing: true,
  chunking: { maxChars: 1000, chunkOverlap: 200, trim: true },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

interface QualityFilteredChunk {
  index: number;
  content: string;
  quality: number;
  kept: boolean;
}

// Filter chunks based on quality heuristics
const qualityThreshold = 0.3; // Min ratio of visible/non-whitespace content

const filteredChunks: QualityFilteredChunk[] = [];
(result.chunks ?? []).forEach((chunk, idx) => {
  const body = chunk.content;
  const nonWhitespaceRatio = body.replace(/\s/g, "").length / body.length;
  const numberBonus = /\d/.test(body) ? 0.2 : 0;
  const punctuationBonus = /[.!?,;:]/g.test(body) ? 0.1 : 0;

  // Quality score based on content characteristics
  const contentQuality = (nonWhitespaceRatio + numberBonus + punctuationBonus) / 2;

  filteredChunks.push({
    index: idx,
    content: body.substring(0, 50),
    quality: contentQuality,
    kept: contentQuality >= qualityThreshold,
  });
});

const keptChunks = filteredChunks.filter((c) => c.kept);
console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`);
for (const c of keptChunks.slice(0, 3)) {
  console.log(` Chunk ${c.index}: quality=${c.quality.toFixed(2)}, "${c.content}..."`);
}
```
```typescript title="WASM - Content Encoding Validation"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

const config = { enableQualityProcessing: true };

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Validate text encoding quality
/** Outcome of scanning extracted text for encoding damage. */
interface EncodingValidation {
  hasInvalidChars: boolean;
  invalidCharCount: number;
  replacementRatio: number;
  estimatedEncoding: string;
}
/**
 * Scans text for Unicode replacement characters (U+FFFD), which decoders
 * insert where the input bytes could not be decoded.
 *
 * @param content - Extracted text to inspect.
 * @returns The number of replacement characters, their share of the content
 *   (0 when there are none) and a coarse encoding guess: "UTF-8" if any
 *   non-ASCII character is present, "ASCII" otherwise.
 */
function validateEncoding(content: string): EncodingValidation {
  // Check for replacement characters (U+FFFD). The escape is used instead of
  // the literal character so the pattern cannot be damaged by re-encoding
  // this file.
  const replacementChars = (content.match(/\uFFFD/g) ?? []).length;
  const hasInvalidChars = replacementChars > 0;
  // Guarded by hasInvalidChars, so content.length is non-zero here.
  const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0;

  // Guess encoding based on content patterns
  const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII";

  return {
    hasInvalidChars,
    invalidCharCount: replacementChars,
    replacementRatio,
    estimatedEncoding,
  };
}
const validation = validateEncoding(result.content);

// Print the validation outcome next to the score reported on the result.
const encodingStatus = validation.hasInvalidChars
  ? "DEGRADED - encoding issues detected"
  : "OK";
const replacementPercent = (validation.replacementRatio * 100).toFixed(4);

console.log("Content Encoding Validation:");
console.log(` Estimated encoding: ${validation.estimatedEncoding}`);
console.log(` Invalid characters: ${validation.invalidCharCount}`);
console.log(` Replacement ratio: ${replacementPercent}%`);
console.log(` Status: ${encodingStatus}`);
console.log(` Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
```

View File

@@ -0,0 +1,39 @@
import { extractBytes, initWasm } from "@kreuzberg/wasm";
/**
 * Downloads a document through the response body stream, logging progress as
 * data arrives, then extracts the assembled bytes.
 *
 * @param url - Location of the document.
 * @param mimeType - MIME type passed to `extractBytes`; defaults to PDF.
 * @returns The extraction result.
 * @throws Error when the HTTP status is not OK or the response has no body.
 */
async function extractStreamingDocument(url: string, mimeType: string = "application/pdf") {
  await initWasm();

  const response = await fetch(url);
  if (!response.ok) throw new Error(`HTTP ${response.status}`);

  const reader = response.body?.getReader();
  if (!reader) throw new Error("No response body");

  const chunks: Uint8Array[] = [];
  let totalSize = 0;
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      chunks.push(value);
      totalSize += value.length;
      console.log(`Received ${totalSize} bytes`);
    }
  } finally {
    // Release the stream lock whether the download finished or failed.
    reader.releaseLock();
  }

  // Stitch the received pieces into one contiguous buffer.
  const fullBuffer = new Uint8Array(totalSize);
  let offset = 0;
  for (const chunk of chunks) {
    fullBuffer.set(chunk, offset);
    offset += chunk.length;
  }

  console.log("Document fully received, extracting...");
  const result = await extractBytes(fullBuffer, mimeType);
  return result;
}
// Example invocation against a placeholder URL.
const sampleUrl = "https://example.com/document.pdf";
extractStreamingDocument(sampleUrl)
  .then((r) => {
    console.log(r);
  })
  .catch(console.error);

View File

@@ -0,0 +1,75 @@
```typescript title="WASM - Token Reduction Configuration"
import init, { extractBytes } from "kreuzberg-wasm";
await init();

// Token reduction modes:
// - "aggressive": maximum reduction
// - "balanced": moderate reduction
// - "conservative": minimal reduction
const config = {
  tokenReduction: { mode: "aggressive", preserveImportantWords: true },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

const extractedText = result.content;
console.log(`Original content: ${extractedText.length} characters`);
console.log(`Preview: ${extractedText.substring(0, 100)}...`);
```
```typescript title="WASM - Token Reduction Modes"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
/** Outcome of one extraction run under a given token reduction mode. */
interface TokenReductionResult {
/** Name of the token reduction mode the run used. */
mode: string;
/** Size used as the denominator when computing `reductionRatio`. */
originalSize: number;
/** Size used as the numerator when computing `reductionRatio`. */
reducedSize: number;
/** Computed as `1 - reducedSize / originalSize`. */
reductionRatio: number;
/** Leading slice of the content extracted under this mode. */
preview: string;
}
/**
 * Extracts the same PDF once per token reduction mode and reports how much
 * shorter each mode's content is than a baseline extraction.
 *
 * Sizes are measured in characters on both sides, so `reductionRatio` is the
 * fraction of characters removed relative to the baseline.
 *
 * @param bytes - Raw PDF bytes.
 * @returns One entry per mode, in the order conservative, balanced, aggressive.
 */
async function compareTokenReductionModes(bytes: Uint8Array): Promise<TokenReductionResult[]> {
  const modes = ["conservative", "balanced", "aggressive"];

  // Baseline for the comparison.
  // NOTE(review): assumes a config without `tokenReduction` returns the
  // unreduced text - confirm against the library defaults.
  const baseline = await extractBytes(bytes, "application/pdf", {});
  const originalSize = baseline.content.length;

  const results: TokenReductionResult[] = [];
  for (const mode of modes) {
    const config = {
      tokenReduction: {
        mode,
        preserveImportantWords: true,
      },
    };
    const result = await extractBytes(bytes, "application/pdf", config);
    const reducedSize = result.content.length;
    results.push({
      mode,
      originalSize,
      reducedSize,
      // An empty baseline would otherwise give 0 / 0 = NaN.
      reductionRatio: originalSize > 0 ? 1 - reducedSize / originalSize : 0,
      preview: result.content.substring(0, 80),
    });
  }
  return results;
}
// `buffer` is not defined in this snippet; the surrounding code must supply the document data.
const bytes = new Uint8Array(buffer);
const modeComparison = await compareTokenReductionModes(bytes);

// Print one three-line entry per token-reduction mode.
console.log("Token Reduction Mode Comparison:");
for (const { mode, originalSize, reductionRatio } of modeComparison) {
  const percent = (reductionRatio * 100).toFixed(1);
  console.log(` ${mode}:`);
  console.log(` Original: ${originalSize} chars`);
  console.log(` Reduction: ${percent}%`);
}
```

View File

@@ -0,0 +1,149 @@
```typescript title="WASM - Token Counting and Cost Estimation"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
// Extraction settings: "balanced" token reduction with important words preserved.
const tokenReduction = {
  mode: "balanced",
  preserveImportantWords: true,
};
const config = { tokenReduction };

// `buffer` is not defined in this snippet; the surrounding code must supply the document data.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
/**
 * Approximates how many tokens `text` contains, using the rule of thumb that
 * one token is roughly four characters. The result is rounded up.
 *
 * @param text - Text to measure.
 * @returns Estimated token count; 0 for an empty string.
 */
function estimateTokenCount(text: string): number {
  const charsPerToken = 4;
  const characterCount = text.length;
  return Math.ceil(characterCount / charsPerToken);
}
/** Cost breakdown for sending the extracted text to an LLM (example pricing: GPT-4 Turbo). */
interface PricingEstimate {
  /** Estimated number of tokens in the extracted content. */
  tokenCount: number;
  /** Cost of the input tokens. */
  inputCost: number;
  /** Estimated cost of the generated output. */
  outputCostEstimate: number;
  /** Sum of the input cost and the estimated output cost. */
  totalEstimate: number;
}

const inputPricePerToken = 0.00001; // $0.01/1K tokens
const outputPricePerToken = 0.00003; // $0.03/1K tokens
const outputToInputRatio = 0.5; // Assume output is ~50% of input

const tokenCount = estimateTokenCount(result.content);
const inputCost = tokenCount * inputPricePerToken;
const outputCostEstimate = tokenCount * outputPricePerToken * outputToInputRatio;
const costEstimate: PricingEstimate = {
  tokenCount,
  inputCost,
  outputCostEstimate,
  totalEstimate: inputCost + outputCostEstimate,
};

// Print the report line by line; dollar amounts are shown with six decimals.
const reportLines = [
  "Token and Cost Analysis:",
  ` Estimated tokens: ${costEstimate.tokenCount}`,
  ` Input cost: $${costEstimate.inputCost.toFixed(6)}`,
  ` Output cost (est.): $${costEstimate.outputCostEstimate.toFixed(6)}`,
  ` Total cost (est.): $${costEstimate.totalEstimate.toFixed(6)}`,
];
for (const line of reportLines) {
  console.log(line);
}
```
```typescript title="WASM - Token Reduction for Context Windows"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
/** Result of comparing a text's estimated token count against a context window. */
interface ContextWindowFit {
  /** Length of the content in characters. */
  contentLength: number;
  /** Estimated token count (characters / 4, rounded up). */
  estimatedTokens: number;
  /** True when the estimated tokens do not exceed the window size. */
  fitsInWindow: boolean;
  /** `estimatedTokens / contextWindowSize`; 1 means the window is exactly full. */
  utilization: number;
}

/**
 * Estimates whether `content` fits into a model context window.
 *
 * Tokens are approximated as one per four characters, rounded up. Content
 * whose estimate equals the window size exactly is reported as fitting
 * (utilization 1).
 *
 * @param content - Text to measure.
 * @param contextWindowSize - Window capacity in tokens; defaults to 4096.
 * @returns The measurements. For a window size that is not a positive number,
 *   nothing fits and utilization is reported as `Infinity`.
 */
function checkContextWindowFit(
  content: string,
  contextWindowSize: number = 4096,
): ContextWindowFit {
  const estimatedTokens = Math.ceil(content.length / 4);
  // Guards zero, negative and NaN sizes, which would otherwise give NaN or negative utilization.
  const hasCapacity = contextWindowSize > 0;
  return {
    contentLength: content.length,
    estimatedTokens,
    // `<=`: a text that fills the window exactly still fits.
    fitsInWindow: hasCapacity && estimatedTokens <= contextWindowSize,
    utilization: hasCapacity ? estimatedTokens / contextWindowSize : Infinity,
  };
}
const tokenReduction = {
  mode: "aggressive", // Use aggressive mode for large documents
  preserveImportantWords: true,
};
const config = { tokenReduction };

// `buffer` is not defined in this snippet; the surrounding code must supply the document data.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Compare the extracted text against a 4096-token window and print the findings.
const contextFit = checkContextWindowFit(result.content, 4096);
const { contentLength, estimatedTokens, fitsInWindow, utilization } = contextFit;
const verdict = fitsInWindow ? "YES" : "NO";
const utilizationPercent = (utilization * 100).toFixed(1);

console.log("Context Window Analysis:");
console.log(` Content: ${contentLength} characters`);
console.log(` Tokens (est.): ${estimatedTokens}`);
console.log(` Fits in 4K context: ${verdict}`);
console.log(` Utilization: ${utilizationPercent}%`);
if (!fitsInWindow) {
  console.log(" Note: Consider chunking or more aggressive token reduction");
}
```
```typescript title="WASM - Selective Token Preservation"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
// Extraction settings: "balanced" token reduction with important words preserved.
const tokenReduction = {
  mode: "balanced",
  preserveImportantWords: true,
};
const config = { tokenReduction };

// `buffer` is not defined in this snippet; the surrounding code must supply the document data.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
/** A frequently occurring word together with the score used to rank it. */
interface ImportantTerm {
  /** Lower-cased word with punctuation and symbols removed. */
  term: string;
  /** Number of times the cleaned word occurs in the content. */
  frequency: number;
  /** Ranking score: `Math.log(frequency) * (term.length / 10)`. */
  importance: number;
}

/**
 * Extracts important terms manually: counts the longer words in `content`
 * and ranks those that occur often enough.
 *
 * Words are lower-cased, split on whitespace and stripped of everything that
 * is not a letter, combining mark, digit or underscore. The stripping is
 * Unicode-aware, so accented and non-Latin words are kept intact; for
 * ASCII-only text it removes exactly what `/[^\w]/g` removes.
 *
 * @param content - Text to analyse.
 * @param threshold - Minimum number of occurrences for a word to be reported; defaults to 3.
 * @returns At most 20 terms, highest importance first.
 */
function extractImportantTerms(content: string, threshold: number = 3): ImportantTerm[] {
  const nonWordChars = /[^\p{L}\p{M}\p{N}_]/gu;
  const frequencyMap = new Map<string, number>();

  for (const word of content.toLowerCase().split(/\s+/)) {
    const cleaned = word.replace(nonWordChars, "");
    if (cleaned.length > 5) {
      // Only consider longer words
      frequencyMap.set(cleaned, (frequencyMap.get(cleaned) ?? 0) + 1);
    }
  }

  return Array.from(frequencyMap.entries())
    .filter(([, freq]) => freq >= threshold)
    .map(([term, freq]) => ({
      term,
      frequency: freq,
      importance: Math.log(freq) * (term.length / 10),
    }))
    .sort((a, b) => b.importance - a.importance)
    .slice(0, 20);
}
// Rank the terms of the extracted content and print one line per term.
const importantTerms = extractImportantTerms(result.content);
console.log("Important Terms (likely preserved by token reduction):");
for (const { term, frequency, importance } of importantTerms) {
  const score = importance.toFixed(2);
  console.log(` "${term}": ${frequency} occurrences (importance: ${score})`);
}
```

View File

@@ -0,0 +1,57 @@
/** A pending extraction request together with the callbacks that settle its promise. */
interface ExtractionTask {
  bytes: Uint8Array;
  mimeType: string;
  resolve: (value: unknown) => void;
  reject: (error: unknown) => void;
}

/**
 * Fixed-size pool of Web Workers that run document extractions.
 *
 * Each worker handles one task at a time. A task waits in a FIFO queue until
 * a worker is idle, and its promise is settled by the reply of the worker it
 * was dispatched to.
 *
 * NOTE(review): assumes the worker script ("extraction-worker.js") answers
 * every posted message with exactly one message — confirm against that script.
 */
class ExtractionWorker {
  /** Every worker owned by the pool, busy or idle. */
  private workerPool: Worker[] = [];
  /** Tasks that have not been handed to a worker yet, oldest first. */
  private taskQueue: ExtractionTask[] = [];
  /** Workers that currently have no task. */
  private idleWorkers: Worker[] = [];
  /** The task each busy worker is processing. */
  private inFlight = new Map<Worker, ExtractionTask>();
  private terminated = false;

  /**
   * @param workerCount - Number of workers to start. Defaults to
   *   `navigator.hardwareConcurrency` when available, otherwise 2. Values
   *   below 1 (or NaN) are raised to 1 so that queued tasks can always run.
   */
  constructor(workerCount?: number) {
    const requested =
      workerCount ?? (typeof navigator !== "undefined" ? (navigator.hardwareConcurrency ?? 2) : 2);
    const count = Math.max(1, Math.floor(requested) || 1);
    for (let i = 0; i < count; i++) {
      const worker = new Worker("extraction-worker.js");
      worker.onmessage = (e) => this.handleWorkerMessage(worker, e.data);
      worker.onerror = (e) => this.handleWorkerError(worker, e);
      this.workerPool.push(worker);
      this.idleWorkers.push(worker);
    }
  }

  /**
   * Queues an extraction and dispatches it as soon as a worker is idle.
   *
   * @param bytes - Document data posted to the worker.
   * @param mimeType - MIME type posted alongside the data.
   * @returns A promise resolved with the data of the worker's reply message.
   *   It rejects with the worker's error event, with the error thrown by
   *   `postMessage`, or with an `Error` when the pool is (or gets) terminated.
   */
  extract(bytes: Uint8Array, mimeType: string): Promise<unknown> {
    return new Promise((resolve, reject) => {
      if (this.terminated) {
        reject(new Error("ExtractionWorker has been terminated"));
        return;
      }
      this.taskQueue.push({ bytes, mimeType, resolve, reject });
      this.processTasks();
    });
  }

  /** Hands queued tasks to idle workers until either runs out. */
  private processTasks(): void {
    while (this.taskQueue.length > 0) {
      const worker = this.idleWorkers.shift();
      if (worker === undefined) return; // every worker is busy; the task stays queued
      const task = this.taskQueue.shift();
      if (task === undefined) {
        this.idleWorkers.push(worker);
        return;
      }
      // Remember which task this worker owns so its reply settles the right promise.
      this.inFlight.set(worker, task);
      try {
        worker.postMessage({ bytes: task.bytes, mimeType: task.mimeType });
      } catch (error: unknown) {
        // The message could not be sent: free the worker and fail only this task.
        this.inFlight.delete(worker);
        this.idleWorkers.push(worker);
        task.reject(error);
      }
    }
  }

  /** Detaches and returns the task `worker` was running, marking the worker idle again. */
  private release(worker: Worker): ExtractionTask | undefined {
    const task = this.inFlight.get(worker);
    if (task === undefined) return undefined; // stray event from a worker without a task
    this.inFlight.delete(worker);
    this.idleWorkers.push(worker);
    return task;
  }

  /** Resolves the task owned by `worker` with its reply, then dispatches further work. */
  private handleWorkerMessage(worker: Worker, result: unknown): void {
    const task = this.release(worker);
    if (task) {
      task.resolve(result);
      this.processTasks();
    }
  }

  /** Rejects the task owned by `worker` with the reported error, then dispatches further work. */
  private handleWorkerError(worker: Worker, error: unknown): void {
    const task = this.release(worker);
    if (task) {
      task.reject(error);
      this.processTasks();
    }
  }

  /**
   * Terminates every worker and rejects all tasks that are still queued or
   * running, so no caller is left waiting on a promise that cannot settle.
   * Later `extract` calls reject immediately.
   */
  terminate(): void {
    this.terminated = true;
    this.workerPool.forEach((w) => w.terminate());
    const abandoned = [...Array.from(this.inFlight.values()), ...this.taskQueue];
    this.inFlight.clear();
    this.taskQueue = [];
    this.idleWorkers = [];
    const reason = new Error("ExtractionWorker terminated before the task completed");
    abandoned.forEach((task) => task.reject(reason));
  }
}