```typescript title="WASM - Assess Text Quality" import init, { extractBytes } from "kreuzberg-wasm"; await init(); interface TextQualityMetrics { contentLength: number; lineCount: number; averageLineLength: number; emptyLineRatio: number; specialCharRatio: number; estimatedLanguages: string[]; } function assessTextQuality(content: string): TextQualityMetrics { const lines = content.split(/\n+/); const nonEmptyLines = lines.filter((l) => l.trim().length > 0); const totalChars = content.length; const specialChars = (content.match(/[^\w\s.,:;!?\n]/g) || []).length; // Simple language detection by character patterns const detectedLangs: string[] = []; if (/[a-zA-Z]/.test(content)) detectedLangs.push("en"); if (/[一-鿿]/.test(content)) detectedLangs.push("zh"); if (/[぀-ゟ゠-ヿ]/.test(content)) detectedLangs.push("ja"); if (/[가-힯]/.test(content)) detectedLangs.push("ko"); return { contentLength: totalChars, lineCount: lines.length, averageLineLength: nonEmptyLines.length > 0 ? nonEmptyLines.reduce((sum, l) => sum + l.length, 0) / nonEmptyLines.length : 0, emptyLineRatio: (lines.length - nonEmptyLines.length) / lines.length, specialCharRatio: specialChars / totalChars, estimatedLanguages: detectedLangs, }; } const config = { enableQualityProcessing: true, }; const bytes = new Uint8Array(buffer); const result = await extractBytes(bytes, "application/pdf", config); const metrics = assessTextQuality(result.content); console.log("Text Quality Assessment:"); console.log(` Length: ${metrics.contentLength} characters`); console.log( ` Lines: ${metrics.lineCount} total, avg ${metrics.averageLineLength.toFixed(1)} chars/line`, ); console.log(` Empty lines: ${(metrics.emptyLineRatio * 100).toFixed(1)}%`); console.log(` Special chars: ${(metrics.specialCharRatio * 100).toFixed(2)}%`); console.log(` Languages: ${metrics.estimatedLanguages.join(", ") || "unknown"}`); console.log(` Kreuzberg quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`); ``` ```typescript title="WASM - Quality-Based Content Filtering" import init, { extractBytes } from "kreuzberg-wasm"; await init(); const config = { enableQualityProcessing: true, chunking: { maxChars: 1000, chunkOverlap: 200, trim: true, }, }; const bytes = new Uint8Array(buffer); const result = await extractBytes(bytes, "application/pdf", config); interface QualityFilteredChunk { index: number; content: string; quality: number; kept: boolean; } // Filter chunks based on quality heuristics const qualityThreshold = 0.3; // Min ratio of visible/non-whitespace content const filteredChunks: QualityFilteredChunk[] = result.chunks?.map((chunk, idx) => { const nonWhitespaceRatio = chunk.content.replace(/\s/g, "").length / chunk.content.length; const hasNumbers = /\d/.test(chunk.content); const hasPunctuation = /[.!?,;:]/g.test(chunk.content); // Quality score based on content characteristics const contentQuality = (nonWhitespaceRatio + (hasNumbers ? 0.2 : 0) + (hasPunctuation ? 0.1 : 0)) / 2; const kept = contentQuality >= qualityThreshold; return { index: idx, content: chunk.content.substring(0, 50), quality: contentQuality, kept, }; }) || []; const keptChunks = filteredChunks.filter((c) => c.kept); console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`); keptChunks.slice(0, 3).forEach((c) => { console.log(` Chunk ${c.index}: quality=${c.quality.toFixed(2)}, "${c.content}..."`); }); ``` ```typescript title="WASM - Content Encoding Validation" import init, { extractBytes } from "kreuzberg-wasm"; await init(); const config = { enableQualityProcessing: true, }; const bytes = new Uint8Array(buffer); const result = await extractBytes(bytes, "application/pdf", config); // Validate text encoding quality interface EncodingValidation { hasInvalidChars: boolean; invalidCharCount: number; replacementRatio: number; estimatedEncoding: string; } function validateEncoding(content: string): EncodingValidation { // Check for replacement characters (U+FFFD) const replacementChars = (content.match(/�/g) || []).length; const hasInvalidChars = replacementChars > 0; const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0; // Guess encoding based on content patterns const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII"; return { hasInvalidChars, invalidCharCount: replacementChars, replacementRatio, estimatedEncoding, }; } const validation = validateEncoding(result.content); console.log("Content Encoding Validation:"); console.log(` Estimated encoding: ${validation.estimatedEncoding}`); console.log(` Invalid characters: ${validation.invalidCharCount}`); console.log(` Replacement ratio: ${(validation.replacementRatio * 100).toFixed(4)}%`); console.log( ` Status: ${validation.hasInvalidChars ? "DEGRADED - encoding issues detected" : "OK"}`, ); console.log(` Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`); ```