Files
fil/docs/snippets/wasm/advanced/quality_processing_example.md

162 lines
5.0 KiB
Markdown
Raw Normal View History

2026-06-01 23:40:55 +02:00
```typescript title="WASM - Assess Text Quality"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
interface TextQualityMetrics {
contentLength: number;
lineCount: number;
averageLineLength: number;
emptyLineRatio: number;
specialCharRatio: number;
estimatedLanguages: string[];
}
function assessTextQuality(content: string): TextQualityMetrics {
const lines = content.split(/\n+/);
const nonEmptyLines = lines.filter((l) => l.trim().length > 0);
const totalChars = content.length;
const specialChars = (content.match(/[^\w\s.,:;!?\n]/g) || []).length;
// Simple language detection by character patterns
const detectedLangs: string[] = [];
if (/[a-zA-Z]/.test(content)) detectedLangs.push("en");
if (/[一-鿿]/.test(content)) detectedLangs.push("zh");
if (/[぀-ゟ゠-ヿ]/.test(content)) detectedLangs.push("ja");
if (/[가-힯]/.test(content)) detectedLangs.push("ko");
return {
contentLength: totalChars,
lineCount: lines.length,
averageLineLength:
nonEmptyLines.length > 0
? nonEmptyLines.reduce((sum, l) => sum + l.length, 0) / nonEmptyLines.length
: 0,
emptyLineRatio: (lines.length - nonEmptyLines.length) / lines.length,
specialCharRatio: specialChars / totalChars,
estimatedLanguages: detectedLangs,
};
}
const config = {
enableQualityProcessing: true,
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
const metrics = assessTextQuality(result.content);
console.log("Text Quality Assessment:");
console.log(` Length: ${metrics.contentLength} characters`);
console.log(
` Lines: ${metrics.lineCount} total, avg ${metrics.averageLineLength.toFixed(1)} chars/line`,
);
console.log(` Empty lines: ${(metrics.emptyLineRatio * 100).toFixed(1)}%`);
console.log(` Special chars: ${(metrics.specialCharRatio * 100).toFixed(2)}%`);
console.log(` Languages: ${metrics.estimatedLanguages.join(", ") || "unknown"}`);
console.log(` Kreuzberg quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
```
```typescript title="WASM - Quality-Based Content Filtering"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const config = {
enableQualityProcessing: true,
chunking: {
maxChars: 1000,
chunkOverlap: 200,
trim: true,
},
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
interface QualityFilteredChunk {
index: number;
content: string;
quality: number;
kept: boolean;
}
// Filter chunks based on quality heuristics
const qualityThreshold = 0.3; // Min ratio of visible/non-whitespace content
const filteredChunks: QualityFilteredChunk[] =
result.chunks?.map((chunk, idx) => {
const nonWhitespaceRatio = chunk.content.replace(/\s/g, "").length / chunk.content.length;
const hasNumbers = /\d/.test(chunk.content);
const hasPunctuation = /[.!?,;:]/g.test(chunk.content);
// Quality score based on content characteristics
const contentQuality =
(nonWhitespaceRatio + (hasNumbers ? 0.2 : 0) + (hasPunctuation ? 0.1 : 0)) / 2;
const kept = contentQuality >= qualityThreshold;
return {
index: idx,
content: chunk.content.substring(0, 50),
quality: contentQuality,
kept,
};
}) || [];
const keptChunks = filteredChunks.filter((c) => c.kept);
console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`);
keptChunks.slice(0, 3).forEach((c) => {
console.log(` Chunk ${c.index}: quality=${c.quality.toFixed(2)}, "${c.content}..."`);
});
```
```typescript title="WASM - Content Encoding Validation"
import init, { extractBytes } from "kreuzberg-wasm";
await init();
const config = {
enableQualityProcessing: true,
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
// Validate text encoding quality
interface EncodingValidation {
hasInvalidChars: boolean;
invalidCharCount: number;
replacementRatio: number;
estimatedEncoding: string;
}
function validateEncoding(content: string): EncodingValidation {
// Check for replacement characters (U+FFFD)
const replacementChars = (content.match(/<2F>/g) || []).length;
const hasInvalidChars = replacementChars > 0;
const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0;
// Guess encoding based on content patterns
const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII";
return {
hasInvalidChars,
invalidCharCount: replacementChars,
replacementRatio,
estimatedEncoding,
};
}
const validation = validateEncoding(result.content);
console.log("Content Encoding Validation:");
console.log(` Estimated encoding: ${validation.estimatedEncoding}`);
console.log(` Invalid characters: ${validation.invalidCharCount}`);
console.log(` Replacement ratio: ${(validation.replacementRatio * 100).toFixed(4)}%`);
console.log(
` Status: ${validation.hasInvalidChars ? "DEGRADED - encoding issues detected" : "OK"}`,
);
console.log(` Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`);
```