This commit is contained in:
161
docs/snippets/wasm/advanced/quality_processing_example.md
Normal file
161
docs/snippets/wasm/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,161 @@
|
||||
```typescript title="WASM - Assess Text Quality"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
/** Heuristic quality metrics derived from a piece of extracted text. */
interface TextQualityMetrics {
  /** Length of the content in UTF-16 code units (`content.length`). */
  contentLength: number;
  /** Number of lines, including blank ones. */
  lineCount: number;
  /** Mean length of the lines that contain non-whitespace text; 0 when there are none. */
  averageLineLength: number;
  /** Fraction of lines that are empty or whitespace-only. */
  emptyLineRatio: number;
  /** Fraction of the content made up of "special" characters; 0 for empty content. */
  specialCharRatio: number;
  /** Language codes guessed from the scripts found in the content. */
  estimatedLanguages: string[];
}

/**
 * Computes simple, script-based quality heuristics for a block of text.
 *
 * @param content - The text to analyse (for example the content of an extraction result).
 * @returns Line statistics, the special-character ratio, and a rough language guess.
 */
function assessTextQuality(content: string): TextQualityMetrics {
  // Split on every line break (LF or CRLF). Splitting on runs of newlines would
  // swallow blank lines and make emptyLineRatio blind to them.
  const lines = content.split(/\r?\n/);
  const nonEmptyLines = lines.filter((line) => line.trim().length > 0);
  const totalChars = content.length;

  // "Special" means anything that is not a Unicode letter, mark, digit, underscore,
  // whitespace, or basic punctuation. ASCII-only \w would flag all CJK text as special.
  const specialMatches: string[] = content.match(/[^\p{L}\p{M}\p{N}_\s.,:;!?]/gu) ?? [];
  // Sum match lengths so the count is in UTF-16 code units, the same unit as totalChars.
  const specialChars = specialMatches.reduce((count, match) => count + match.length, 0);

  // Simple language detection by character patterns. Ranges are written as escapes
  // so that the boundary code points cannot be lost to encoding problems.
  // Note: the "zh" range (CJK Unified Ideographs) also matches kanji in Japanese text.
  const scriptPatterns: ReadonlyArray<readonly [string, RegExp]> = [
    ["en", /[a-zA-Z]/],
    ["zh", /[\u4E00-\u9FFF]/],
    ["ja", /[\u3040-\u309F\u30A0-\u30FF]/], // Hiragana + Katakana
    ["ko", /[\uAC00-\uD7AF]/], // Hangul Syllables
  ];
  const detectedLangs = scriptPatterns
    .filter(([, pattern]) => pattern.test(content))
    .map(([code]) => code);

  const nonEmptyChars = nonEmptyLines.reduce((sum, line) => sum + line.length, 0);

  return {
    contentLength: totalChars,
    lineCount: lines.length,
    averageLineLength: nonEmptyLines.length > 0 ? nonEmptyChars / nonEmptyLines.length : 0,
    // split() always returns at least one element, so this division is safe.
    emptyLineRatio: (lines.length - nonEmptyLines.length) / lines.length,
    // Guard against 0 / 0 (NaN) when the content is empty.
    specialCharRatio: totalChars > 0 ? specialChars / totalChars : 0,
    estimatedLanguages: detectedLangs,
  };
}
|
||||
|
||||
// Turn on kreuzberg's built-in quality processing for this extraction.
const config = { enableQualityProcessing: true };

// NOTE(review): `buffer` is not defined in this snippet; it is assumed to hold the
// raw PDF bytes (e.g. an ArrayBuffer) supplied by the surrounding application.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Run the local heuristics on the extracted text.
const metrics = assessTextQuality(result.content);

// Build the report first, then print it line by line.
const reportLines = [
  "Text Quality Assessment:",
  ` Length: ${metrics.contentLength} characters`,
  ` Lines: ${metrics.lineCount} total, avg ${metrics.averageLineLength.toFixed(1)} chars/line`,
  ` Empty lines: ${(metrics.emptyLineRatio * 100).toFixed(1)}%`,
  ` Special chars: ${(metrics.specialCharRatio * 100).toFixed(2)}%`,
  ` Languages: ${metrics.estimatedLanguages.join(", ") || "unknown"}`,
  // The score is optional on the result; print a placeholder when it is missing.
  ` Kreuzberg quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`,
];
for (const line of reportLines) {
  console.log(line);
}
|
||||
```
|
||||
|
||||
```typescript title="WASM - Quality-Based Content Filtering"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Enable quality processing and split the extracted text into overlapping chunks.
const config = {
  enableQualityProcessing: true,
  chunking: {
    maxChars: 1000,
    chunkOverlap: 200,
    trim: true,
  },
};

// NOTE(review): `buffer` is not defined in this snippet; it is assumed to hold the
// raw PDF bytes supplied by the surrounding application.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

/** A chunk preview annotated with its heuristic quality score and filter decision. */
interface QualityFilteredChunk {
  /** Position of the chunk in `result.chunks`. */
  index: number;
  /** First 50 characters of the chunk, used as a preview. */
  content: string;
  /** Heuristic quality score (see the scoring formula below). */
  quality: number;
  /** Whether the score reached `qualityThreshold`. */
  kept: boolean;
}

// Filter chunks based on quality heuristics.
// Minimum combined score a chunk needs to be kept. The score is
// (nonWhitespaceRatio + number bonus + punctuation bonus) / 2, so it tops out at 0.65.
const qualityThreshold = 0.3;

const filteredChunks: QualityFilteredChunk[] =
  result.chunks?.map((chunk, idx) => {
    const text = chunk.content;

    // An empty chunk has no visible content: score the ratio as 0 rather than
    // dividing by zero, which would produce a NaN quality.
    const nonWhitespaceRatio = text.length > 0 ? text.replace(/\s/g, "").length / text.length : 0;
    const hasNumbers = /\d/.test(text);
    // No `g` flag: test() on a global regex is stateful via lastIndex.
    const hasPunctuation = /[.!?,;:]/.test(text);

    // Quality score based on content characteristics.
    const contentQuality =
      (nonWhitespaceRatio + (hasNumbers ? 0.2 : 0) + (hasPunctuation ? 0.1 : 0)) / 2;

    return {
      index: idx,
      content: text.substring(0, 50),
      quality: contentQuality,
      kept: contentQuality >= qualityThreshold,
    };
  }) ?? [];

const keptChunks = filteredChunks.filter((c) => c.kept);
console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`);

// Show a short preview of the first few chunks that passed the filter.
keptChunks.slice(0, 3).forEach((c) => {
  console.log(` Chunk ${c.index}: quality=${c.quality.toFixed(2)}, "${c.content}..."`);
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Content Encoding Validation"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
// Turn on kreuzberg's built-in quality processing for this extraction.
const config = { enableQualityProcessing: true };

// NOTE(review): `buffer` is not defined in this snippet; it is assumed to hold the
// raw PDF bytes supplied by the surrounding application.
const bytes = new Uint8Array(buffer);

// Extract the document as a PDF using the configuration above.
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
// Validate text encoding quality

/** Result of scanning text for signs of a lossy or failed character decoding. */
interface EncodingValidation {
  /** True when at least one U+FFFD replacement character was found. */
  hasInvalidChars: boolean;
  /** Number of U+FFFD replacement characters in the content. */
  invalidCharCount: number;
  /** Replacement characters as a fraction of the content length; 0 when none were found. */
  replacementRatio: number;
  /** "ASCII" when every character is below U+0080, otherwise "UTF-8". */
  estimatedEncoding: string;
}

/**
 * Checks text for U+FFFD replacement characters, which decoders insert in place
 * of bytes they could not decode.
 *
 * @param content - The text to inspect.
 * @returns Replacement-character counts and a coarse ASCII / UTF-8 guess.
 */
function validateEncoding(content: string): EncodingValidation {
  // Match the replacement character via its escape so the pattern cannot itself
  // be corrupted by an encoding round-trip.
  const replacementChars = (content.match(/\uFFFD/g) ?? []).length;
  const hasInvalidChars = replacementChars > 0;

  // A positive count implies non-empty content, so the division is safe.
  const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0;

  // Guess encoding based on content patterns: any non-ASCII character means "UTF-8".
  const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII";

  return {
    hasInvalidChars,
    invalidCharCount: replacementChars,
    replacementRatio,
    estimatedEncoding,
  };
}
|
||||
|
||||
// Scan the extracted text for decoding damage.
const validation = validateEncoding(result.content);

// Build the report first, then print it line by line.
const validationReport = [
  "Content Encoding Validation:",
  ` Estimated encoding: ${validation.estimatedEncoding}`,
  ` Invalid characters: ${validation.invalidCharCount}`,
  ` Replacement ratio: ${(validation.replacementRatio * 100).toFixed(4)}%`,
  ` Status: ${validation.hasInvalidChars ? "DEGRADED - encoding issues detected" : "OK"}`,
  // The score is optional on the result; print a placeholder when it is missing.
  ` Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`,
];
for (const line of validationReport) {
  console.log(line);
}
|
||||
```
|
||||
Reference in New Issue
Block a user