162 lines
5.0 KiB
Markdown
162 lines
5.0 KiB
Markdown
|
|
```typescript title="WASM - Assess Text Quality"
|
|||
|
|
import init, { extractBytes } from "kreuzberg-wasm";
|
|||
|
|
|
|||
|
|
await init();
|
|||
|
|
|
|||
|
|
/** Heuristic text-quality metrics computed locally from extracted content. */
interface TextQualityMetrics {
  /** Length of the content in UTF-16 code units. */
  contentLength: number;
  /** Number of lines, counting blank ones. */
  lineCount: number;
  /** Mean length of the non-blank lines; 0 when every line is blank. */
  averageLineLength: number;
  /** Fraction of lines that are empty or whitespace-only. */
  emptyLineRatio: number;
  /** Fraction of the content that is neither a letter, digit, underscore, whitespace nor basic punctuation. */
  specialCharRatio: number;
  /** Language codes guessed from the scripts that occur in the content. */
  estimatedLanguages: string[];
}

/**
 * Computes simple, local quality heuristics for a piece of extracted text.
 *
 * @param content - The text to analyse.
 * @returns Line statistics, the share of "special" characters and a rough
 *   script-based language guess. All ratios are finite numbers, even for
 *   empty input.
 */
function assessTextQuality(content: string): TextQualityMetrics {
  // Split on every single line break (LF or CRLF). Splitting on runs of
  // newlines would swallow the blank lines that emptyLineRatio has to count.
  const lines = content.split(/\r?\n/);
  const nonEmptyLines = lines.filter((line) => line.trim().length > 0);
  const totalChars = content.length;

  // Unicode-aware classes: letters, marks and digits of any script count as
  // regular text, so CJK or accented content is not reported as 100% special.
  const specialMatches: string[] = content.match(/[^\p{L}\p{M}\p{N}_\s.,:;!?]/gu) ?? [];
  // Sum UTF-16 lengths so the numerator uses the same unit as totalChars.
  const specialChars = specialMatches.reduce((total, match) => total + match.length, 0);

  // Simple language detection by character patterns. Ranges are written as
  // escapes so they survive re-encoding of this file.
  const scriptPatterns: ReadonlyArray<readonly [string, RegExp]> = [
    ["en", /[a-zA-Z]/],
    ["zh", /[\u4e00-\u9fff]/], // CJK Unified Ideographs
    ["ja", /[\u3040-\u309f\u30a0-\u30ff]/], // Hiragana and Katakana
    ["ko", /[\uac00-\ud7af]/], // Hangul Syllables
  ];
  const detectedLangs = scriptPatterns
    .filter(([, pattern]) => pattern.test(content))
    .map(([code]) => code);

  const nonEmptyTotal = nonEmptyLines.reduce((sum, line) => sum + line.length, 0);

  return {
    contentLength: totalChars,
    lineCount: lines.length,
    averageLineLength: nonEmptyLines.length > 0 ? nonEmptyTotal / nonEmptyLines.length : 0,
    // split() always yields at least one element, so this never divides by zero.
    emptyLineRatio: (lines.length - nonEmptyLines.length) / lines.length,
    // Guard against 0 / 0 = NaN for empty content.
    specialCharRatio: totalChars > 0 ? specialChars / totalChars : 0,
    estimatedLanguages: detectedLangs,
  };
}
|
|||
|
|
|
|||
|
|
// Request quality processing from the extractor.
const config = { enableQualityProcessing: true };

// `buffer` is not defined in this snippet; it is expected to hold the raw PDF data.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Run the local heuristics on the extracted text.
const metrics = assessTextQuality(result.content);

// Pre-format the derived values so the report below reads as plain text.
const averageLine = metrics.averageLineLength.toFixed(1);
const emptyPercent = (metrics.emptyLineRatio * 100).toFixed(1);
const specialPercent = (metrics.specialCharRatio * 100).toFixed(2);
const languages = metrics.estimatedLanguages.join(", ") || "unknown";
const qualityScore = result.qualityScore?.toFixed(3) || "N/A";

const report: string[] = [
  "Text Quality Assessment:",
  ` Length: ${metrics.contentLength} characters`,
  ` Lines: ${metrics.lineCount} total, avg ${averageLine} chars/line`,
  ` Empty lines: ${emptyPercent}%`,
  ` Special chars: ${specialPercent}%`,
  ` Languages: ${languages}`,
  ` Kreuzberg quality score: ${qualityScore}`,
];

// One console.log call per report line, in order.
for (const line of report) {
  console.log(line);
}
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
```typescript title="WASM - Quality-Based Content Filtering"
|
|||
|
|
import init, { extractBytes } from "kreuzberg-wasm";
|
|||
|
|
|
|||
|
|
await init();
|
|||
|
|
|
|||
|
|
// Enable quality processing and split the extracted text into overlapping chunks.
const config = {
  enableQualityProcessing: true,
  chunking: {
    maxChars: 1000,
    chunkOverlap: 200,
    trim: true,
  },
};

// `buffer` is not defined in this snippet; it is expected to hold the raw PDF data.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

/** A chunk preview together with its heuristic score and the filter decision. */
interface QualityFilteredChunk {
  /** Position of the chunk in `result.chunks`. */
  index: number;
  /** First 50 characters of the chunk, for display. */
  content: string;
  /** Heuristic quality score computed by `scoreChunk`. */
  quality: number;
  /** Whether the score reached `qualityThreshold`. */
  kept: boolean;
}

// Filter chunks based on quality heuristics
const qualityThreshold = 0.3; // Minimum heuristic score a chunk needs to be kept

/**
 * Scores a chunk from its content characteristics: the share of
 * non-whitespace characters plus small bonuses for digits (0.2) and
 * punctuation (0.1), halved.
 *
 * @param content - Full text of the chunk.
 * @returns The heuristic score; 0 for an empty chunk.
 */
function scoreChunk(content: string): number {
  // An empty chunk would otherwise produce 0 / 0 = NaN.
  if (content.length === 0) {
    return 0;
  }

  const nonWhitespaceRatio = content.replace(/\s/g, "").length / content.length;
  const hasNumbers = /\d/.test(content);
  // No `g` flag: test() on a global regex is stateful through lastIndex.
  const hasPunctuation = /[.!?,;:]/.test(content);

  return (nonWhitespaceRatio + (hasNumbers ? 0.2 : 0) + (hasPunctuation ? 0.1 : 0)) / 2;
}

const filteredChunks: QualityFilteredChunk[] =
  result.chunks?.map((chunk, idx) => {
    const contentQuality = scoreChunk(chunk.content);

    return {
      index: idx,
      content: chunk.content.substring(0, 50),
      quality: contentQuality,
      kept: contentQuality >= qualityThreshold,
    };
  }) ?? []; // No chunks in the result: nothing to filter.

const keptChunks = filteredChunks.filter((c) => c.kept);
console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`);

// Show a preview of the first three chunks that passed the filter.
keptChunks.slice(0, 3).forEach((c) => {
  console.log(` Chunk ${c.index}: quality=${c.quality.toFixed(2)}, "${c.content}..."`);
});
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
```typescript title="WASM - Content Encoding Validation"
|
|||
|
|
import init, { extractBytes } from "kreuzberg-wasm";
|
|||
|
|
|
|||
|
|
await init();
|
|||
|
|
|
|||
|
|
// Request quality processing from the extractor.
const config = { enableQualityProcessing: true };

// `buffer` is not defined in this snippet; it is expected to hold the raw PDF data.
const bytes = new Uint8Array(buffer);

// Extract the document, declaring its MIME type as PDF.
const result = await extractBytes(bytes, "application/pdf", config);
|
|||
|
|
|
|||
|
|
// Validate text encoding quality
|
|||
|
|
/** Result of checking extracted text for signs of a broken decode. */
interface EncodingValidation {
  /** True when at least one U+FFFD replacement character is present. */
  hasInvalidChars: boolean;
  /** Number of U+FFFD replacement characters found. */
  invalidCharCount: number;
  /** `invalidCharCount` divided by the content length; 0 when none were found. */
  replacementRatio: number;
  /** "UTF-8" when any non-ASCII character occurs, otherwise "ASCII". */
  estimatedEncoding: string;
}

/**
 * Checks text for U+FFFD replacement characters and makes a coarse
 * ASCII-versus-UTF-8 guess.
 *
 * @param content - The text to inspect.
 * @returns Replacement-character statistics and the encoding guess.
 */
function validateEncoding(content: string): EncodingValidation {
  // Check for replacement characters (U+FFFD). The escape keeps the pattern
  // intact even if this file is itself re-encoded.
  const replacementChars = (content.match(/\uFFFD/g) ?? []).length;
  const hasInvalidChars = replacementChars > 0;
  // A match implies non-empty content, so the division cannot be 0 / 0.
  const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0;

  // Guess encoding based on content patterns: anything outside 7-bit ASCII
  // is reported as UTF-8.
  const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII";

  return {
    hasInvalidChars,
    invalidCharCount: replacementChars,
    replacementRatio,
    estimatedEncoding,
  };
}
|
|||
|
|
|
|||
|
|
// Inspect the extracted text for replacement characters.
const validation = validateEncoding(result.content);

// Pre-format the derived values so the report below reads as plain text.
const replacementPercent = (validation.replacementRatio * 100).toFixed(4);
const status = validation.hasInvalidChars ? "DEGRADED - encoding issues detected" : "OK";
const qualityScore = result.qualityScore?.toFixed(3) || "N/A";

const summary: string[] = [
  "Content Encoding Validation:",
  ` Estimated encoding: ${validation.estimatedEncoding}`,
  ` Invalid characters: ${validation.invalidCharCount}`,
  ` Replacement ratio: ${replacementPercent}%`,
  ` Status: ${status}`,
  ` Quality score: ${qualityScore}`,
];

// One console.log call per summary line, in order.
for (const line of summary) {
  console.log(line);
}
|
|||
|
|
```
|