Files
fil/docs/snippets/wasm/advanced/quality_processing_example.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

5.0 KiB
Raw Blame History

import init, { extractBytes } from "kreuzberg-wasm";

await init();

/** Heuristic measurements of extracted text, produced by `assessTextQuality`. */
interface TextQualityMetrics {
  /** Length of the content in UTF-16 code units (`content.length`). */
  contentLength: number;
  /** Number of lines after splitting on LF / CRLF line breaks. */
  lineCount: number;
  /** Mean length of the non-blank lines; 0 when there are none. */
  averageLineLength: number;
  /** Fraction of lines that are empty or whitespace-only (0..1). */
  emptyLineRatio: number;
  /** Fraction of characters that are neither letters, digits, `_`, whitespace nor `.,:;!?` (0..1). */
  specialCharRatio: number;
  /** Rough script-based language guesses ("en", "zh", "ja", "ko"); may be empty. */
  estimatedLanguages: string[];
}

/**
 * Computes simple quality heuristics for a piece of extracted text.
 *
 * @param content - The text to analyse; may be empty.
 * @returns Line statistics, special-character ratio and a rough language guess.
 *   All ratios are finite numbers, including for empty input.
 */
function assessTextQuality(content: string): TextQualityMetrics {
  const totalChars = content.length;

  // Split on every single line break (LF or CRLF). Splitting on /\n+/ would
  // collapse runs of newlines and hide the blank lines we want to count, and
  // would leave a trailing "\r" on CRLF lines.
  const lines = content.split(/\r?\n/);
  const nonEmptyLines = lines.filter((line) => line.trim().length > 0);

  // Unicode-aware: letters, marks and digits of any script are "normal" text.
  // ASCII-only \w would flag every CJK character as special.
  const specialChars = (content.match(/[^\p{L}\p{M}\p{N}_\s.,:;!?]/gu) ?? []).length;

  // Simple language detection by character patterns (script ranges written as
  // escapes so they survive re-encoding of this file).
  const detectedLangs: string[] = [];
  if (/[a-zA-Z]/.test(content)) detectedLangs.push("en");
  if (/[\u4E00-\u9FFF]/.test(content)) detectedLangs.push("zh"); // CJK Unified Ideographs
  if (/[\u3040-\u309F\u30A0-\u30FF]/.test(content)) detectedLangs.push("ja"); // Hiragana + Katakana
  if (/[\uAC00-\uD7AF]/.test(content)) detectedLangs.push("ko"); // Hangul Syllables

  const nonEmptyChars = nonEmptyLines.reduce((sum, line) => sum + line.length, 0);

  return {
    contentLength: totalChars,
    lineCount: lines.length,
    averageLineLength: nonEmptyLines.length > 0 ? nonEmptyChars / nonEmptyLines.length : 0,
    // `split` always yields at least one element, so this never divides by zero.
    emptyLineRatio: (lines.length - nonEmptyLines.length) / lines.length,
    // Guard the empty-string case, which would otherwise produce NaN (0 / 0).
    specialCharRatio: totalChars > 0 ? specialChars / totalChars : 0,
    estimatedLanguages: detectedLangs,
  };
}

// Extraction options: turn on Kreuzberg's quality processing.
const config = { enableQualityProcessing: true };

// NOTE(review): `buffer` is not declared in this snippet — presumably an
// ArrayBuffer holding the PDF, supplied by the surrounding code; confirm.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

// Run the local heuristics on the extracted text.
const metrics = assessTextQuality(result.content);

// Build the report first, then print it one line per console.log call.
const qualityReport: string[] = [
  "Text Quality Assessment:",
  `  Length: ${metrics.contentLength} characters`,
  `  Lines: ${metrics.lineCount} total, avg ${metrics.averageLineLength.toFixed(1)} chars/line`,
  `  Empty lines: ${(metrics.emptyLineRatio * 100).toFixed(1)}%`,
  `  Special chars: ${(metrics.specialCharRatio * 100).toFixed(2)}%`,
  `  Languages: ${metrics.estimatedLanguages.join(", ") || "unknown"}`,
  `  Kreuzberg quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`,
];
for (const reportLine of qualityReport) {
  console.log(reportLine);
}
import init, { extractBytes } from "kreuzberg-wasm";

await init();

// Chunking options forwarded to the extractor as `config.chunking`.
const chunking = { maxChars: 1000, chunkOverlap: 200, trim: true };

// Quality processing plus chunking, so `result.chunks` can be scored afterwards.
const config = { enableQualityProcessing: true, chunking };

// NOTE(review): `buffer` is not declared in this snippet — presumably an
// ArrayBuffer holding the PDF, supplied by the surrounding code; confirm.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);

/** Outcome of the quality heuristic for one extracted chunk. */
interface QualityFilteredChunk {
  /** Position of the chunk within `result.chunks`. */
  index: number;
  /** Preview of the chunk: its first 50 characters. */
  content: string;
  /** Heuristic score computed below (higher is better). */
  quality: number;
  /** Whether `quality` reached `qualityThreshold`. */
  kept: boolean;
}

// Filter chunks based on quality heuristics.
// Minimum heuristic score a chunk needs to be kept. The score is
// (nonWhitespaceRatio + 0.2 if it has digits + 0.1 if it has punctuation) / 2,
// so it is not a plain non-whitespace ratio.
const qualityThreshold = 0.3;
const filteredChunks: QualityFilteredChunk[] =
  result.chunks?.map((chunk, idx) => {
    const text = chunk.content;

    // An empty chunk would give 0 / 0 = NaN; treat it as having no visible content.
    const nonWhitespaceRatio = text.length > 0 ? text.replace(/\s/g, "").length / text.length : 0;
    const hasNumbers = /\d/.test(text);
    // No `g` flag: `.test()` on a global regex is stateful via `lastIndex`.
    const hasPunctuation = /[.!?,;:]/.test(text);

    // Quality score based on content characteristics
    const contentQuality =
      (nonWhitespaceRatio + (hasNumbers ? 0.2 : 0) + (hasPunctuation ? 0.1 : 0)) / 2;
    const kept = contentQuality >= qualityThreshold;

    return {
      index: idx,
      content: text.substring(0, 50),
      quality: contentQuality,
      kept,
    };
  }) ?? []; // no chunks returned -> nothing to filter

// Keep only the chunks that passed the heuristic and report how many survived.
const keptChunks = filteredChunks.filter((entry) => entry.kept);
console.log(`Quality-filtered chunks: ${keptChunks.length}/${filteredChunks.length}`);

// Show a short preview of at most the first three kept chunks.
for (const entry of keptChunks.slice(0, 3)) {
  const score = entry.quality.toFixed(2);
  console.log(`  Chunk ${entry.index}: quality=${score}, "${entry.content}..."`);
}
import init, { extractBytes } from "kreuzberg-wasm";

await init();

// Extraction options: quality processing only.
const config = { enableQualityProcessing: true };

// MIME type handed to the extractor for the input bytes.
const pdfMimeType = "application/pdf";

// NOTE(review): `buffer` is not declared in this snippet — presumably an
// ArrayBuffer holding the PDF, supplied by the surrounding code; confirm.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, pdfMimeType, config);

// Validate text encoding quality
/** Result of scanning extracted text for signs of broken decoding. */
interface EncodingValidation {
  /** True when at least one U+FFFD replacement character was found. */
  hasInvalidChars: boolean;
  /** Number of U+FFFD replacement characters in the content. */
  invalidCharCount: number;
  /** `invalidCharCount / content.length`, or 0 when none were found. */
  replacementRatio: number;
  /** "UTF-8" if any non-ASCII character is present, otherwise "ASCII". */
  estimatedEncoding: string;
}

/**
 * Checks extracted text for U+FFFD replacement characters, which decoders
 * insert in place of bytes they could not decode.
 *
 * @param content - The text to inspect; may be empty.
 * @returns Replacement-character statistics and a coarse ASCII / UTF-8 guess.
 */
function validateEncoding(content: string): EncodingValidation {
  // Match the replacement character via its escape; a literal placeholder such
  // as "<2F>" only matches that literal text and never detects U+FFFD.
  const replacementChars = (content.match(/\uFFFD/g) ?? []).length;
  const hasInvalidChars = replacementChars > 0;
  // Only divide when something was found, which also avoids 0 / 0 on empty input.
  const replacementRatio = hasInvalidChars ? replacementChars / content.length : 0;

  // Guess encoding based on content patterns: anything outside 7-bit ASCII
  // is reported as UTF-8.
  const estimatedEncoding = /[^\x00-\x7F]/.test(content) ? "UTF-8" : "ASCII";

  return {
    hasInvalidChars,
    invalidCharCount: replacementChars,
    replacementRatio,
    estimatedEncoding,
  };
}

// Scan the extracted text for decoding damage.
const validation = validateEncoding(result.content);

// Assemble the report, then emit it one line per console.log call.
const encodingReport: string[] = [
  "Content Encoding Validation:",
  `  Estimated encoding: ${validation.estimatedEncoding}`,
  `  Invalid characters: ${validation.invalidCharCount}`,
  `  Replacement ratio: ${(validation.replacementRatio * 100).toFixed(4)}%`,
  `  Status: ${validation.hasInvalidChars ? "DEGRADED - encoding issues detected" : "OK"}`,
  `  Quality score: ${result.qualityScore?.toFixed(3) || "N/A"}`,
];
for (const reportLine of encodingReport) {
  console.log(reportLine);
}