Files
fil/docs/snippets/wasm/advanced/language_detection_config.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

2.6 KiB

import init, { extractBytes } from "kreuzberg-wasm";

// Wait for the kreuzberg-wasm module to finish initializing.
await init();

// Turn on language detection with a minConfidence setting of 0.75.
const languageDetection = { enabled: true, minConfidence: 0.75 };
const extractionConfig = { languageDetection };

// NOTE: `buffer` is not declared in this snippet; presumably the surrounding
// code supplies the raw document data.
const pdfBytes = new Uint8Array(buffer);
const extraction = await extractBytes(pdfBytes, "application/pdf", extractionConfig);

// Print the detection results. Optional chaining means a missing `metadata`
// or `detectedLanguages` is interpolated as "undefined" instead of throwing.
const meta = extraction.metadata;
console.log(`Primary language: ${meta?.language}`);
console.log(`Language confidence: ${meta?.languageConfidence}`);
console.log(`Detected languages: ${extraction.detectedLanguages?.join(", ")}`);
import init, { extractBytes } from "kreuzberg-wasm";

// Wait for the kreuzberg-wasm module to finish initializing.
await init();

const languageDetection = {
  enabled: true,
  minConfidence: 0.6,
  detectMultiple: true, // Turn on detection of multiple languages
};
const extractionConfig = { languageDetection };

// NOTE: `buffer` is not declared in this snippet; presumably the surrounding
// code supplies the raw document data.
const pdfBytes = new Uint8Array(buffer);
const extraction = await extractBytes(pdfBytes, "application/pdf", extractionConfig);

// Report the primary language and its confidence, formatted to three decimals.
const meta = extraction.metadata;
console.log(`Primary language: ${meta?.language}`);
console.log(`Confidence score: ${meta?.languageConfidence?.toFixed(3)}`);

const detected = extraction.detectedLanguages;
console.log(`All detected languages: ${detected?.join(", ")}`);

// More than one detected language: point the reader at multilingual
// handling for downstream processing. A missing list counts as zero.
if ((detected?.length ?? 0) > 1) {
  console.log("Document contains multiple languages - enable multilingual NLP processing");
}
import init, { extractBytes } from "kreuzberg-wasm";

/** Text clean-up switches selected per detected language. */
type LanguageRules = { cleanWhitespace: boolean; normalizeText: boolean };

// Wait for the kreuzberg-wasm module to finish initializing.
await init();

const config = {
  languageDetection: {
    enabled: true,
    minConfidence: 0.8,
    detectMultiple: false,
  },
  // Static extraction setting, fixed before extraction runs. The
  // language-dependent handling happens after extraction, further down.
  quality: {
    enableQualityProcessing: true,
  },
};

// NOTE: `buffer` is not declared in this snippet; presumably the surrounding
// code supplies the raw document data.
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "text/plain", config);

// Process result based on detected language
const language = result.metadata?.language;
console.log(`Processing document in language: ${language}`);

// Example: language-specific rules, keyed by language code.
const languageConfig: Record<string, LanguageRules> = {
  en: { cleanWhitespace: true, normalizeText: true },
  zh: { cleanWhitespace: false, normalizeText: true }, // Chinese: preserve whitespace patterns
  ja: { cleanWhitespace: false, normalizeText: false }, // Japanese: preserve as-is
  ar: { cleanWhitespace: true, normalizeText: true }, // Arabic
};

// Rules applied when no language was detected or the language has no entry.
const defaultLangConfig: LanguageRules = {
  cleanWhitespace: true,
  normalizeText: true,
};

// Narrow `language` with a runtime check instead of a type assertion, and
// accept own keys only: a plain-object index would otherwise resolve names
// such as "constructor" to inherited Object.prototype members, which are
// truthy and would bypass the default.
const langConfig =
  typeof language === "string" &&
  Object.prototype.hasOwnProperty.call(languageConfig, language)
    ? languageConfig[language] ?? defaultLangConfig
    : defaultLangConfig;
console.log(`Language config: ${JSON.stringify(langConfig)}`);