Files
fil/docs/snippets/wasm/api/combining_all_features.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

1.6 KiB

import init, { extractBytes } from "kreuzberg-wasm";

// Initialise the WASM module before calling any of its exports.
await init();

// Look the input up without a type assertion: getElementById returns null when
// the element is absent, and an `as HTMLInputElement` cast would turn that into
// a TypeError on the `.files` access below.
const fileInput = document.getElementById("file");
if (!(fileInput instanceof HTMLInputElement)) {
  console.warn('Expected an <input> element with id "file"; skipping extraction.');
}
const file = fileInput instanceof HTMLInputElement ? fileInput.files?.[0] : undefined;

if (file) {
  // Read the selected file fully into memory as raw bytes.
  const bytes = new Uint8Array(await file.arrayBuffer());

  // Build a comprehensive extraction config
  const config = {
    use_cache: true,
    enable_quality_processing: true,
    output_format: "markdown",
    include_document_structure: true,

    // Chunking configuration
    chunking: {
      strategy: "semantic",
      max_chunk_size: 1024,
      overlap: 100,
    },

    // Image extraction configuration
    images: {
      extract_images: true,
      extract_base64: false,
      extract_raw_bytes: false,
    },

    // OCR configuration
    ocr: {
      backend: "tesseract",
      languages: ["eng"],
      enabled: true,
    },

    // HTML-specific extraction options
    // NOTE(review): every other nested option group here is an object, while
    // this one is a bare string — confirm against the library's config schema
    // that a string is accepted for html_options.
    html_options: "article, main, .content",

    // PDF-specific options
    pdf_options: {
      ocr_strategy: "auto",
      preserve_images: true,
    },

    // Security limits
    security_limits: {
      // 524288000 === 500 * 1024 * 1024 (presumably bytes — TODO confirm units).
      max_archive_size: 524288000,
      max_file_count: 10000,
      max_compression_ratio: 100,
    },
  };

  try {
    // Fall back to a generic MIME type when the browser reports none for the file.
    const result = await extractBytes(bytes, file.type || "application/octet-stream", config);

    // Log a short preview (first 100 characters) and summary counts.
    console.log(`Content: ${result.content.substring(0, 100)}...`);
    console.log(`Language: ${result.metadata?.language ?? "Unknown"}`);
    console.log(`Chunks: ${result.chunks?.length ?? 0}`);
    if (result.images?.length) {
      console.log(`Images: ${result.images.length}`);
    }
  } catch (err) {
    // Report the failure rather than letting the rejection go unhandled.
    console.error("Extraction failed:", err);
  }
}