This commit is contained in:
35
docs/snippets/wasm/config/advanced_config.md
Normal file
35
docs/snippets/wasm/config/advanced_config.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```typescript title="WASM"
|
||||
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract-wasm",
|
||||
language: "eng",
|
||||
},
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
chunkOverlap: 100,
|
||||
},
|
||||
enable_language_detection: true,
|
||||
enable_quality: true,
|
||||
};
|
||||
|
||||
const fileInput = document.getElementById("file") as HTMLInputElement;
|
||||
const file = fileInput.files?.[0];
|
||||
|
||||
if (file) {
|
||||
const result = await extractFromFile(file, file.type, config);
|
||||
|
||||
if (result.chunks) {
|
||||
for (const chunk of result.chunks) {
|
||||
console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
|
||||
}
|
||||
}
|
||||
|
||||
if (result.detectedLanguages) {
|
||||
console.log(`Languages: ${result.detectedLanguages.join(", ")}`);
|
||||
}
|
||||
}
|
||||
```
|
||||
29
docs/snippets/wasm/config/basic-config.ts
Normal file
29
docs/snippets/wasm/config/basic-config.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Fetches "document.pdf" and extracts it with OCR, image and chunking options set
 * in one config object, then logs the length of the extracted content.
 */
async function extractWithConfig() {
  // initWasm() is awaited before any extraction call is made.
  await initWasm();

  const response = await fetch("document.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    ocr: { backend: "tesseract-wasm", language: "eng" },
    images: { extractImages: true, targetDpi: 200 },
    chunking: { maxChars: 1000, chunkOverlap: 100 },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  console.log("Extraction complete");
  console.log("Content length:", result.content.length);
}
|
||||
|
||||
extractWithConfig().catch(console.error);
|
||||
66
docs/snippets/wasm/config/chunking-config.ts
Normal file
66
docs/snippets/wasm/config/chunking-config.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Fetches "book.pdf", extracts it with a chunking config (maxChars 800,
 * chunkOverlap 150, splitting on newlines and sentences) and logs a preview of
 * the first three chunks.
 *
 * The markdown / prepend-heading-context variant lives in
 * `extractWithPrependHeadingContext`; it is intentionally not duplicated here as
 * an unused local config.
 */
async function extractWithChunking() {
  await initWasm();

  const response = await fetch("book.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    chunking: {
      maxChars: 800,
      chunkOverlap: 150,
      splitOnNewlines: true,
      splitOnSentences: true,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  // `chunks` may be absent on the result, so guard before reading it.
  if (result.chunks) {
    console.log(`Total chunks: ${result.chunks.length}`);

    // Preview only the first three chunks to keep the output short.
    result.chunks.slice(0, 3).forEach((chunk, i) => {
      console.log(`\nChunk ${i}:`);
      console.log(`Chars: ${chunk.metadata.charStart}-${chunk.metadata.charEnd}`);
      console.log(`Content: ${chunk.content.substring(0, 100)}...`);
    });
  }
}
|
||||
|
||||
/**
 * Fetches "document.md" and extracts it with the markdown chunker and
 * `prependHeadingContext` enabled, then logs a preview of the first three chunks.
 */
async function extractWithPrependHeadingContext() {
  await initWasm();

  const response = await fetch("document.md");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    chunking: { chunkerType: "markdown", maxChars: 800, prependHeadingContext: true },
  };

  const result = await extractBytes(bytes, "text/markdown", config);
  if (!result.chunks) {
    return;
  }

  console.log(`Total chunks: ${result.chunks.length}`);

  const preview = result.chunks.slice(0, 3);
  preview.forEach((chunk, i) => {
    // Each chunk's content is prefixed with its heading breadcrumb
    console.log(`\nChunk ${i}: ${chunk.content.substring(0, 100)}...`);
  });
}
|
||||
|
||||
extractWithChunking().catch(console.error);
|
||||
extractWithPrependHeadingContext().catch(console.error);
|
||||
71
docs/snippets/wasm/config/chunking_config.md
Normal file
71
docs/snippets/wasm/config/chunking_config.md
Normal file
@@ -0,0 +1,71 @@
|
||||
```typescript title="WASM"
|
||||
import { initWasm, extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
maxChars: 1000,
|
||||
chunkOverlap: 100,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
|
||||
result.chunks?.forEach((chunk, idx) => {
|
||||
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
|
||||
console.log(`Tokens: ${chunk.metadata?.token_count}`);
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Markdown with Heading Context"
|
||||
import { initWasm, extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "markdown",
|
||||
maxChars: 2000,
|
||||
// Note: Token-based sizing is not available in WASM builds.
|
||||
// Use character-based sizing instead.
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/markdown", config);
|
||||
|
||||
result.chunks?.forEach((chunk, idx) => {
|
||||
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
|
||||
|
||||
if (chunk.metadata?.headingContext?.headings) {
|
||||
console.log("Headings:");
|
||||
chunk.metadata.headingContext.headings.forEach((h) => {
|
||||
console.log(` Level ${h.level}: ${h.text}`);
|
||||
});
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
```typescript title="WASM - Prepend Heading Context"
|
||||
import { initWasm, extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
chunking: {
|
||||
chunkerType: "markdown",
|
||||
maxChars: 2000,
|
||||
prependHeadingContext: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "text/markdown", config);
|
||||
|
||||
result.chunks?.forEach((chunk, idx) => {
|
||||
// Each chunk's content is prefixed with its heading breadcrumb
|
||||
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 80)}...`);
|
||||
});
|
||||
```
|
||||
34
docs/snippets/wasm/config/combined-config.ts
Normal file
34
docs/snippets/wasm/config/combined-config.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Fetches "complex.pdf", extracts it with OCR, chunking and image options
 * combined, and logs a summary of the result.
 */
async function extractWithFullConfig() {
  await initWasm();

  const response = await fetch("complex.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    ocr: {
      backend: "tesseract-wasm",
      language: "deu",
    },
    chunking: {
      maxChars: 1000,
      chunkOverlap: 200,
    },
    images: {
      extractImages: true,
      targetDpi: 200,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  console.log("=== Extraction Results ===");
  console.log(`Content: ${result.content.length} chars`);
  console.log(`Chunks: ${result.chunks?.length ?? 0}`);
  console.log(`Images: ${result.images?.length ?? 0}`);
  console.log(`Tables: ${result.tables.length}`);
  // detectedLanguages is optional here (accessed with ?.); fall back to a readable
  // placeholder instead of interpolating the literal string "undefined".
  console.log(`Languages: ${result.detectedLanguages?.join(", ") ?? "none"}`);
}
|
||||
|
||||
extractWithFullConfig().catch(console.error);
|
||||
34
docs/snippets/wasm/config/conditional-config.ts
Normal file
34
docs/snippets/wasm/config/conditional-config.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Builds an extraction config from the given file size and extracts "document.pdf".
 *
 * @param fileSize - Size in bytes used to pick options:
 *   above 10 MiB chunking is configured, below 1 MiB image extraction is configured,
 *   and `ocr.enabled` is true only below 50 MiB.
 * @returns The result of `extractBytes` for the fetched document.
 */
async function extractWithConditionalConfig(fileSize: number) {
  await initWasm();

  const MIB = 1024 * 1024;
  const config: ExtractionConfig = {};

  if (fileSize > 10 * MIB) {
    config.chunking = { maxChars: 500, chunkOverlap: 50 };
  }

  if (fileSize < 1 * MIB) {
    config.images = { extractImages: true, targetDpi: 300 };
  }

  // Always set, but the flag itself depends on the size.
  config.ocr = { enabled: fileSize < 50 * MIB };

  const response = await fetch("document.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());

  return await extractBytes(bytes, "application/pdf", config);
}
|
||||
|
||||
// Run the example; report failures instead of leaving the rejection unhandled.
extractWithConditionalConfig(5 * 1024 * 1024)
  .then(() => console.log("Done"))
  .catch(console.error);
|
||||
19
docs/snippets/wasm/config/config_basic.md
Normal file
19
docs/snippets/wasm/config/config_basic.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```typescript title="WASM"
|
||||
import { initWasm, extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract-wasm",
|
||||
language: "eng",
|
||||
},
|
||||
images: {
|
||||
extractImages: true,
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
console.log(result.content);
|
||||
```
|
||||
18
docs/snippets/wasm/config/config_discover.md
Normal file
18
docs/snippets/wasm/config/config_discover.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```typescript title="WASM"
|
||||
import { initWasm, extractBytes } from "@kreuzberg/wasm";
|
||||
|
||||
await initWasm();
|
||||
|
||||
const config = {
|
||||
use_cache: true,
|
||||
enable_quality_processing: true,
|
||||
ocr: {
|
||||
backend: "tesseract-wasm",
|
||||
language: "eng",
|
||||
},
|
||||
};
|
||||
|
||||
const bytes = new Uint8Array(buffer);
|
||||
const result = await extractBytes(bytes, "application/pdf", config);
|
||||
console.log(result.content);
|
||||
```
|
||||
18
docs/snippets/wasm/config/config_ocr.md
Normal file
18
docs/snippets/wasm/config/config_ocr.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("scanned.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Content length: ${result.content.length}`);
|
||||
console.log(`Tables detected: ${result.tables?.length || 0}`);
|
||||
```
|
||||
26
docs/snippets/wasm/config/config_programmatic.md
Normal file
26
docs/snippets/wasm/config/config_programmatic.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
use_cache: true,
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+deu",
|
||||
tesseract_config: {
|
||||
psm: 6,
|
||||
},
|
||||
},
|
||||
chunking: {
|
||||
max_characters: 1000,
|
||||
overlap: 200,
|
||||
},
|
||||
enable_quality_processing: true,
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Content length: ${result.content.length}`);
|
||||
```
|
||||
15
docs/snippets/wasm/config/document_structure_config.md
Normal file
15
docs/snippets/wasm/config/document_structure_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```typescript title="Document Structure Config (WASM)"
|
||||
import { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
const config = {
|
||||
includeDocumentStructure: true,
|
||||
};
|
||||
|
||||
// extractBytes is asynchronous; without `await`, `result` would be a Promise and
// `result.document` below would always be undefined.
const result = await extractBytes(fileBuffer, "application/pdf", config);
|
||||
|
||||
if (result.document) {
|
||||
for (const node of result.document.nodes) {
|
||||
console.log(`[${node.content.nodeType}]`);
|
||||
}
|
||||
}
|
||||
```
|
||||
35
docs/snippets/wasm/config/element_based_output.md
Normal file
35
docs/snippets/wasm/config/element_based_output.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```typescript title="Element-Based Output (WASM)"
|
||||
import { extractFileSync, ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
// Configure element-based output
|
||||
const config: ExtractionConfig = {
|
||||
outputFormat: "element_based",
|
||||
};
|
||||
|
||||
// Extract document
|
||||
const result = extractFileSync(fileBuffer, "application/pdf", config);
|
||||
|
||||
// Access elements
|
||||
for (const element of result.elements) {
|
||||
console.log(`Type: ${element.elementType}`);
|
||||
console.log(`Text: ${element.text.slice(0, 100)}`);
|
||||
|
||||
if (element.metadata.pageNumber) {
|
||||
console.log(`Page: ${element.metadata.pageNumber}`);
|
||||
}
|
||||
|
||||
if (element.metadata.coordinates) {
|
||||
const coords = element.metadata.coordinates;
|
||||
console.log(`Coords: (${coords.left}, ${coords.top}) - (${coords.right}, ${coords.bottom})`);
|
||||
}
|
||||
|
||||
console.log("---");
|
||||
}
|
||||
|
||||
// Filter by element type
|
||||
const titles = result.elements.filter((e) => e.elementType === "title");
|
||||
for (const title of titles) {
|
||||
const level = title.metadata.additional?.level || "unknown";
|
||||
console.log(`[${level}] ${title.text}`);
|
||||
}
|
||||
```
|
||||
105
docs/snippets/wasm/config/hierarchy_config.ts
Normal file
105
docs/snippets/wasm/config/hierarchy_config.ts
Normal file
@@ -0,0 +1,105 @@
|
||||
import {
|
||||
type ExtractionConfig,
|
||||
type HierarchyConfig,
|
||||
Kreuzberg,
|
||||
type PdfConfig,
|
||||
} from "kreuzberg-wasm";
|
||||
|
||||
// Example 1: Basic hierarchy extraction.
// Hierarchy is enabled with kClusters=6 (the default, for a standard H1-H6 heading
// hierarchy) and bounding boxes are included for spatial layout awareness.
// The settings are layered: hierarchy -> PDF options -> extraction config.
const hierarchyConfigBasic: HierarchyConfig = {
  enabled: true,
  // Default: creates 6 font size clusters (H1-H6 structure)
  kClusters: 6,
  // Include bounding box coordinates
  includeBbox: true,
  // No OCR coverage threshold
  ocrCoverageThreshold: undefined,
};

const pdfConfigBasic: PdfConfig = { hierarchy: hierarchyConfigBasic };

const extractionConfigBasic: ExtractionConfig = { pdfOptions: pdfConfigBasic };
|
||||
|
||||
// const kreuzberg = new Kreuzberg(extractionConfigBasic);
|
||||
// const result = await kreuzberg.extractFile("document.pdf");
|
||||
|
||||
// Example 2: Custom kClusters for minimal structure.
// Three clusters give a simpler hierarchy, useful when only major section
// divisions are needed (Main, Subsection, Detail).
const hierarchyConfigMinimal: HierarchyConfig = {
  enabled: true,
  // Minimal clustering: just 3 levels
  kClusters: 3,
  includeBbox: true,
  ocrCoverageThreshold: undefined,
};

const pdfConfigMinimal: PdfConfig = { hierarchy: hierarchyConfigMinimal };

const _extractionConfigMinimal: ExtractionConfig = { pdfOptions: pdfConfigMinimal };
|
||||
|
||||
// const result = await kreuzberg.extractFile("document.pdf");
|
||||
|
||||
// Example 3: With OCR coverage threshold.
// Sets ocrCoverageThreshold to 0.5 so OCR is triggered when coverage drops below
// 50%. Useful for documents with mixed digital and scanned content.
const hierarchyConfigOcr: HierarchyConfig = {
  enabled: true,
  kClusters: 6,
  includeBbox: true,
  // Trigger OCR if text coverage < 50%
  ocrCoverageThreshold: 0.5,
};

const pdfConfigOcr: PdfConfig = { hierarchy: hierarchyConfigOcr };

const _extractionConfigOcr: ExtractionConfig = { pdfOptions: pdfConfigOcr };
|
||||
|
||||
// const result = await kreuzberg.extractFile("document.pdf");
|
||||
|
||||
/**
 * Integration with a Kreuzberg WASM instance: builds a `Kreuzberg` from the basic
 * hierarchy config, extracts "document.pdf" and logs the result. Extraction errors
 * are caught and logged rather than rethrown.
 */
async function _extractWithHierarchy(): Promise<void> {
  const kreuzberg = new Kreuzberg(extractionConfigBasic);

  try {
    // Extract from file (requires file input or fetch)
    const extracted = await kreuzberg.extractFile("document.pdf");
    console.log("Extraction complete:", extracted);
  } catch (error) {
    console.error("Extraction failed:", error);
  }
}
|
||||
|
||||
// Field descriptions:
|
||||
//
|
||||
// enabled: boolean (default: true)
|
||||
// - Enable or disable hierarchy extraction
|
||||
// - When false, hierarchy structure is not analyzed
|
||||
//
|
||||
// kClusters: number (default: 6, valid: 1-7)
|
||||
// - Number of font size clusters for hierarchy levels
|
||||
// - 6 provides H1-H6 heading levels with body text
|
||||
// - Higher values create more fine-grained hierarchy
|
||||
// - Lower values create simpler structure
|
||||
//
|
||||
// includeBbox: boolean (default: true)
|
||||
// - Include bounding box coordinates in hierarchy blocks
|
||||
// - Required for spatial layout awareness and document structure
|
||||
// - Set to false only if space optimization is critical
|
||||
//
|
||||
// ocrCoverageThreshold: number | undefined (default: undefined)
|
||||
// - Range: 0.0 to 1.0
|
||||
// - Triggers OCR when text block coverage falls below this fraction
|
||||
// - Example: 0.5 means "run OCR if less than 50% of page has text data"
|
||||
// - undefined means no OCR coverage-based triggering
|
||||
//
|
||||
17
docs/snippets/wasm/config/html_output.md
Normal file
17
docs/snippets/wasm/config/html_output.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
output_format: "html",
|
||||
html_output: {
|
||||
theme: "github",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(result.content); // HTML with kb-* classes
|
||||
```
|
||||
31
docs/snippets/wasm/config/image-config.ts
Normal file
31
docs/snippets/wasm/config/image-config.ts
Normal file
@@ -0,0 +1,31 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Fetches "document.pdf", extracts it with image options (300 target DPI,
 * max dimension 2048, aspect ratio preserved) and logs one line per extracted image.
 */
async function extractImagesWithConfig() {
  await initWasm();

  const response = await fetch("document.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    images: {
      extractImages: true,
      targetDpi: 300,
      maxDimension: 2048,
      preserveAspectRatio: true,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  // `images` may be absent on the result, so guard before reading it.
  if (result.images) {
    console.log(`Extracted ${result.images.length} images`);

    for (const image of result.images) {
      // The value printed is image.description, so label it as such (it was
      // previously mislabelled "DPI").
      console.log(
        `Image: ${image.width}x${image.height}, Format: ${image.format}, Description: ${image.description}`,
      );
    }
  }
}
|
||||
|
||||
extractImagesWithConfig().catch(console.error);
|
||||
65
docs/snippets/wasm/config/keyword_config.ts
Normal file
65
docs/snippets/wasm/config/keyword_config.ts
Normal file
@@ -0,0 +1,65 @@
|
||||
import { type ExtractionConfig, extractFile } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Example 1: Basic YAKE configuration.
 *
 * Selects the "yake" algorithm with language "en" and no algorithm-specific
 * parameter overrides, extracts "document.pdf" and logs the resulting keywords.
 */
async function basicYake(): Promise<void> {
  const config: ExtractionConfig = {
    keywords: {
      algorithm: "yake",
      maxKeywords: 10,
      minScore: 0.0,
      ngramRange: [1, 3],
      language: "en",
      // No per-algorithm overrides in the basic example.
      yakeParams: null,
      rakeParams: null,
    },
  };

  const { keywords } = await extractFile("document.pdf", null, config);
  console.log("Keywords:", keywords);
}
|
||||
|
||||
/**
 * Example 2: Advanced YAKE with custom parameters.
 *
 * Same flow as the basic example, but passes `yakeParams.windowSize` to tune
 * YAKE's co-occurrence window, and uses a narrower n-gram range and a minimum score.
 */
async function _advancedYake(): Promise<void> {
  const config: ExtractionConfig = {
    keywords: {
      algorithm: "yake",
      maxKeywords: 15,
      minScore: 0.1,
      ngramRange: [1, 2],
      language: "en",
      yakeParams: { windowSize: 1 },
      rakeParams: null,
    },
  };

  const { keywords } = await extractFile("document.pdf", null, config);
  console.log("Keywords:", keywords);
}
|
||||
|
||||
/**
 * Example 3: RAKE configuration.
 *
 * Selects the "rake" algorithm and constrains phrases through `rakeParams`
 * (minimum word length and maximum words per phrase), then logs the keywords
 * extracted from "document.pdf".
 */
async function _rakeConfig(): Promise<void> {
  const config: ExtractionConfig = {
    keywords: {
      algorithm: "rake",
      maxKeywords: 10,
      minScore: 5.0,
      ngramRange: [1, 3],
      language: "en",
      yakeParams: null,
      rakeParams: { minWordLength: 1, maxWordsPerPhrase: 3 },
    },
  };

  const { keywords } = await extractFile("document.pdf", null, config);
  console.log("Keywords:", keywords);
}
|
||||
|
||||
// Run the basic example; surface failures instead of leaving a floating promise.
basicYake().catch(console.error);
|
||||
20
docs/snippets/wasm/config/keyword_extraction_config.md
Normal file
20
docs/snippets/wasm/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
keywords: {
|
||||
algorithm: "yake",
|
||||
max_keywords: 10,
|
||||
min_score: 0.1,
|
||||
ngram_range: [1, 3],
|
||||
language: "en",
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Keywords: ${JSON.stringify(result.keywords)}`);
|
||||
```
|
||||
19
docs/snippets/wasm/config/language_detection_config.md
Normal file
19
docs/snippets/wasm/config/language_detection_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
language_detection: {
|
||||
enabled: true,
|
||||
min_confidence: 0.8,
|
||||
detect_multiple: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Detected language: ${result.language}`);
|
||||
console.log(`Confidence: ${result.language_confidence}`);
|
||||
```
|
||||
26
docs/snippets/wasm/config/ocr-config.ts
Normal file
26
docs/snippets/wasm/config/ocr-config.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Fetches "scanned.pdf", extracts it with the "tesseract-wasm" OCR backend
 * (language "eng") and logs the extracted text, plus detected languages when
 * the result carries them.
 */
async function extractWithOcr() {
  await initWasm();

  const response = await fetch("scanned.pdf");
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    ocr: { backend: "tesseract-wasm", language: "eng" },
  };

  const { content, detectedLanguages } = await extractBytes(bytes, "application/pdf", config);

  console.log("Extracted text from scanned document:");
  console.log(content);

  if (detectedLanguages) {
    console.log("Detected languages:", detectedLanguages);
  }
}
|
||||
|
||||
extractWithOcr().catch(console.error);
|
||||
21
docs/snippets/wasm/config/ocr_dpi_config.md
Normal file
21
docs/snippets/wasm/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
images: {
|
||||
extract_images: true,
|
||||
target_dpi: 300,
|
||||
max_image_dimension: 4096,
|
||||
auto_adjust_dpi: true,
|
||||
min_dpi: 150,
|
||||
max_dpi: 600,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Extracted images: ${result.images?.length || 0}`);
|
||||
```
|
||||
20
docs/snippets/wasm/config/pdf_config.md
Normal file
20
docs/snippets/wasm/config/pdf_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("encrypted.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
pdf_options: {
|
||||
extract_images: true,
|
||||
passwords: ["password123"],
|
||||
extract_metadata: true,
|
||||
hierarchy: {},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Title: ${result.metadata?.title}`);
|
||||
console.log(`Authors: ${result.metadata?.authors}`);
|
||||
```
|
||||
22
docs/snippets/wasm/config/pdf_hierarchy_config.md
Normal file
22
docs/snippets/wasm/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
pdf_options: {
|
||||
hierarchy: {
|
||||
enabled: true,
|
||||
detection_threshold: 0.75,
|
||||
ocr_coverage_threshold: 0.8,
|
||||
min_level: 1,
|
||||
max_level: 5,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Hierarchy levels: ${result.hierarchy?.length || 0}`);
|
||||
```
|
||||
17
docs/snippets/wasm/config/postprocessor_config.md
Normal file
17
docs/snippets/wasm/config/postprocessor_config.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
postprocessor: {
|
||||
enabled: true,
|
||||
enabled_processors: ["whitespace_normalizer", "unicode_normalizer"],
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Processed content: ${result.content}`);
|
||||
```
|
||||
16
docs/snippets/wasm/config/quality_processing_config.md
Normal file
16
docs/snippets/wasm/config/quality_processing_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
enable_quality_processing: true,
|
||||
use_cache: true,
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Quality score: ${result.quality_score}`);
|
||||
console.log(`Processing time: ${result.processing_time}`);
|
||||
```
|
||||
21
docs/snippets/wasm/config/tesseract_config.md
Normal file
21
docs/snippets/wasm/config/tesseract_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("scanned.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
ocr: {
|
||||
backend: "tesseract",
|
||||
language: "eng+deu",
|
||||
tesseract_config: {
|
||||
psm: 6,
|
||||
oem: 3,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`OCR text: ${result.content}`);
|
||||
```
|
||||
18
docs/snippets/wasm/config/token_reduction_config.md
Normal file
18
docs/snippets/wasm/config/token_reduction_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
|
||||
await init();
|
||||
|
||||
const data = new Uint8Array(await fetch("document.pdf").then((r) => r.arrayBuffer()));
|
||||
|
||||
const config = {
|
||||
token_reduction: {
|
||||
mode: "moderate",
|
||||
preserve_important_words: true,
|
||||
},
|
||||
};
|
||||
|
||||
const result = await extractBytes(data, "application/pdf", config);
|
||||
console.log(`Original tokens: ${result.token_count}`);
|
||||
console.log(`Reduced content: ${result.content}`);
|
||||
```
|
||||
Reference in New Issue
Block a user