This commit is contained in:
25
docs/snippets/wasm/metadata/extract-metadata.ts
Normal file
25
docs/snippets/wasm/metadata/extract-metadata.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Fetches `document.pdf`, extracts it as a PDF with the WASM extractor and
 * logs the document-level metadata fields to the console.
 *
 * @returns The `metadata` object of the extraction result.
 * @throws Error when the HTTP request for the document is not successful.
 */
async function getDocumentMetadata() {
  // Initialise the WASM module before calling the extraction API.
  await initWasm();

  const response = await fetch("document.pdf");
  // fetch() only rejects on network failure. Without this check an HTTP error
  // (404, 500, ...) would hand the error page's bytes to the PDF extractor.
  if (!response.ok) {
    throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
  }
  const bytes = new Uint8Array(await response.arrayBuffer());

  const result = await extractBytes(bytes, "application/pdf");
  const metadata = result.metadata;

  console.log("Document Metadata:");
  console.log("Title:", metadata.title);
  console.log("Author:", metadata.author);
  console.log("Creator:", metadata.creator);
  console.log("Subject:", metadata.subject);
  console.log("Keywords:", metadata.keywords);
  console.log("Pages:", metadata.pageCount);
  console.log("Created:", metadata.createdAt);
  console.log("Modified:", metadata.modifiedAt);

  return metadata;
}
|
||||
|
||||
// Run the example; any rejection is reported on the console.
getDocumentMetadata().catch((error) => console.error(error));
|
||||
35
docs/snippets/wasm/metadata/filter-metadata.ts
Normal file
35
docs/snippets/wasm/metadata/filter-metadata.ts
Normal file
@@ -0,0 +1,35 @@
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/** Condensed, per-file view of an extraction result. */
interface DocumentSummary {
  /** Name of the file the summary was built from. */
  fileName: string;
  /** Document title, when the metadata provides one. */
  title: string | undefined;
  /** Document author, when the metadata provides one. */
  author: string | undefined;
  /** Number of pages, when the metadata provides it. */
  pageCount: number | undefined;
  /** Language label for the document. */
  language: string;
}
|
||||
|
||||
/**
 * Fetches every file, extracts it as a PDF and reduces the result to a
 * `DocumentSummary`. Files are processed one at a time, in the order given.
 *
 * @param files Locations passed to `fetch`, one per document.
 * @returns One summary per input file, in input order.
 * @throws Error when a file cannot be fetched successfully; the message names
 *   the offending file.
 */
async function filterAndSummarizeMetadata(files: string[]): Promise<DocumentSummary[]> {
  // Initialise the WASM module before calling the extraction API.
  await initWasm();

  const summaries: DocumentSummary[] = [];

  for (const fileName of files) {
    const response = await fetch(fileName);
    // fetch() resolves on HTTP errors too; fail loudly instead of extracting
    // an error page as if it were the document.
    if (!response.ok) {
      throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
    }
    const bytes = new Uint8Array(await response.arrayBuffer());

    const result = await extractBytes(bytes, "application/pdf");

    summaries.push({
      fileName,
      title: result.metadata.title,
      author: result.metadata.author,
      pageCount: result.metadata.pageCount,
      // Fall back to "unknown" when no language was detected.
      language: result.detectedLanguages?.[0] ?? "unknown",
    });
  }

  return summaries;
}
|
||||
|
||||
// Summarise three sample PDFs and print the rows as a table; failures are
// reported on the console.
filterAndSummarizeMetadata(["doc1.pdf", "doc2.pdf", "doc3.pdf"])
  .then((rows) => console.table(rows))
  .catch((error) => console.error(error));
|
||||
34
docs/snippets/wasm/metadata/image-metadata.ts
Normal file
34
docs/snippets/wasm/metadata/image-metadata.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Fetches `document.pdf`, extracts it with image extraction enabled and logs
 * the descriptive fields of every extracted image.
 *
 * @throws Error when the HTTP request for the document is not successful.
 */
async function extractImageMetadata() {
  // Initialise the WASM module before calling the extraction API.
  await initWasm();

  const response = await fetch("document.pdf");
  // fetch() resolves on HTTP errors too; do not pass an error page to the
  // extractor as if it were a PDF.
  if (!response.ok) {
    throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
  }
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    images: {
      extractImages: true,
      targetDpi: 150,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  // Nothing to report when the result carries no image list.
  if (!result.images) {
    return;
  }

  result.images.forEach((image, index) => {
    console.log(`Image ${index}:`, {
      format: image.format,
      width: image.width,
      height: image.height,
      pageNumber: image.pageNumber,
      colorspace: image.colorspace,
      bitsPerComponent: image.bitsPerComponent,
      isMask: image.isMask,
      // Log only the payload size, not the image bytes themselves.
      dataSize: image.data.byteLength,
    });
  });
}
|
||||
|
||||
// Run the example; any rejection is reported on the console.
extractImageMetadata().catch((error) => console.error(error));
|
||||
26
docs/snippets/wasm/metadata/language_detection.md
Normal file
26
docs/snippets/wasm/metadata/language_detection.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
import { LanguageDetectionConfig, ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
// Initialise the WASM module before calling the extraction API.
await init();

const fileBuffer = new Uint8Array(/* your file bytes */);
const mimeType = "text/plain";

// Enable language detection with en/de/fr as the target languages.
const languageDetection = new LanguageDetectionConfig({
  enable_detection: true,
  target_languages: ["en", "de", "fr"],
});
const config = new ExtractionConfig({ language_detection: languageDetection });

const result = await extractBytes(fileBuffer, mimeType, config);

// Only report when the result actually carries a language list.
const detected = result.detected_languages;
if (detected) {
  console.log("Detected languages:", detected);

  for (const language of detected) {
    console.log(`Language: ${language}`);
  }
}
|
||||
```
|
||||
@@ -0,0 +1,35 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
import { LanguageDetectionConfig, ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
// Initialise the WASM module before calling the extraction API.
await init();

const fileBuffer = new Uint8Array(/* your file bytes */);
const mimeType = "text/plain";

// Enable detection for seven target languages with a 0.5 confidence threshold.
const languageDetection = new LanguageDetectionConfig({
  enable_detection: true,
  target_languages: ["en", "de", "fr", "es", "it", "ja", "zh"],
  confidence_threshold: 0.5,
});
const config = new ExtractionConfig({ language_detection: languageDetection });

const result = await extractBytes(fileBuffer, mimeType, config);

const languages = result.detected_languages;
if (!languages || languages.length === 0) {
  console.log("No languages detected");
} else {
  console.log("Document languages:", languages.join(", "));

  // Process multi-language content
  for (const lang of languages) {
    console.log(`Language detected: ${lang}`);
  }

  // Access metadata for language info
  const metadataLanguage = result.metadata?.language;
  if (metadataLanguage) {
    console.log(`Primary metadata language: ${metadataLanguage}`);
  }
}
|
||||
```
|
||||
33
docs/snippets/wasm/metadata/metadata-with-chunks.ts
Normal file
33
docs/snippets/wasm/metadata/metadata-with-chunks.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
import type { ExtractionConfig } from "@kreuzberg/wasm";
|
||||
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
/**
 * Fetches `document.pdf`, extracts it with chunking enabled and logs the
 * document metadata followed by the positional metadata of every chunk.
 *
 * @throws Error when the HTTP request for the document is not successful.
 */
async function extractWithChunkMetadata() {
  // Initialise the WASM module before calling the extraction API.
  await initWasm();

  const response = await fetch("document.pdf");
  // fetch() resolves on HTTP errors too; do not pass an error page to the
  // extractor as if it were a PDF.
  if (!response.ok) {
    throw new Error(`Failed to fetch document.pdf: ${response.status} ${response.statusText}`);
  }
  const bytes = new Uint8Array(await response.arrayBuffer());

  const config: ExtractionConfig = {
    chunking: {
      maxChars: 500,
      chunkOverlap: 50,
    },
  };

  const result = await extractBytes(bytes, "application/pdf", config);

  console.log("Document Metadata:", result.metadata);

  // Nothing more to report when the result carries no chunk list.
  if (!result.chunks) {
    return;
  }

  result.chunks.forEach((chunk) => {
    const { charStart, charEnd, chunkIndex, totalChunks, tokenCount } = chunk.metadata;
    console.log("Chunk Metadata:", {
      charStart,
      charEnd,
      index: chunkIndex,
      total: totalChunks,
      tokens: tokenCount,
    });
  });
}
|
||||
|
||||
// Run the example; any rejection is reported on the console.
extractWithChunkMetadata().catch((error) => console.error(error));
|
||||
85
docs/snippets/wasm/metadata/metadata.md
Normal file
85
docs/snippets/wasm/metadata/metadata.md
Normal file
@@ -0,0 +1,85 @@
|
||||
```typescript title="WASM"
|
||||
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
|
||||
|
||||
// Initialise the WASM module before calling the extraction API.
await initWasm();

// getElementById returns null when the element is missing. Keep that in the
// type so the optional chain below guards it instead of throwing a TypeError.
const fileInput = document.getElementById("file") as HTMLInputElement | null;
const file = fileInput?.files?.[0];

if (file) {
  const result = await extractFromFile(file);
  console.log(`Metadata: ${JSON.stringify(result.metadata)}`);

  // Access common metadata fields
  if (result.metadata.title) {
    console.log(`Title: ${result.metadata.title}`);
  }

  // Access format-specific metadata
  const metadata = result.metadata;

  // For HTML files
  if (metadata.html) {
    const htmlMeta = metadata.html;
    console.log(`HTML Title: ${htmlMeta.title}`);
    console.log(`Description: ${htmlMeta.description}`);

    // Access keywords as array
    if (htmlMeta.keywords && htmlMeta.keywords.length > 0) {
      console.log(`Keywords: ${htmlMeta.keywords.join(", ")}`);
    }

    // Access canonical URL
    if (htmlMeta.canonical_url) {
      console.log(`Canonical URL: ${htmlMeta.canonical_url}`);
    }

    // Access Open Graph fields
    if (htmlMeta.open_graph) {
      if (htmlMeta.open_graph["title"]) {
        console.log(`OG Title: ${htmlMeta.open_graph["title"]}`);
      }
      if (htmlMeta.open_graph["image"]) {
        console.log(`OG Image: ${htmlMeta.open_graph["image"]}`);
      }
    }

    // Access Twitter Card fields
    if (htmlMeta.twitter_card && htmlMeta.twitter_card["card"]) {
      console.log(`Twitter Card Type: ${htmlMeta.twitter_card["card"]}`);
    }

    // NOTE(review): the `any` annotations below are kept from the original
    // because the element types are not visible here; replace them with the
    // library's own types once confirmed.

    // Access headers
    if (htmlMeta.headers && htmlMeta.headers.length > 0) {
      console.log(`Headers: ${htmlMeta.headers.map((h: any) => h.text).join(", ")}`);
    }

    // Access links
    if (htmlMeta.links && htmlMeta.links.length > 0) {
      htmlMeta.links.forEach((link: any) => {
        console.log(`Link: ${link.href} (${link.text})`);
      });
    }

    // Access images
    if (htmlMeta.images && htmlMeta.images.length > 0) {
      htmlMeta.images.forEach((image: any) => {
        console.log(`Image: ${image.src}`);
      });
    }

    // Access structured data
    if (htmlMeta.structured_data && htmlMeta.structured_data.length > 0) {
      console.log(`Structured data items: ${htmlMeta.structured_data.length}`);
    }
  }

  // PDF-specific fields are at the top level of metadata
  // Compare against null/undefined so that a page count of 0 is still reported.
  if (metadata.pageCount != null) {
    console.log(`Pages: ${metadata.pageCount}`);
  }
  if (metadata.authors && metadata.authors.length > 0) {
    console.log(`Authors: ${metadata.authors.join(", ")}`);
  }
}
|
||||
```
|
||||
36
docs/snippets/wasm/metadata/page_boundaries.md
Normal file
36
docs/snippets/wasm/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
import { PageConfig, ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
// Initialise the WASM module before calling the extraction API.
await init();

const fileBuffer = new Uint8Array(/* your file bytes */);
const mimeType = "application/pdf";

const config = new ExtractionConfig({
  pages: new PageConfig({
    extract_pages: true,
  }),
});

const result = await extractBytes(fileBuffer, mimeType, config);

if (result.metadata && result.metadata.pages) {
  const pageStructure = result.metadata.pages;
  console.log(`Total pages: ${pageStructure.total_count}`);

  if (pageStructure.boundaries) {
    // byte_start/byte_end are named as byte offsets, presumably into the UTF-8
    // encoding of result.content (TODO confirm against the API reference).
    // String.prototype.substring counts UTF-16 code units instead, so slicing
    // the string directly drifts as soon as the content is not pure ASCII.
    // Encode once, outside the loop, and slice the bytes.
    const contentBytes = new TextEncoder().encode(result.content);
    const decoder = new TextDecoder();
    const previewByteLimit = 100;

    // Iterate through page boundaries to map content to pages
    pageStructure.boundaries.forEach((boundary) => {
      const previewEnd = Math.min(boundary.byte_end, boundary.byte_start + previewByteLimit);
      // A preview cut in the middle of a multi-byte character decodes to
      // U+FFFD for that character, which is acceptable for a preview.
      const pageText = decoder.decode(contentBytes.subarray(boundary.byte_start, previewEnd));

      console.log(`Page ${boundary.page_number}:`);
      console.log(` Byte range: ${boundary.byte_start}-${boundary.byte_end}`);
      console.log(` Preview: ${pageText}...`);
    });
  }
}
|
||||
```
|
||||
38
docs/snippets/wasm/metadata/page_tracking_basic.md
Normal file
38
docs/snippets/wasm/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
import { PageConfig, ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
// Initialise the WASM module before calling the extraction API.
await init();

const fileBuffer = new Uint8Array(/* your file bytes */);
const mimeType = "application/pdf";

const config = new ExtractionConfig({
  pages: new PageConfig({
    extract_pages: true,
  }),
});

const result = await extractBytes(fileBuffer, mimeType, config);

if (result.pages) {
  console.log(`Total pages extracted: ${result.pages.length}`);

  result.pages.forEach((page) => {
    console.log(`Page ${page.pageNumber}:`);
    console.log(` Content length: ${page.content.length} chars`);
    console.log(` Tables: ${page.tables.length}`);
    console.log(` Images: ${page.images.length}`);

    // Check if page is blank
    if (page.isBlank) {
      console.log(" This page is blank");
    }

    // Access page hierarchy if available
    if (page.hierarchy) {
      // NOTE(review): hierarchy is presumably a structured value - confirm.
      // Interpolating an object directly prints "[object Object]", so
      // serialise it explicitly.
      console.log(` Hierarchy level: ${JSON.stringify(page.hierarchy)}`);
    }
  });
}
|
||||
```
|
||||
43
docs/snippets/wasm/metadata/tables.md
Normal file
43
docs/snippets/wasm/metadata/tables.md
Normal file
@@ -0,0 +1,43 @@
|
||||
```typescript title="WASM"
|
||||
import init, { extractBytes } from "kreuzberg-wasm";
|
||||
import { ExtractionConfig } from "kreuzberg-wasm";
|
||||
|
||||
// Initialise the WASM module before calling the extraction API.
await init();

const fileBuffer = new Uint8Array(/* your file bytes */);
const mimeType = "application/pdf";

const config = new ExtractionConfig({});

const result = await extractBytes(fileBuffer, mimeType, config);

if (result.tables && result.tables.length > 0) {
  console.log(`Found ${result.tables.length} tables`);

  result.tables.forEach((table, index) => {
    console.log(`\nTable ${index + 1}:`);
    console.log(` Page: ${table.pageNumber}`);
    console.log(` Markdown representation:`);
    console.log(table.markdown);

    // Access cell data
    const cells = table.cells;
    if (cells) {
      console.log(` Total cells: ${Object.keys(cells).length}`);

      // Iterate through cells (structure depends on how cells are serialized)
      // Object.entries yields each key together with its value, which avoids
      // indexing `cells` with a plain string key.
      for (const [rowKey, row] of Object.entries(cells)) {
        console.log(` Row ${rowKey}: ${JSON.stringify(row)}`);
      }
    }

    // Access bounding box if available
    if (table.boundingBox) {
      // Interpolating an object directly prints "[object Object]"; serialise
      // it so the coordinates are visible.
      console.log(` Bounding box: ${JSON.stringify(table.boundingBox)}`);
    }
  });
} else {
  console.log("No tables found in document");
}
|
||||
```
|
||||
Reference in New Issue
Block a user