Files
fil/docs/snippets/python/advanced/vector_database_integration.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

1.5 KiB

import asyncio
from dataclasses import dataclass, field
from kreuzberg import (
    ExtractionConfig,
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
    extract_file,
)


@dataclass
class VectorRecord:
    """One chunk of text together with its embedding vector and metadata.

    Instances are plain value objects; ``metadata`` defaults to a fresh
    empty dict for every record, so records never share the same mapping.
    """

    # Identifier string for this record.
    id: str
    # Text of the chunk.
    content: str
    # Embedding vector for ``content``.
    embedding: list[float]
    # Free-form string-to-string annotations attached to the record.
    metadata: dict[str, str] = field(default_factory=dict)


async def extract_and_vectorize(
    document_path: str,
    document_id: str,
) -> list[VectorRecord]:
    """Extract a document and turn its embedded chunks into vector records.

    The file at ``document_path`` is passed to ``extract_file`` with a
    configuration that enables chunking (``max_characters=512``,
    ``overlap=50``) and embeddings (the ``"balanced"`` preset,
    ``normalize=True``, ``batch_size=32``).

    Args:
        document_path: Path of the document handed to ``extract_file``.
        document_id: Caller-chosen identifier; used as the prefix of every
            record id and stored in each record's metadata.

    Returns:
        One ``VectorRecord`` per chunk that carries an embedding, in chunk
        order. The list is empty when the result has no chunks.
    """
    embedding_settings = EmbeddingConfig(
        model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
        normalize=True,
        batch_size=32,
    )
    chunking_settings = ChunkingConfig(
        max_characters=512,
        overlap=50,
        embedding=embedding_settings,
    )
    extraction = await extract_file(
        document_path,
        config=ExtractionConfig(chunking=chunking_settings),
    )

    # Positions are assigned before filtering, so a chunk without an
    # embedding still consumes its index and ids keep matching the chunk's
    # position in the extraction result.
    return [
        VectorRecord(
            id=f"{document_id}_chunk_{position}",
            content=piece.content,
            embedding=piece.embedding,
            metadata={
                "document_id": document_id,
                "chunk_index": str(position),
                "content_length": str(len(piece.content)),
            },
        )
        for position, piece in enumerate(extraction.chunks or [])
        if piece.embedding is not None
    ]


# Example invocation: drive the coroutine to completion with asyncio.run.
# The returned records are not used here.
asyncio.run(
    extract_and_vectorize(document_path="document.pdf", document_id="doc_001")
)