docs/snippets/python/advanced/vector_database_integration.md

```python title="Python"
import asyncio
from dataclasses import dataclass, field
from kreuzberg import (
    ExtractionConfig,
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
    extract_file,
)


@dataclass
class VectorRecord:
    """One embedded chunk, shaped for insertion into a vector store.

    Attributes:
        id: Identifier string for the record.
        content: Text of the chunk the embedding was computed from.
        embedding: The chunk's embedding vector as a list of floats.
        metadata: String-to-string annotations attached to the record;
            every instance gets its own empty dict when none is supplied.
    """

    id: str
    content: str
    embedding: list[float]
    # default_factory gives each record a fresh dict instead of a shared one.
    metadata: dict[str, str] = field(default_factory=dict)


async def extract_and_vectorize(
    document_path: str,
    document_id: str,
) -> list[VectorRecord]:
    """Extract a document and turn its embedded chunks into vector records.

    The document is extracted with chunking enabled (512 characters per
    chunk, overlap of 50) and embeddings generated by the "balanced" preset
    with normalization on and a batch size of 32.

    Args:
        document_path: Path of the document passed to ``extract_file``.
        document_id: Identifier used as the prefix of every record id and
            stored in each record's metadata.

    Returns:
        One ``VectorRecord`` per chunk that carries an embedding. Record ids
        use the chunk's position in the extraction result, so ids can have
        gaps when chunks without an embedding are skipped. The list is empty
        when the result has no chunks.
    """
    # Build the configuration bottom-up so each layer is named on its own.
    embedding_settings = EmbeddingConfig(
        model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
        normalize=True,
        batch_size=32,
    )
    chunk_settings = ChunkingConfig(
        max_characters=512,
        overlap=50,
        embedding=embedding_settings,
    )
    settings: ExtractionConfig = ExtractionConfig(chunking=chunk_settings)

    extraction = await extract_file(document_path, config=settings)

    # `chunks` may be None/empty; chunks lacking an embedding are left out.
    return [
        VectorRecord(
            id=f"{document_id}_chunk_{position}",
            content=piece.content,
            embedding=piece.embedding,
            metadata={
                "document_id": document_id,
                "chunk_index": str(position),
                "content_length": str(len(piece.content)),
            },
        )
        for position, piece in enumerate(extraction.chunks or [])
        if piece.embedding is not None
    ]


# Drive the coroutine to completion; the returned records are not kept here.
asyncio.run(
    extract_and_vectorize(document_path="document.pdf", document_id="doc_001")
)
```
Nomad changes 2026-06-01 23:40:55 +02:00

```python title="Python"
import asyncio
from dataclasses import dataclass, field
from kreuzberg import (
    ExtractionConfig,
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
    extract_file,
)


@dataclass
class VectorRecord:
    id: str
    content: str
    embedding: list[float]
    metadata: dict[str, str] = field(default_factory=dict)


async def extract_and_vectorize(
    document_path: str,
    document_id: str,
) -> list[VectorRecord]:
    config: ExtractionConfig = ExtractionConfig(
        chunking=ChunkingConfig(
            max_characters=512,
            overlap=50,
            embedding=EmbeddingConfig(
                model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
                normalize=True,
                batch_size=32,
            ),
        )
    )

    result = await extract_file(document_path, config=config)

    records: list[VectorRecord] = []
    for index, chunk in enumerate(result.chunks or []):
        if chunk.embedding is None:
            continue
        records.append(
            VectorRecord(
                id=f"{document_id}_chunk_{index}",
                content=chunk.content,
                embedding=chunk.embedding,
                metadata={
                    "document_id": document_id,
                    "chunk_index": str(index),
                    "content_length": str(len(chunk.content)),
                },
            )
        )
    return records


asyncio.run(extract_and_vectorize("document.pdf", "doc_001"))
```