This commit is contained in:
59
docs/snippets/python/advanced/vector_database_integration.md
Normal file
59
docs/snippets/python/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,59 @@
```python title="Python"
import asyncio
from dataclasses import dataclass, field

from kreuzberg import (
    ExtractionConfig,
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
    extract_file,
)


@dataclass
class VectorRecord:
    """A chunk's text paired with its embedding vector and string metadata."""

    # Record identifier.
    id: str
    # Text of the chunk.
    content: str
    # Embedding vector for the chunk.
    embedding: list[float]
    # String key/value pairs; default_factory gives each record its own dict.
    metadata: dict[str, str] = field(default_factory=dict)


async def extract_and_vectorize(
    document_path: str,
    document_id: str,
) -> list[VectorRecord]:
    """Extract a document with chunking and embeddings and build vector records.

    Args:
        document_path: Path handed to ``extract_file``.
        document_id: Prefix for each record id; also stored in the metadata.

    Returns:
        One ``VectorRecord`` per chunk that carries an embedding. Chunks whose
        ``embedding`` is ``None`` are skipped. The index in each id counts
        every chunk, skipped ones included, so ids may have gaps.
    """
    # Chunking and embedding are configured together: the embedding settings
    # are nested inside the chunking config.
    config: ExtractionConfig = ExtractionConfig(
        chunking=ChunkingConfig(
            max_characters=512,
            overlap=50,
            embedding=EmbeddingConfig(
                model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
                normalize=True,
                batch_size=32,
            ),
        ),
    )

    result = await extract_file(document_path, config=config)

    records: list[VectorRecord] = []
    # `result.chunks` may be None; treat that the same as "no chunks".
    for index, chunk in enumerate(result.chunks or []):
        if chunk.embedding is None:
            continue
        records.append(
            VectorRecord(
                id=f"{document_id}_chunk_{index}",
                content=chunk.content,
                embedding=chunk.embedding,
                # Metadata values are stringified to match dict[str, str].
                metadata={
                    "document_id": document_id,
                    "chunk_index": str(index),
                    "content_length": str(len(chunk.content)),
                },
            )
        )
    return records


# Example entry point: run the coroutine to completion for a sample PDF.
asyncio.run(extract_and_vectorize("document.pdf", "doc_001"))
```
Reference in New Issue
Block a user