This commit is contained in:
35
docs/snippets/python/metadata/vector_database_integration.md
Normal file
35
docs/snippets/python/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig, EmbeddingConfig
|
||||
|
||||
# Build the extraction configuration from the inside out: embedding options
# are attached to the chunking options, which are attached to the
# top-level extraction config.
embedding_settings = EmbeddingConfig(
    preset="balanced",
    batch_size=32,
    normalize=True,
)
chunking_settings = ChunkingConfig(
    max_characters=512,
    overlap=50,
    embedding=embedding_settings,
)
config = ExtractionConfig(chunking=chunking_settings)

result = extract_file_sync("document.pdf", config=config)

# One record per chunk that carries an embedding. Chunks whose embedding is
# None are skipped, so "chunk_index" keeps the chunk's position in
# result.chunks rather than its position in `records`. A missing or empty
# result.chunks yields an empty list.
records: list[dict] = [
    {
        "id": f"document_chunk_{index}",
        "content": chunk.content,
        "embedding": chunk.embedding,
        "metadata": {
            "document_id": "document.pdf",
            "chunk_index": index,
            "content_length": len(chunk.content),
        },
    }
    for index, chunk in enumerate(result.chunks or [])
    if chunk.embedding is not None
]

print(f"Prepared {len(records)} vector records")
|
||||
```
|
||||
Reference in New Issue
Block a user