Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/python/advanced/vector_database_integration.md
+++ b/docs/snippets/python/advanced/vector_database_integration.md
@@ -0,0 +1,59 @@
+```python title="Python"
+import asyncio
+from dataclasses import dataclass, field
+from kreuzberg import (
+    ExtractionConfig,
+    ChunkingConfig,
+    EmbeddingConfig,
+    EmbeddingModelType,
+    extract_file,
+)
+
+
+@dataclass
+class VectorRecord:  # Plain record pairing one chunk's text with its embedding vector.
+    id: str  # In this snippet: f"{document_id}_chunk_{index}", set by extract_and_vectorize.
+    content: str  # Text of the chunk.
+    embedding: list[float]  # Embedding vector taken from the chunk.
+    metadata: dict[str, str] = field(default_factory=dict)  # default_factory gives each record its own dict.
+
+
+async def extract_and_vectorize(  # Extract one document and build a VectorRecord per embedded chunk.
+    document_path: str,  # Forwarded unchanged to extract_file.
+    document_id: str,  # Used as the record-id prefix and stored in each record's metadata.
+) -> list[VectorRecord]:  # Records in chunk order; empty if there are no chunks or none has an embedding.
+    config: ExtractionConfig = ExtractionConfig(
+        chunking=ChunkingConfig(
+            max_characters=512,  # NOTE(review): presumably the per-chunk size cap - confirm in kreuzberg docs.
+            overlap=50,  # NOTE(review): presumably text shared between neighbouring chunks - confirm.
+            embedding=EmbeddingConfig(
+                model=EmbeddingModelType({"type": "preset", "name": "balanced"}),  # Preset model named "balanced".
+                normalize=True,
+                batch_size=32,
+            ),
+        )
+    )
+
+    result = await extract_file(document_path, config=config)
+
+    records: list[VectorRecord] = []
+    for index, chunk in enumerate(result.chunks or []):  # `or []` covers a result whose chunks is None or empty.
+        if chunk.embedding is None:  # No vector for this chunk, so there is nothing to store.
+            continue  # index still advances, so ids keep the original chunk position (gaps are possible).
+        records.append(
+            VectorRecord(
+                id=f"{document_id}_chunk_{index}",
+                content=chunk.content,
+                embedding=chunk.embedding,
+                metadata={  # Values are stringified to fit VectorRecord's dict[str, str] metadata.
+                    "document_id": document_id,
+                    "chunk_index": str(index),
+                    "content_length": str(len(chunk.content)),  # len() of the chunk content, as a string.
+                },
+            )
+        )
+    return records
+
+
+asyncio.run(extract_and_vectorize("document.pdf", "doc_001"))  # Runs the coroutine to completion; the returned records are discarded here.
+```