This commit is contained in:
27
docs/snippets/python/llm/structured_extraction.md
Normal file
27
docs/snippets/python/llm/structured_extraction.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig, StructuredExtractionConfig, LlmConfig
|
||||
|
||||
async def main() -> None:
    """Extract structured fields from ``paper.pdf`` and print them.

    Builds a schema requiring ``title``, ``authors`` and ``date``, passes it
    with an LLM config to the structured-extraction settings, runs
    ``extract_file`` and prints ``structured_output`` from the result.
    """
    # Schema dict handed to StructuredExtractionConfig via ``schema=``.
    document_schema = {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "authors": {"type": "array", "items": {"type": "string"}},
            "date": {"type": "string"},
        },
        "required": ["title", "authors", "date"],
        "additionalProperties": False,
    }
    structured = StructuredExtractionConfig(
        schema=document_schema,
        llm=LlmConfig(model="openai/gpt-4o-mini"),
        strict=True,
    )
    extraction = await extract_file(
        "paper.pdf",
        config=ExtractionConfig(structured_extraction=structured),
    )
    print(extraction.structured_output)
    # {"title": "...", "authors": ["..."], "date": "..."}
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
16
docs/snippets/python/llm/vlm_embeddings.md
Normal file
16
docs/snippets/python/llm/vlm_embeddings.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import embed, EmbeddingConfig, EmbeddingModelType, LlmConfig
|
||||
|
||||
async def main() -> None:
    """Embed one string with an LLM-backed model and print the vector length.

    Wraps an ``LlmConfig`` in ``EmbeddingModelType.llm``, requests normalized
    output, awaits ``embed`` on a one-element list and prints the length of
    the first returned embedding.
    """
    llm_model = EmbeddingModelType.llm(
        LlmConfig(model="openai/text-embedding-3-small")
    )
    embedding_config = EmbeddingConfig(model=llm_model, normalize=True)
    vectors = await embed(["Hello world"], config=embedding_config)
    print(len(vectors[0]))  # 1536
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
17
docs/snippets/python/llm/vlm_ocr.md
Normal file
17
docs/snippets/python/llm/vlm_ocr.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig, OcrConfig, LlmConfig
|
||||
|
||||
async def main() -> None:
    """Run forced OCR on ``scan.pdf`` with the ``vlm`` backend and print the text.

    Configures ``OcrConfig`` with ``backend="vlm"`` and an ``LlmConfig``,
    sets ``force_ocr=True`` on the extraction config, awaits ``extract_file``
    and prints ``content`` from the result.
    """
    vlm_ocr = OcrConfig(
        backend="vlm",
        vlm_config=LlmConfig(model="openai/gpt-4o-mini"),
    )
    ocr_config = ExtractionConfig(force_ocr=True, ocr=vlm_ocr)
    extraction = await extract_file("scan.pdf", config=ocr_config)
    print(extraction.content)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
Reference in New Issue
Block a user