Files
fil/docs/snippets/python/config/chunking_config.md

81 lines
2.4 KiB
Markdown
Raw Permalink Normal View History

2026-06-01 23:40:55 +02:00
```python title="Python"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with chunking enabled and print chunk sizes."""
    config: ExtractionConfig = ExtractionConfig(
        chunking=ChunkingConfig(
            max_chars=1000,
            max_overlap=200,
        )
    )
    result = await extract_file("document.pdf", config=config)

    # `or []` keeps the example safe when no chunk list is returned.
    chunks = result.chunks or []
    print(f"Chunks: {len(chunks)}")
    for chunk in chunks:
        print(f"Length: {len(chunk.content)}")


asyncio.run(main())
```
```python title="Python - Markdown with Heading Context"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Chunk ``document.md`` with the markdown chunker and print heading context.

    For every chunk, any headings found under the ``heading_context``
    metadata entry are printed first, followed by the first 100 characters
    of the chunk content.
    """
    config: ExtractionConfig = ExtractionConfig(
        chunking=ChunkingConfig(
            chunker_type="markdown",
            max_chars=500,
            max_overlap=50,
            sizing_type="tokenizer",
            sizing_model="Xenova/gpt-4o",
        )
    )
    result = await extract_file("document.md", config=config)

    for chunk in result.chunks or []:
        heading_context = chunk.metadata.get("heading_context")
        if heading_context:
            headings = heading_context.get("headings", [])
            for h in headings:
                print(f"Heading L{h['level']}: {h['text']}")
        # Printed for every chunk, whether or not it carries heading context.
        print(f"Content: {chunk.content[:100]}...")


asyncio.run(main())
```
```python title="Python - Semantic"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Chunk ``document.pdf`` with the semantic chunker and preview each chunk."""
    config: ExtractionConfig = ExtractionConfig(
        chunking=ChunkingConfig(chunker_type="semantic")
    )
    result = await extract_file("document.pdf", config=config)

    # `or []` keeps the loop safe when no chunk list is returned.
    for chunk in result.chunks or []:
        print(f"Content: {chunk.content[:100]}...")


asyncio.run(main())
```
```python title="Python - Prepend Heading Context"
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
    """Chunk ``document.md`` with ``prepend_heading_context`` enabled and preview chunks."""
    config: ExtractionConfig = ExtractionConfig(
        chunking=ChunkingConfig(
            chunker_type="markdown",
            max_chars=500,
            max_overlap=50,
            prepend_heading_context=True,
        )
    )
    result = await extract_file("document.md", config=config)

    for chunk in result.chunks or []:
        # Each chunk's content is prefixed with its heading breadcrumb
        print(f"Content: {chunk.content[:100]}...")


asyncio.run(main())
```