This commit is contained in:
29
docs/snippets/python/advanced/advanced_config.md
Normal file
29
docs/snippets/python/advanced/advanced_config.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
extract_file_sync,
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
ChunkingConfig,
|
||||
TokenReductionConfig,
|
||||
LanguageDetectionConfig,
|
||||
)
|
||||
|
||||
# One ExtractionConfig that enables OCR, chunking, token reduction,
# language detection, caching and quality processing together.
config = ExtractionConfig(
    ocr=OcrConfig(backend="tesseract", language="eng+deu"),
    chunking=ChunkingConfig(max_characters=1000, overlap=100),
    token_reduction=TokenReductionConfig(mode="light"),
    language_detection=LanguageDetectionConfig(
        enabled=True, detect_multiple=True
    ),
    use_cache=True,
    enable_quality_processing=True,
)

result = extract_file_sync("document.pdf", config=config)

# Fall back to an empty list when result.chunks is falsy.
for chunk in result.chunks or []:
    print(f"Chunk: {chunk.content[:100]}")

# Only report languages when the result carries any.
if result.detected_languages:
    print(f"Languages: {result.detected_languages}")
|
||||
```
|
||||
18
docs/snippets/python/advanced/chunk_page_mapping.md
Normal file
18
docs/snippets/python/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig
|
||||
|
||||
# Chunk the document, then report which page(s) each chunk came from.
config = ExtractionConfig(chunking=ChunkingConfig(max_characters=500, overlap=50))
result = extract_file_sync("document.pdf", config=config)

for chunk in result.chunks or []:
    first, last = chunk.metadata.first_page, chunk.metadata.last_page
    # Chunks without a first page are skipped.
    if first is None:
        continue
    if first == last:
        page_range = f"Page {first}"
    else:
        page_range = f"Pages {first}-{last}"
    print(f"Chunk: {chunk.content[:50]}... ({page_range})")
|
||||
```
|
||||
59
docs/snippets/python/advanced/chunking_config.md
Normal file
59
docs/snippets/python/advanced/chunking_config.md
Normal file
@@ -0,0 +1,59 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
|
||||
async def main() -> None:
    """Extract document.pdf with chunking enabled and print each chunk's length."""
    chunking = ChunkingConfig(max_characters=1000, overlap=200)
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)
    result = await extract_file("document.pdf", config=config)

    # Fall back to an empty list when result.chunks is falsy.
    chunks = result.chunks or []
    for chunk in chunks:
        print(f"Length: {len(chunk.content)}")


asyncio.run(main())
|
||||
```
|
||||
|
||||
```python title="Python - Semantic"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
|
||||
async def main() -> None:
    """Extract document.pdf with the semantic chunker and preview each chunk."""
    chunking = ChunkingConfig(chunker_type="semantic")
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)
    result = await extract_file("document.pdf", config=config)

    chunks = result.chunks or []
    for chunk in chunks:
        # Print only the first 100 characters of each chunk.
        print(f"Content: {chunk.content[:100]}...")


asyncio.run(main())
|
||||
```
|
||||
|
||||
```python title="Python - Prepend Heading Context"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
|
||||
async def main() -> None:
    """Extract document.md with the markdown chunker and heading context enabled."""
    chunking = ChunkingConfig(
        chunker_type="markdown",
        max_characters=500,
        overlap=50,
        prepend_heading_context=True,
    )
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)
    result = await extract_file("document.md", config=config)

    chunks = result.chunks or []
    for chunk in chunks:
        # Each chunk's content is prefixed with its heading breadcrumb
        print(f"Content: {chunk.content[:100]}...")


asyncio.run(main())
|
||||
```
|
||||
39
docs/snippets/python/advanced/chunking_rag.md
Normal file
39
docs/snippets/python/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
|
||||
async def main() -> None:
    """Chunk research_paper.pdf with embeddings and print per-chunk details."""
    embedding = EmbeddingConfig(
        model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
        normalize=True,
    )
    chunking = ChunkingConfig(max_characters=500, overlap=50, embedding=embedding)
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)

    result = await extract_file("research_paper.pdf", config=config)

    chunks = result.chunks or []
    for chunk in chunks:
        # chunk_index is shown 1-based next to the total chunk count.
        print(
            f"Chunk {chunk.metadata.chunk_index + 1}/{chunk.metadata.total_chunks}"
        )
        print(
            f"Position: {chunk.metadata.byte_start}-{chunk.metadata.byte_end}"
        )
        print(f"Content: {chunk.content[:100]}...")
        # The embedding line is printed only when the chunk has one.
        if chunk.embedding is not None:
            print(f"Embedding: {len(chunk.embedding)} dimensions")


asyncio.run(main())
|
||||
```
|
||||
34
docs/snippets/python/advanced/combining_all_features.md
Normal file
34
docs/snippets/python/advanced/combining_all_features.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```python title="Python"
|
||||
import asyncio

from kreuzberg import (
    extract_file,
    ExtractionConfig,
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
    LanguageDetectionConfig,
    ReductionLevel,
    TokenReductionConfig,
)
|
||||
|
||||
async def main() -> None:
    """Extract document.pdf with several features enabled and print a summary.

    The config turns on quality processing, language detection, token
    reduction, and chunking with embeddings. The summary prints the quality
    score, the detected languages and, when chunks exist, the chunk count.
    """
    config: ExtractionConfig = ExtractionConfig(
        enable_quality_processing=True,
        language_detection=LanguageDetectionConfig(enabled=True),
        token_reduction=TokenReductionConfig(level=ReductionLevel.MODERATE),
        chunking=ChunkingConfig(
            max_characters=512,
            overlap=50,
            embedding=EmbeddingConfig(
                model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
                normalize=True,
            ),
        ),
    )
    result = await extract_file("document.pdf", config=config)

    # A falsy quality_score is replaced with 0 so the format spec still works.
    quality = result.quality_score or 0
    print(f"Quality: {quality:.2f}")
    print(f"Languages: {result.detected_languages}")
    if result.chunks:
        print(f"Chunks: {len(result.chunks)}")


asyncio.run(main())
|
||||
```
|
||||
36
docs/snippets/python/advanced/complete_example.md
Normal file
36
docs/snippets/python/advanced/complete_example.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
extract_file,
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
TesseractConfig,
|
||||
PdfConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
async def main() -> None:
    """Extract document.pdf with OCR, PDF and chunking options; print a preview.

    Only the first 100 characters of the extracted content are printed.
    """
    config: ExtractionConfig = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True,
        ocr=OcrConfig(
            backend="tesseract",
            language="eng+fra",
            tesseract_config=TesseractConfig(psm=3),
        ),
        pdf_options=PdfConfig(extract_images=True),
        chunking=ChunkingConfig(
            max_characters=1000,
            overlap=200,
            embedding=EmbeddingConfig(
                model=EmbeddingModelType(
                    {"type": "preset", "name": "all-MiniLM-L6-v2"}
                ),
            ),
        ),
    )
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")


asyncio.run(main())
|
||||
```
|
||||
21
docs/snippets/python/advanced/embedding_with_chunking.md
Normal file
21
docs/snippets/python/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
# Embedding settings are built first, then attached to the chunking config.
embedding = EmbeddingConfig(
    model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
    normalize=True,
    batch_size=32,
    show_download_progress=False,
)
chunking = ChunkingConfig(max_characters=1024, overlap=100, embedding=embedding)
config: ExtractionConfig = ExtractionConfig(chunking=chunking)
|
||||
```
|
||||
13
docs/snippets/python/advanced/keyword_extraction_config.md
Normal file
13
docs/snippets/python/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, KeywordConfig, KeywordAlgorithm
|
||||
|
||||
# Keyword extraction settings, built separately and then attached to the
# extraction config.
keywords = KeywordConfig(
    algorithm=KeywordAlgorithm.YAKE,
    max_keywords=10,
    min_score=0.3,
    ngram_range=[1, 3],
    language="en",
)
config: ExtractionConfig = ExtractionConfig(keywords=keywords)
|
||||
```
|
||||
27
docs/snippets/python/advanced/keyword_extraction_example.md
Normal file
27
docs/snippets/python/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
KeywordConfig,
|
||||
KeywordAlgorithm,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
|
||||
async def main() -> None:
    """Extract keywords from research_paper.pdf and print each with its score."""
    keywords = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.3,
    )
    config: ExtractionConfig = ExtractionConfig(keywords=keywords)

    result = await extract_file("research_paper.pdf", config=config)

    # Fall back to an empty list when no keywords are attached to the result.
    extracted = result.extracted_keywords or []
    for keyword in extracted:
        print(f"{keyword.text}: {keyword.score:.3f}")


asyncio.run(main())
|
||||
```
|
||||
11
docs/snippets/python/advanced/language_detection_config.md
Normal file
11
docs/snippets/python/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, LanguageDetectionConfig
|
||||
|
||||
# Language detection settings, built separately and then attached to the
# extraction config.
language_detection = LanguageDetectionConfig(
    enabled=True,
    min_confidence=0.8,
    detect_multiple=False,
)
config: ExtractionConfig = ExtractionConfig(language_detection=language_detection)
|
||||
```
|
||||
@@ -0,0 +1,21 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
|
||||
|
||||
|
||||
async def main() -> None:
    """Run language detection on multilingual_document.pdf and print the result."""
    language_detection = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.8,
        detect_multiple=True,
    )
    config: ExtractionConfig = ExtractionConfig(language_detection=language_detection)

    result = await extract_file("multilingual_document.pdf", config=config)

    print(f"Detected languages: {result.detected_languages}")


asyncio.run(main())
|
||||
```
|
||||
@@ -0,0 +1,7 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig
|
||||
|
||||
# Enable quality processing; no other option is passed.
config: ExtractionConfig = ExtractionConfig(enable_quality_processing=True)
|
||||
```
|
||||
21
docs/snippets/python/advanced/quality_processing_example.md
Normal file
21
docs/snippets/python/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, extract_file
|
||||
|
||||
|
||||
async def main() -> None:
    """Extract scanned_document.pdf and report its quality score, if present.

    A score below 0.5 is reported as a warning; any other score is printed
    as-is. Nothing is printed when the result has no quality score.
    """
    config: ExtractionConfig = ExtractionConfig(enable_quality_processing=True)

    result = await extract_file("scanned_document.pdf", config=config)

    # Nothing to report without a score.
    if result.quality_score is None:
        return

    if result.quality_score < 0.5:
        print(f"Warning: Low quality extraction ({result.quality_score:.2f})")
    else:
        print(f"Quality score: {result.quality_score:.2f}")


asyncio.run(main())
|
||||
```
|
||||
12
docs/snippets/python/advanced/token_reduction_config.md
Normal file
12
docs/snippets/python/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, TokenReductionConfig, ReductionLevel
|
||||
|
||||
# Token reduction settings, built separately and then attached to the
# extraction config.
token_reduction = TokenReductionConfig(
    level=ReductionLevel.MODERATE,
    preserve_markdown=True,
    preserve_code=True,
    language_hint="eng",
)
config: ExtractionConfig = ExtractionConfig(token_reduction=token_reduction)
|
||||
```
|
||||
25
docs/snippets/python/advanced/token_reduction_example.md
Normal file
25
docs/snippets/python/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
TokenReductionConfig,
|
||||
ReductionLevel,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
|
||||
async def main() -> None:
    """Extract verbose_document.pdf with token reduction and print the content length."""
    token_reduction = TokenReductionConfig(
        level=ReductionLevel.MODERATE,
        preserve_markdown=True,
    )
    config: ExtractionConfig = ExtractionConfig(token_reduction=token_reduction)

    result = await extract_file("verbose_document.pdf", config=config)

    print(f"Reduced content length: {len(result.content)} chars")


asyncio.run(main())
|
||||
```
|
||||
59
docs/snippets/python/advanced/vector_database_integration.md
Normal file
59
docs/snippets/python/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,59 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from dataclasses import dataclass, field
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class VectorRecord:
    """One chunk of a document, with its embedding and string metadata."""

    # Record identifier.
    id: str
    # Text of the chunk.
    content: str
    # Embedding vector for the chunk.
    embedding: list[float]
    # String-to-string metadata; each instance gets its own empty dict by default.
    metadata: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
async def extract_and_vectorize(
    document_path: str,
    document_id: str,
) -> list[VectorRecord]:
    """Extract a document and return one VectorRecord per embedded chunk.

    Args:
        document_path: Path passed to ``extract_file``.
        document_id: Identifier used in each record id and stored in the
            record metadata.

    Returns:
        Records for the chunks that have an embedding. Chunks whose
        embedding is ``None`` are skipped. The index in each record id is
        the chunk's position among all chunks, counting skipped ones.
    """
    embedding = EmbeddingConfig(
        model=EmbeddingModelType({"type": "preset", "name": "balanced"}),
        normalize=True,
        batch_size=32,
    )
    chunking = ChunkingConfig(max_characters=512, overlap=50, embedding=embedding)
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)

    result = await extract_file(document_path, config=config)

    # enumerate runs over every chunk so skipped chunks still consume an index.
    records: list[VectorRecord] = [
        VectorRecord(
            id=f"{document_id}_chunk_{index}",
            content=chunk.content,
            embedding=chunk.embedding,
            metadata={
                "document_id": document_id,
                "chunk_index": str(index),
                "content_length": str(len(chunk.content)),
            },
        )
        for index, chunk in enumerate(result.chunks or [])
        if chunk.embedding is not None
    ]
    return records


asyncio.run(extract_and_vectorize("document.pdf", "doc_001"))
|
||||
```
|
||||
Reference in New Issue
Block a user