This commit is contained in:
18
docs/snippets/python/utils/chunking.md
Normal file
18
docs/snippets/python/utils/chunking.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
# Assemble the configuration inside-out: the embedding settings are handed to
# the chunking settings, which are handed to the extraction config.
embedding_settings = EmbeddingConfig(
    model=EmbeddingModelType.preset("all-minilm-l6-v2"),
)
chunking_settings = ChunkingConfig(
    max_chars=1500,
    max_overlap=200,
    embedding=embedding_settings,
)
config: ExtractionConfig = ExtractionConfig(chunking=chunking_settings)
|
||||
```
|
||||
36
docs/snippets/python/utils/chunking_rag.md
Normal file
36
docs/snippets/python/utils/chunking_rag.md
Normal file
@@ -0,0 +1,36 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
extract_file,
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
async def main() -> None:
    """Extract ``research_paper.pdf`` with chunking and embeddings configured.

    Collects a short content preview and the embedding length for every
    chunk that has a truthy ``embedding``, then prints how many were found.
    """
    embedding_settings = EmbeddingConfig(
        model=EmbeddingModelType.preset("balanced"),
        normalize=True,
        batch_size=16,
    )
    chunking_settings = ChunkingConfig(
        max_chars=500,
        max_overlap=50,
        embedding=embedding_settings,
    )
    config: ExtractionConfig = ExtractionConfig(chunking=chunking_settings)
    result = await extract_file("research_paper.pdf", config=config)

    # ``result.chunks`` may be falsy; fall back to an empty list in that case.
    chunks_with_embeddings: list = [
        {
            "content": chunk.content[:100],
            "embedding_dims": len(chunk.embedding),
        }
        for chunk in result.chunks or []
        if chunk.embedding
    ]

    print(f"Chunks with embeddings: {len(chunks_with_embeddings)}")


asyncio.run(main())
|
||||
```
|
||||
21
docs/snippets/python/utils/embedding_with_chunking.md
Normal file
21
docs/snippets/python/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
# Assemble the configuration inside-out: embedding settings first, then the
# chunking settings that carry them, then the extraction config.
embedding_settings = EmbeddingConfig(
    model=EmbeddingModelType.preset("balanced"),
    normalize=True,
    batch_size=32,
    show_download_progress=False,
)
chunking_settings = ChunkingConfig(
    max_chars=1024,
    max_overlap=100,
    embedding=embedding_settings,
)
config: ExtractionConfig = ExtractionConfig(chunking=chunking_settings)
|
||||
```
|
||||
33
docs/snippets/python/utils/error_handling.md
Normal file
33
docs/snippets/python/utils/error_handling.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, extract_bytes_sync, ExtractionConfig
|
||||
from kreuzberg import (
|
||||
KreuzbergError,
|
||||
ParsingError,
|
||||
OCRError,
|
||||
ValidationError,
|
||||
)
|
||||
|
||||
# Extraction from a file path. The specific handlers come first and the
# general KreuzbergError handler last; the success message lives in ``else``
# so the ``try`` body contains only the call that is being guarded.
try:
    result = extract_file_sync("document.pdf")
except FileNotFoundError as exc:
    print(f"File not found: {exc}")
except ParsingError as exc:
    print(f"Failed to parse document: {exc}")
except OCRError as exc:
    print(f"OCR processing failed: {exc}")
except KreuzbergError as exc:
    print(f"Extraction error: {exc}")
else:
    print(f"Extracted {len(result.content)} characters")

# Extraction from in-memory bytes with an explicit MIME type and config.
pdf_bytes: bytes = b"%PDF-1.4\n"
try:
    config: ExtractionConfig = ExtractionConfig()
    result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
except ValidationError as exc:
    print(f"Invalid configuration: {exc}")
except OCRError as exc:
    print(f"OCR failed: {exc}")
except KreuzbergError as exc:
    print(f"Extraction failed: {exc}")
else:
    print(f"Extracted: {result.content[:100]}")
|
||||
```
|
||||
19
docs/snippets/python/utils/error_handling_extract.md
Normal file
19
docs/snippets/python/utils/error_handling_extract.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```python title="Python"
|
||||
import httpx
|
||||
|
||||
# POST a local PDF to the extraction endpoint and report either the number of
# returned documents or the error details sent back by the server.
try:
    with httpx.Client() as client:
        with open("document.pdf", "rb") as f:
            files: dict = {"files": f}
            response: httpx.Response = client.post(
                "http://localhost:8000/extract", files=files
            )
            # Raises httpx.HTTPStatusError for 4xx/5xx responses.
            response.raise_for_status()
            results: list = response.json()
            print(f"Extracted {len(results)} documents")
except httpx.HTTPStatusError as e:
    # The error body is not guaranteed to be a JSON object (e.g. an HTML or
    # plain-text page from a proxy). Fall back to the defaults instead of
    # raising a second exception from inside the error handler.
    try:
        error = e.response.json()
    except ValueError:
        error = {}
    if not isinstance(error, dict):
        error = {}
    error_type: str = error.get("error_type", "Unknown")
    message: str = error.get("message", "No message")
    print(f"Error: {error_type}: {message}")
|
||||
```
|
||||
19
docs/snippets/python/utils/image_extraction.md
Normal file
19
docs/snippets/python/utils/image_extraction.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ImageExtractionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with image extraction configured.

    Prints the first 100 characters of the extracted content.
    """
    image_settings = ImageExtractionConfig(
        extract_images=True,
        target_dpi=200,
        max_image_dimension=2048,
        # set to False to extract images without markdown references
        inject_placeholders=True,
        auto_adjust_dpi=True,
    )
    config: ExtractionConfig = ExtractionConfig(images=image_settings)

    result = await extract_file("document.pdf", config=config)
    print(f"Extracted: {result.content[:100]}")


asyncio.run(main())
|
||||
```
|
||||
29
docs/snippets/python/utils/image_preprocessing.md
Normal file
29
docs/snippets/python/utils/image_preprocessing.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
TesseractConfig,
|
||||
ImagePreprocessingConfig,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
async def main() -> None:
    """Extract ``scanned.pdf`` with image preprocessing passed to Tesseract.

    Prints the first 100 characters of the extracted content.
    """
    # Build the nested OCR settings from the innermost object outwards.
    preprocessing = ImagePreprocessingConfig(
        target_dpi=300,
        denoise=True,
        deskew=True,
        contrast_enhance=True,
        binarization_method="otsu",
    )
    tesseract_settings = TesseractConfig(preprocessing=preprocessing)
    ocr_settings = OcrConfig(tesseract_config=tesseract_settings)
    config: ExtractionConfig = ExtractionConfig(ocr=ocr_settings)

    result = await extract_file("scanned.pdf", config=config)
    print(f"Content: {result.content[:100]}")


asyncio.run(main())
|
||||
```
|
||||
22
docs/snippets/python/utils/keyword_extraction_example.md
Normal file
22
docs/snippets/python/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm
|
||||
|
||||
async def main() -> None:
    """Extract ``research_paper.pdf`` with keyword extraction configured.

    Prints one ``text: score`` line per entry in ``result.extracted_keywords``.
    """
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.3,
    )
    config: ExtractionConfig = ExtractionConfig(keywords=keyword_settings)
    result = await extract_file("research_paper.pdf", config=config)

    # Falsy keyword lists, scores and texts are replaced by neutral defaults.
    for kw in result.extracted_keywords or []:
        text: str = kw.text or ""
        score: float = kw.score or 0.0
        print(f"{text}: {score:.3f}")


asyncio.run(main())
|
||||
```
|
||||
15
docs/snippets/python/utils/language_detection.md
Normal file
15
docs/snippets/python/utils/language_detection.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with language detection configured.

    Prints ``result.detected_languages`` as returned by the extraction.
    """
    detection_settings = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.9,
        detect_multiple=True,
    )
    config: ExtractionConfig = ExtractionConfig(
        language_detection=detection_settings,
    )

    result = await extract_file("document.pdf", config=config)
    print(f"Languages: {result.detected_languages}")


asyncio.run(main())
|
||||
```
|
||||
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
|
||||
|
||||
async def main() -> None:
    """Extract ``multilingual_document.pdf`` with language detection configured.

    Prints how many languages were reported and the list itself.
    """
    detection_settings = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.7,
        detect_multiple=True,
    )
    config: ExtractionConfig = ExtractionConfig(
        language_detection=detection_settings,
    )
    result = await extract_file("multilingual_document.pdf", config=config)

    # A falsy ``detected_languages`` is treated as "no languages".
    languages: list[str] = result.detected_languages or []
    print(f"Detected {len(languages)} languages: {languages}")


asyncio.run(main())
|
||||
```
|
||||
14
docs/snippets/python/utils/quality_processing_example.md
Normal file
14
docs/snippets/python/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig

# This snippet runs synchronously at module level, so it uses the blocking
# ``extract_file_sync``. ``extract_file`` has to be awaited; calling it here
# without ``await`` would leave ``result`` as a coroutine object.
config = ExtractionConfig(enable_quality_processing=True)
result = extract_file_sync("scanned_document.pdf", config=config)

# A falsy ``quality_score`` is treated as 0.0.
quality_score = result.quality_score or 0.0

if quality_score < 0.5:
    print(f"Warning: Low quality extraction ({quality_score:.2f})")
    print("Consider re-scanning with higher DPI or adjusting OCR settings")
else:
    print(f"Quality score: {quality_score:.2f}")
|
||||
```
|
||||
19
docs/snippets/python/utils/standalone_embed.md
Normal file
19
docs/snippets/python/utils/standalone_embed.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```python title="Python"
|
||||
import asyncio

from kreuzberg import embed_sync, embed, EmbeddingConfig, EmbeddingModelType

# Synchronous
embeddings = embed_sync(
    ["Hello, world!", "Kreuzberg is fast"],
    config=EmbeddingConfig(model=EmbeddingModelType.preset("balanced"), normalize=True),
)
# One embedding per input text; this example expects 768 values per embedding.
assert len(embeddings) == 2
assert len(embeddings[0]) == 768


# Asynchronous
async def main() -> None:
    """Embed the same two texts with the awaitable ``embed`` function."""
    embeddings = await embed(
        ["Hello, world!", "Kreuzberg is fast"],
        config=EmbeddingConfig(model=EmbeddingModelType.preset("balanced"), normalize=True),
    )
    assert len(embeddings) == 2


# Actually run the coroutine; without this the async example never executes.
asyncio.run(main())
|
||||
```
|
||||
12
docs/snippets/python/utils/tables.md
Normal file
12
docs/snippets/python/utils/tables.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, ExtractedTable
|
||||
|
||||
# Extract the document with a default configuration, then print every table:
# its row count, its markdown rendering, and each row of cells.
config = ExtractionConfig()
result = extract_file_sync("document.pdf", config=config)

for table in result.tables:
    rows = table.cells
    row_count: int = len(rows)
    print(f"Table with {row_count} rows")
    print(table.markdown)
    for row in rows:
        print(row)
|
||||
```
|
||||
15
docs/snippets/python/utils/token_reduction.md
Normal file
15
docs/snippets/python/utils/token_reduction.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, TokenReductionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with token reduction configured.

    Prints the length of the extracted content.
    """
    reduction_settings = TokenReductionConfig(
        mode="moderate",
        preserve_important_words=True,
    )
    config: ExtractionConfig = ExtractionConfig(
        token_reduction=reduction_settings,
    )

    result = await extract_file("document.pdf", config=config)
    print(f"Content length: {len(result.content)}")


asyncio.run(main())
|
||||
```
|
||||
19
docs/snippets/python/utils/token_reduction_example.md
Normal file
19
docs/snippets/python/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig, TokenReductionConfig
|
||||
|
||||
async def main() -> None:
    """Extract ``verbose_document.pdf`` with token reduction configured.

    Reads three entries from ``result.metadata`` (with defaults when they
    are absent) and prints the token counts and the reduction percentage.
    """
    reduction_settings = TokenReductionConfig(
        mode="moderate",
        preserve_important_words=True,
    )
    config: ExtractionConfig = ExtractionConfig(
        token_reduction=reduction_settings,
    )
    result = await extract_file("verbose_document.pdf", config=config)

    metadata = result.metadata
    original: int = metadata.get("original_token_count", 0)
    reduced: int = metadata.get("token_count", 0)
    ratio: float = metadata.get("token_reduction_ratio", 0.0)

    print(f"Reduced from {original} to {reduced} tokens")
    print(f"Reduction: {ratio * 100:.1f}%")


asyncio.run(main())
|
||||
```
|
||||
28
docs/snippets/python/utils/vector_database_integration.md
Normal file
28
docs/snippets/python/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
extract_file,
|
||||
ExtractionConfig,
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
EmbeddingModelType,
|
||||
)
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with chunking and embeddings configured.

    Prints an index-based identifier and a 50-character preview per chunk.
    """
    embedding_settings = EmbeddingConfig(
        model=EmbeddingModelType.preset("balanced"),
        normalize=True,
    )
    chunking_settings = ChunkingConfig(
        max_chars=512,
        max_overlap=50,
        embedding=embedding_settings,
    )
    config: ExtractionConfig = ExtractionConfig(chunking=chunking_settings)
    result = await extract_file("document.pdf", config=config)

    # A falsy ``result.chunks`` is treated as an empty list.
    for i, chunk in enumerate(result.chunks or []):
        chunk_id: str = f"doc_chunk_{i}"
        print(f"Chunk {chunk_id}: {chunk.content[:50]}")


asyncio.run(main())
|
||||
```
|
||||
Reference in New Issue
Block a user