This commit is contained in:
60
docs/snippets/python/config/advanced_config.md
Normal file
60
docs/snippets/python/config/advanced_config.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```python title="Python"
|
||||
import asyncio

from kreuzberg import (
    extract_file,
    ExtractionConfig,
    OcrConfig,
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
    KeywordAlgorithm,
    LanguageDetectionConfig,
    TokenReductionConfig,
    PostProcessorConfig,
    KeywordConfig,
)


async def main() -> None:
    """Extract ``document.pdf`` with most optional features configured at once.

    Builds a single ``ExtractionConfig`` combining OCR, chunking with
    embeddings, language detection, keyword extraction, token reduction and
    post-processing, then prints a short summary of the result.
    """
    config: ExtractionConfig = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True,
        ocr=OcrConfig(
            backend="tesseract",
            language="eng",
        ),
        # Same keyword names as the dedicated chunking snippets.
        chunking=ChunkingConfig(
            max_chars=1000,
            max_overlap=200,
            embedding=EmbeddingConfig(
                model=EmbeddingModelType.preset("balanced"),
                batch_size=32,
                normalize=True,
            ),
        ),
        language_detection=LanguageDetectionConfig(
            enabled=True,
            min_confidence=0.8,
            detect_multiple=False,
        ),
        # Enum member and tuple range, matching the keyword snippets.
        keywords=KeywordConfig(
            algorithm=KeywordAlgorithm.YAKE,
            max_keywords=10,
            min_score=0.1,
            ngram_range=(1, 3),
            language="en",
        ),
        token_reduction=TokenReductionConfig(
            mode="moderate",
            preserve_important_words=True,
        ),
        postprocessor=PostProcessorConfig(enabled=True),
    )

    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")
    # Only report optional outputs when they are present and non-empty.
    if result.detected_languages:
        print(f"Languages: {result.detected_languages}")
    if result.chunks:
        print(f"Chunks: {len(result.chunks)}")


asyncio.run(main())
|
||||
```
|
||||
80
docs/snippets/python/config/chunking_config.md
Normal file
80
docs/snippets/python/config/chunking_config.md
Normal file
@@ -0,0 +1,80 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Chunk ``document.pdf`` and print the chunk count and each chunk's length."""
    chunking = ChunkingConfig(max_chars=1000, max_overlap=200)
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)
    result = await extract_file("document.pdf", config=config)

    # A falsy ``result.chunks`` is treated as "no chunks".
    print(f"Chunks: {len(result.chunks or [])}")
    for chunk in result.chunks or []:
        print(f"Length: {len(chunk.content)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
```python title="Python - Markdown with Heading Context"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Chunk ``document.md`` with the markdown chunker and print heading context.

    For every chunk, prints each heading found under the chunk metadata's
    ``heading_context`` entry, followed by the first 100 characters of content.
    """
    chunking = ChunkingConfig(
        chunker_type="markdown",
        max_chars=500,
        max_overlap=50,
        sizing_type="tokenizer",
        sizing_model="Xenova/gpt-4o",
    )
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)
    result = await extract_file("document.md", config=config)

    for chunk in result.chunks or []:
        # A missing or empty heading context simply yields no heading lines.
        heading_context = chunk.metadata.get("heading_context") or {}
        for h in heading_context.get("headings", []):
            print(f"Heading L{h['level']}: {h['text']}")
        print(f"Content: {chunk.content[:100]}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
```python title="Python - Semantic"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Chunk ``document.pdf`` with the semantic chunker and preview each chunk."""
    semantic_chunking = ChunkingConfig(chunker_type="semantic")
    config: ExtractionConfig = ExtractionConfig(chunking=semantic_chunking)
    result = await extract_file("document.pdf", config=config)

    # A falsy ``result.chunks`` is treated as "no chunks".
    for chunk in result.chunks or []:
        print(f"Content: {chunk.content[:100]}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
```python title="Python - Prepend Heading Context"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Chunk ``document.md`` with ``prepend_heading_context`` enabled and preview chunks."""
    chunking = ChunkingConfig(
        chunker_type="markdown",
        max_chars=500,
        max_overlap=50,
        prepend_heading_context=True,
    )
    config: ExtractionConfig = ExtractionConfig(chunking=chunking)
    result = await extract_file("document.md", config=config)

    # Each chunk's content is prefixed with its heading breadcrumb
    for chunk in result.chunks or []:
        print(f"Content: {chunk.content[:100]}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
14
docs/snippets/python/config/config_basic.md
Normal file
14
docs/snippets/python/config/config_basic.md
Normal file
@@ -0,0 +1,14 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with caching and quality processing on, then print it."""
    options = {"use_cache": True, "enable_quality_processing": True}
    config = ExtractionConfig(**options)
    result = await extract_file("document.pdf", config=config)
    print(result.content)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
16
docs/snippets/python/config/config_discover.md
Normal file
16
docs/snippets/python/config/config_discover.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with a default ``ExtractionConfig`` and summarise it.

    Prints the first 100 characters of the content and its total length.
    """
    # NOTE(review): the snippet file is named "config_discover", but the config
    # here is built with plain defaults — confirm whether discovery is implicit.
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)

    content: str = result.content
    content_preview: str = content[:100]
    print(f"Content preview: {content_preview}")
    print(f"Total length: {len(content)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
16
docs/snippets/python/config/config_ocr.md
Normal file
16
docs/snippets/python/config/config_ocr.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` using the Tesseract OCR backend and print the text.

    OCR is configured for the ``eng+fra`` language string with ``psm=3``.
    """
    tesseract = TesseractConfig(psm=3)
    ocr = OcrConfig(
        backend="tesseract",
        language="eng+fra",
        tesseract_config=tesseract,
    )
    config: ExtractionConfig = ExtractionConfig(ocr=ocr)
    result = await extract_file("document.pdf", config=config)
    print(result.content)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
18
docs/snippets/python/config/config_programmatic.md
Normal file
18
docs/snippets/python/config/config_programmatic.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
extract_file_sync,
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
ChunkingConfig,
|
||||
)
|
||||
|
||||
# Build the sub-configurations first, then assemble the extraction config.
ocr_config = OcrConfig(backend="tesseract", language="eng")
chunking_config = ChunkingConfig(max_chars=1000, max_overlap=200)
config: ExtractionConfig = ExtractionConfig(
    use_cache=True,
    ocr=ocr_config,
    chunking=chunking_config,
)

# Synchronous extraction; report only the size of the extracted text.
result = extract_file_sync("document.pdf", config=config)
content_length: int = len(result.content)
print(f"Content length: {content_length}")
|
||||
```
|
||||
15
docs/snippets/python/config/document_structure_config.md
Normal file
15
docs/snippets/python/config/document_structure_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```python title="Document Structure Config (Python)"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig
|
||||
|
||||
# Ask the extractor to include the document structure in the result
config = ExtractionConfig(include_document_structure=True)
result = extract_file_sync("document.pdf", config=config)

# Walk the document tree, printing each node's type and a short text preview
if result.document:
    for node in result.document["nodes"]:
        node_content = node["content"]
        node_type = node_content["node_type"]
        # Nodes without a "text" entry are printed with an empty preview.
        text = node_content.get("text", "")
        print(f"[{node_type}] {text[:80]}")
|
||||
```
|
||||
29
docs/snippets/python/config/element_based_output.md
Normal file
29
docs/snippets/python/config/element_based_output.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```python title="Element-Based Output (Python)"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig
|
||||
|
||||
# Request element-based output and extract the document
config = ExtractionConfig(result_format="element_based")
result = extract_file_sync("document.pdf", config=config)

# Print every element, with page number and coordinates when available
for element in result.elements:
    print(f"Type: {element.element_type}")
    print(f"Text: {element.text[:100]}")

    if element.metadata.page_number:
        print(f"Page: {element.metadata.page_number}")

    coords = element.metadata.coordinates
    if coords:
        print(f"Coords: ({coords.left}, {coords.top}) - ({coords.right}, {coords.bottom})")

    print("---")

# Keep only the elements whose type is "title" and print them with their level
titles = [e for e in result.elements if e.element_type == "title"]
for title in titles:
    # Titles without a "level" entry are reported as "unknown".
    level = title.metadata.additional.get("level", "unknown")
    print(f"[{level}] {title.text}")
|
||||
```
|
||||
15
docs/snippets/python/config/embedding_config.md
Normal file
15
docs/snippets/python/config/embedding_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, ChunkingConfig, EmbeddingConfig, EmbeddingModelType
|
||||
|
||||
# Chunking with embeddings generated for each chunk.
# ``EmbeddingModelType.preset`` takes a preset name ("fast", "balanced",
# "quality" or "multilingual"); a specific HuggingFace model id such as
# "sentence-transformers/all-mpnet-base-v2" goes through
# ``EmbeddingModelType.custom(model_id=..., dimensions=...)`` instead.
config = ExtractionConfig(
    chunking=ChunkingConfig(
        max_chars=1000,
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.preset("balanced"),
            batch_size=16,
            normalize=True,
            show_download_progress=True,
        ),
    )
)
|
||||
```
|
||||
83
docs/snippets/python/config/embedding_config.py
Normal file
83
docs/snippets/python/config/embedding_config.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from kreuzberg import EmbeddingConfig, EmbeddingModelType, ChunkingConfig, ExtractionConfig
|
||||
|
||||
# Example 1: Preset model (recommended)
# Select one of the named presets listed below instead of a specific model id.
embedding_config = EmbeddingConfig(
    model=EmbeddingModelType.preset("balanced"),
    cache_dir="~/.cache/kreuzberg/embeddings",
    batch_size=32,
    normalize=True,
    show_download_progress=True,
)

# Available presets:
# - "fast" (384 dims): Quick prototyping, development, resource-constrained
# - "balanced" (768 dims): Production, general-purpose RAG, English documents
# - "quality" (1024 dims): Complex documents, maximum accuracy
# - "multilingual" (768 dims): International documents, 100+ languages
|
||||
|
||||
|
||||
# Example 2: Custom ONNX model (requires embeddings feature)
# Name a specific ONNX embedding model from HuggingFace and state its dimensions.
bge_small_model = EmbeddingModelType.custom(
    model_id="BAAI/bge-small-en-v1.5",
    dimensions=384,
)
embedding_config = EmbeddingConfig(
    model=bge_small_model,
    batch_size=32,
    normalize=True,
    show_download_progress=True,
    cache_dir=None,  # Uses default: .kreuzberg/embeddings/
)

# Popular ONNX-compatible models:
# - "BAAI/bge-small-en-v1.5" (384 dims): Fast, efficient
# - "BAAI/bge-base-en-v1.5" (768 dims): Balanced quality/speed
# - "BAAI/bge-large-en-v1.5" (1024 dims): High quality, slower
# - "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" (768 dims): Multilingual support
|
||||
|
||||
|
||||
# Example 3: Alternative Custom Model
# Same custom-model mechanism, pointed at a different ONNX embedding model.
mpnet_model = EmbeddingModelType.custom(
    model_id="sentence-transformers/all-mpnet-base-v2",
    dimensions=768,
)
embedding_config = EmbeddingConfig(
    model=mpnet_model,
    batch_size=16,  # Larger model requires smaller batch size
    normalize=True,
    show_download_progress=True,
    cache_dir="/var/cache/embeddings",
)
|
||||
|
||||
|
||||
# Integration with ChunkingConfig
# Pass an EmbeddingConfig through the chunking configuration's ``embedding`` field.
chunking_with_embeddings = ChunkingConfig(
    # Default embedding settings (described upstream as the balanced preset — TODO confirm)
    embedding=EmbeddingConfig(),
    preset="balanced",
    max_chars=1024,
    max_overlap=100,
)

# The chunking configuration is then attached to the extraction config.
extraction_config = ExtractionConfig(chunking=chunking_with_embeddings)
|
||||
|
||||
|
||||
# Key parameter explanations:
|
||||
#
|
||||
# batch_size: Number of texts to embed at once (32-128 typical)
|
||||
# - Larger batches are faster but use more memory
|
||||
# - Smaller batches for resource-constrained environments
|
||||
#
|
||||
# normalize: Whether to normalize vectors (L2 norm)
|
||||
# - True (recommended): Enables cosine similarity in vector DBs
|
||||
# - False: Raw embedding values
|
||||
#
|
||||
# cache_dir: Where to store downloaded models
|
||||
# - None: Uses .kreuzberg/embeddings/ in current directory
|
||||
# - String path: Custom directory for model storage
|
||||
#
|
||||
# show_download_progress: Display download progress bar
|
||||
# - Useful for monitoring large model downloads
|
||||
72
docs/snippets/python/config/hierarchy_config.py
Normal file
72
docs/snippets/python/config/hierarchy_config.py
Normal file
@@ -0,0 +1,72 @@
|
||||
from kreuzberg import PdfConfig, HierarchyConfig, ExtractionConfig, extract_file_sync
|
||||
|
||||
# Example 1: Basic hierarchy extraction
# - k_clusters=6 is the default: 6 font size clusters (H1-H6 structure)
# - include_bbox=True adds bounding box coordinates for spatial layout awareness
# - ocr_coverage_threshold=None means no OCR coverage threshold
hierarchy_config_basic = HierarchyConfig(
    enabled=True,
    k_clusters=6,
    include_bbox=True,
    ocr_coverage_threshold=None,
)

# Hierarchy settings are nested: HierarchyConfig -> PdfConfig -> ExtractionConfig.
pdf_config_basic = PdfConfig(hierarchy=hierarchy_config_basic)
extraction_config_basic = ExtractionConfig(pdf_options=pdf_config_basic)

result = extract_file_sync("document.pdf", config=extraction_config_basic)
|
||||
|
||||
|
||||
# Example 2: Custom k_clusters for minimal structure
# Three clusters give a simpler hierarchy with just 3 levels, for when only
# major section divisions (Main, Subsection, Detail) are needed.
hierarchy_config_minimal = HierarchyConfig(
    enabled=True,
    k_clusters=3,
    include_bbox=True,
    ocr_coverage_threshold=None,
)

# Hierarchy settings are nested: HierarchyConfig -> PdfConfig -> ExtractionConfig.
pdf_config_minimal = PdfConfig(hierarchy=hierarchy_config_minimal)
extraction_config_minimal = ExtractionConfig(pdf_options=pdf_config_minimal)

result = extract_file_sync("document.pdf", config=extraction_config_minimal)
|
||||
|
||||
|
||||
# Example 3: With OCR coverage threshold
# A threshold of 0.5 triggers OCR if text coverage is below 50%; useful for
# documents with mixed digital and scanned content.
hierarchy_config_ocr = HierarchyConfig(
    enabled=True,
    k_clusters=6,
    include_bbox=True,
    ocr_coverage_threshold=0.5,
)

# Hierarchy settings are nested: HierarchyConfig -> PdfConfig -> ExtractionConfig.
pdf_config_ocr = PdfConfig(hierarchy=hierarchy_config_ocr)
extraction_config_ocr = ExtractionConfig(pdf_options=pdf_config_ocr)

result = extract_file_sync("document.pdf", config=extraction_config_ocr)
|
||||
|
||||
|
||||
# Field descriptions:
|
||||
#
|
||||
# enabled: bool (default: True)
|
||||
# - Enable or disable hierarchy extraction
|
||||
# - When False, hierarchy structure is not analyzed
|
||||
#
|
||||
# k_clusters: int (default: 6, valid: 1-7)
|
||||
# - Number of font size clusters for hierarchy levels
|
||||
# - 6 provides H1-H6 heading levels with body text
|
||||
# - Higher values create more fine-grained hierarchy
|
||||
# - Lower values create simpler structure
|
||||
#
|
||||
# include_bbox: bool (default: True)
|
||||
# - Include bounding box coordinates in hierarchy blocks
|
||||
# - Required for spatial layout awareness and document structure
|
||||
# - Set to False only if space optimization is critical
|
||||
#
|
||||
# ocr_coverage_threshold: float | None (default: None)
|
||||
# - Range: 0.0 to 1.0
|
||||
# - Triggers OCR when text block coverage falls below this fraction
|
||||
# - Example: 0.5 means "run OCR if less than 50% of page has text data"
|
||||
# - None means no OCR coverage-based triggering
|
||||
17
docs/snippets/python/config/html_output.md
Normal file
17
docs/snippets/python/config/html_output.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` as HTML using the ``github`` theme and print it."""
    html_options = {
        "theme": "github",
        "embed_css": True,
    }
    config = ExtractionConfig(output_format="html", html_output=html_options)
    result = await extract_file("document.pdf", config=config)
    # HTML with kb-* classes and GitHub theme
    print(result.content)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
75
docs/snippets/python/config/keyword_config.py
Normal file
75
docs/snippets/python/config/keyword_config.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
KeywordConfig,
|
||||
KeywordAlgorithm,
|
||||
YakeParams,
|
||||
RakeParams,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
|
||||
# Example 1: Basic YAKE configuration
|
||||
# Uses YAKE algorithm with default parameters and English stopword filtering
|
||||
async def basic_yake() -> None:
    """Extract keywords from ``document.pdf`` with YAKE and no algorithm-specific parameters."""
    keyword_config = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.0,
        ngram_range=(1, 3),
        language="en",
        # Both parameter blocks are explicitly left unset.
        yake_params=None,
        rake_params=None,
    )
    config = ExtractionConfig(keywords=keyword_config)

    result = await extract_file("document.pdf", config=config)
    print(f"Keywords: {result.keywords}")
|
||||
|
||||
|
||||
# Example 2: Advanced YAKE with custom parameters
|
||||
# Fine-tunes YAKE with custom window size for co-occurrence analysis
|
||||
async def advanced_yake() -> None:
    """Extract keywords from ``document.pdf`` with YAKE and a custom ``window_size``."""
    yake = YakeParams(window_size=1)
    keyword_config = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=15,
        min_score=0.1,
        ngram_range=(1, 2),
        language="en",
        yake_params=yake,
        rake_params=None,
    )
    config = ExtractionConfig(keywords=keyword_config)

    result = await extract_file("document.pdf", config=config)
    print(f"Keywords: {result.keywords}")
|
||||
|
||||
|
||||
# Example 3: RAKE configuration
|
||||
# Uses RAKE algorithm for rapid keyword extraction with phrase constraints
|
||||
async def rake_config() -> None:
    """Extract keywords from ``document.pdf`` with RAKE and explicit phrase constraints."""
    rake = RakeParams(min_word_length=1, max_words_per_phrase=3)
    keyword_config = KeywordConfig(
        algorithm=KeywordAlgorithm.RAKE,
        max_keywords=10,
        min_score=5.0,
        ngram_range=(1, 3),
        language="en",
        yake_params=None,
        rake_params=rake,
    )
    config = ExtractionConfig(keywords=keyword_config)

    result = await extract_file("document.pdf", config=config)
    print(f"Keywords: {result.keywords}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(basic_yake())
|
||||
24
docs/snippets/python/config/keyword_extraction_config.md
Normal file
24
docs/snippets/python/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import (
|
||||
ExtractionConfig,
|
||||
KeywordConfig,
|
||||
KeywordAlgorithm,
|
||||
extract_file,
|
||||
)
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with YAKE keyword extraction configured.

    Only the length of the extracted content is printed.
    """
    keyword_config = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.3,
        ngram_range=(1, 3),
        language="en",
    )
    config: ExtractionConfig = ExtractionConfig(keywords=keyword_config)
    result = await extract_file("document.pdf", config=config)
    print(f"Content extracted: {len(result.content)} chars")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
19
docs/snippets/python/config/language_detection_config.md
Normal file
19
docs/snippets/python/config/language_detection_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with language detection and report the first language."""
    detection = LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.85,
        detect_multiple=False,
    )
    config: ExtractionConfig = ExtractionConfig(language_detection=detection)
    result = await extract_file("document.pdf", config=config)

    # Skip the language line when nothing was detected.
    if result.detected_languages:
        print(f"Primary language: {result.detected_languages[0]}")
    print(f"Content length: {len(result.content)} chars")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
26
docs/snippets/python/config/ocr_dpi_config.md
Normal file
26
docs/snippets/python/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import (
|
||||
extract_file_sync,
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
TesseractConfig,
|
||||
ImagePreprocessingConfig,
|
||||
)
|
||||
|
||||
# Set the OCR preprocessing target DPI via TesseractConfig's ``preprocessing`` field.
preprocessing = ImagePreprocessingConfig(target_dpi=300)
tesseract = TesseractConfig(preprocessing=preprocessing)
config: ExtractionConfig = ExtractionConfig(
    ocr=OcrConfig(backend="tesseract", tesseract_config=tesseract),
)

result = extract_file_sync("scanned.pdf", config=config)

# Summarise the result: text length and number of tables.
content_length: int = len(result.content)
table_count: int = len(result.tables)
print(f"Content length: {content_length} characters")
print(f"Tables detected: {table_count}")
|
||||
```
|
||||
18
docs/snippets/python/config/pdf_config.md
Normal file
18
docs/snippets/python/config/pdf_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, PdfConfig, HierarchyConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with PDF-specific options and preview the content.

    Configures image and metadata extraction, two candidate passwords and
    hierarchy extraction with six clusters.
    """
    hierarchy = HierarchyConfig(enabled=True, k_clusters=6)
    pdf_options = PdfConfig(
        extract_images=True,
        extract_metadata=True,
        passwords=["password1", "password2"],
        hierarchy=hierarchy,
    )
    config: ExtractionConfig = ExtractionConfig(pdf_options=pdf_options)
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
22
docs/snippets/python/config/pdf_hierarchy_config.md
Normal file
22
docs/snippets/python/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig
|
||||
|
||||
# Configure PDF hierarchy extraction with bounding boxes and an OCR coverage threshold
hierarchy = HierarchyConfig(
    enabled=True,
    k_clusters=6,
    include_bbox=True,
    ocr_coverage_threshold=0.8,
)
config: ExtractionConfig = ExtractionConfig(
    pdf_options=PdfConfig(extract_metadata=True, hierarchy=hierarchy),
)

result = extract_file_sync("document.pdf", config=config)

# Print a short preview of every page; a falsy ``result.pages`` means no pages.
for page in result.pages or []:
    print(f"Page {page.page_number}:")
    print(f" Content: {page.content[:100]}...")
|
||||
```
|
||||
16
docs/snippets/python/config/postprocessor_config.md
Normal file
16
docs/snippets/python/config/postprocessor_config.md
Normal file
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, PostProcessorConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with only the ``deduplication`` post-processor listed."""
    postprocessing = PostProcessorConfig(
        enabled=True,
        enabled_processors=["deduplication"],
    )
    config: ExtractionConfig = ExtractionConfig(postprocessor=postprocessing)
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
15
docs/snippets/python/config/quality_processing_config.md
Normal file
15
docs/snippets/python/config/quality_processing_config.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with quality processing enabled and print the score."""
    config: ExtractionConfig = ExtractionConfig(enable_quality_processing=True)
    result = await extract_file("document.pdf", config=config)

    # A missing (or zero) score is reported as 0.00.
    quality_score: float = result.quality_score or 0.0
    print(f"Quality score: {quality_score:.2f}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
21
docs/snippets/python/config/tesseract_config.md
Normal file
21
docs/snippets/python/config/tesseract_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
|
||||
|
||||
async def main() -> None:
    """Extract ``document.pdf`` with tuned Tesseract settings and preview the content.

    Uses the ``eng+fra+deu`` language string with ``psm=6``, ``oem=1``, a
    ``min_confidence`` of 0.8 and table detection enabled.
    """
    tesseract = TesseractConfig(
        psm=6,
        oem=1,
        min_confidence=0.8,
        enable_table_detection=True,
    )
    ocr = OcrConfig(language="eng+fra+deu", tesseract_config=tesseract)
    config: ExtractionConfig = ExtractionConfig(ocr=ocr)
    result = await extract_file("document.pdf", config=config)
    print(f"Content: {result.content[:100]}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
10
docs/snippets/python/config/token_reduction_config.md
Normal file
10
docs/snippets/python/config/token_reduction_config.md
Normal file
@@ -0,0 +1,10 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, TokenReductionConfig
|
||||
|
||||
# Token reduction in "moderate" mode with ``preserve_important_words`` enabled.
token_reduction = TokenReductionConfig(
    mode="moderate",
    preserve_important_words=True,
)
config: ExtractionConfig = ExtractionConfig(token_reduction=token_reduction)
|
||||
```
|
||||
Reference in New Issue
Block a user