1.3 KiB
1.3 KiB
from kreuzberg import (
ExtractionConfig,
OcrConfig,
ChunkingConfig,
ChunkerType,
ImageExtractionConfig,
OutputFormat,
extract_file_sync,
)
config = ExtractionConfig(
# OCR: extract text from images, fallback to Tesseract
ocr=OcrConfig(
enabled=True,
backend="tesseract",
language="eng",
),
# Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
chunking=ChunkingConfig(
max_characters=800,
overlap=100,
chunker_type=ChunkerType.Markdown,
prepend_heading_context=True,
),
# Output: Markdown format with document structure preserved
output_format=OutputFormat.Markdown,
include_document_structure=True,
# Images: extract embedded images
images=ImageExtractionConfig(
extract_images=True,
),
# Cache extracted results on disk
use_cache=True,
)
result = extract_file_sync("report.pdf", config=config)
print(f"Content ({len(result.content)} chars):")
print(result.content[:200])
if result.chunks:
print(f"\nChunks: {len(result.chunks)}")
print(f"Tables: {len(result.tables)}")
if result.detected_languages:
print(f"Languages: {result.detected_languages}")
if result.extraction_method:
print(f"Extraction method: {result.extraction_method}")