Files
fil/docs/snippets/python/api/combining_all_features.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

1.3 KiB

from kreuzberg import (
    ExtractionConfig,
    OcrConfig,
    ChunkingConfig,
    ChunkerType,
    ImageExtractionConfig,
    OutputFormat,
    extract_file_sync,
)

config = ExtractionConfig(
    # OCR: extract text from images, fallback to Tesseract
    ocr=OcrConfig(
        enabled=True,
        backend="tesseract",
        language="eng",
    ),
    # Chunking: semantic markdown chunks of ~800 chars, 100-char overlap
    chunking=ChunkingConfig(
        max_characters=800,
        overlap=100,
        chunker_type=ChunkerType.Markdown,
        prepend_heading_context=True,
    ),
    # Output: Markdown format with document structure preserved
    output_format=OutputFormat.Markdown,
    include_document_structure=True,
    # Images: extract embedded images
    images=ImageExtractionConfig(
        extract_images=True,
    ),
    # Cache extracted results on disk
    use_cache=True,
)

result = extract_file_sync("report.pdf", config=config)

print(f"Content ({len(result.content)} chars):")
print(result.content[:200])

if result.chunks:
    print(f"\nChunks: {len(result.chunks)}")

print(f"Tables: {len(result.tables)}")

if result.detected_languages:
    print(f"Languages: {result.detected_languages}")

if result.extraction_method:
    print(f"Extraction method: {result.extraction_method}")