27 lines
625 B
Markdown
27 lines
625 B
Markdown
```python title="Python"
from kreuzberg import (
    extract_file_sync,
    ExtractionConfig,
    OcrConfig,
    TesseractConfig,
    ImagePreprocessingConfig,
)

# Build an extraction config that selects the Tesseract OCR backend and
# passes image-preprocessing options (target_dpi=300) to it.
config: ExtractionConfig = ExtractionConfig(
    ocr=OcrConfig(
        backend="tesseract",
        tesseract_config=TesseractConfig(
            preprocessing=ImagePreprocessingConfig(target_dpi=300),
        ),
    ),
)

# Extract "scanned.pdf" using the configuration above.
result = extract_file_sync("scanned.pdf", config=config)

# Summarize the result: size of the extracted content and number of tables.
content_length: int = len(result.content)
table_count: int = len(result.tables)

print(f"Content length: {content_length} characters")
print(f"Tables detected: {table_count}")
```
|