This commit is contained in:
93
packages/python/LICENSE
generated
Normal file
93
packages/python/LICENSE
generated
Normal file
@@ -0,0 +1,93 @@
|
||||
Elastic License 2.0 (ELv2)
|
||||
|
||||
Copyright 2025-2026 Kreuzberg, Inc.
|
||||
|
||||
Acceptance
|
||||
|
||||
By using the software, you agree to all of the terms and conditions below.
|
||||
|
||||
Copyright License
|
||||
|
||||
The licensor grants you a non-exclusive, royalty-free, worldwide,
|
||||
non-sublicensable, non-transferable license to use, copy, distribute, make
|
||||
available, and prepare derivative works of the software, in each case subject to
|
||||
the limitations and conditions below.
|
||||
|
||||
Limitations
|
||||
|
||||
You may not provide the software to third parties as a hosted or managed
|
||||
service, where the service provides users with access to any substantial set of
|
||||
the features or functionality of the software.
|
||||
|
||||
You may not move, change, disable, or circumvent the license key functionality
|
||||
in the software, and you may not remove or obscure any functionality in the
|
||||
software that is protected by the license key.
|
||||
|
||||
You may not alter, remove, or obscure any licensing, copyright, or other notices
|
||||
of the licensor in the software. Any use of the licensor's trademarks is subject
|
||||
to applicable law.
|
||||
|
||||
Patents
|
||||
|
||||
The licensor grants you a license, under any patent claims the licensor can
|
||||
license, or becomes able to license, to make, have made, use, sell, offer for
|
||||
sale, import and have imported the software, in each case subject to the
|
||||
limitations and conditions in this license. This license does not cover any
|
||||
patent claims that you cause to be infringed by modifications or additions to the
|
||||
software. If you or your company make any written claim that the software
|
||||
infringes or contributes to infringement of any patent, your patent license for
|
||||
the software granted under these terms ends immediately. If your company makes
|
||||
such a claim, your patent license ends immediately for work on behalf of your
|
||||
company.
|
||||
|
||||
Notices
|
||||
|
||||
You must ensure that anyone who gets a copy of any part of the software from you
|
||||
also gets a copy of these terms.
|
||||
|
||||
If you modify the software, you must include in any modified copies of the
|
||||
software prominent notices stating that you have modified the software.
|
||||
|
||||
No Other Rights
|
||||
|
||||
These terms do not imply any licenses other than those expressly granted in
|
||||
these terms.
|
||||
|
||||
Termination
|
||||
|
||||
If you use the software in violation of these terms, such use is not licensed,
|
||||
and your licenses will automatically terminate. If the licensor provides you with
|
||||
a notice of your violation, and you cease all violation of this license no later
|
||||
than 30 days after you receive that notice, your licenses will be reinstated
|
||||
retroactively. However, if you violate these terms after such reinstatement, any
|
||||
additional violation of these terms will cause your licenses to terminate
|
||||
automatically and permanently.
|
||||
|
||||
No Liability
|
||||
|
||||
As far as the law allows, the software comes as is, without any warranty or
|
||||
condition, and the licensor will not be liable to you for any damages arising out
|
||||
of these terms or the use or nature of the software, under any kind of legal
|
||||
claim.
|
||||
|
||||
Definitions
|
||||
|
||||
The licensor is the entity offering these terms, and the software is the
|
||||
software the licensor makes available under these terms, including any portion
|
||||
of it.
|
||||
|
||||
you refers to the individual or entity agreeing to these terms.
|
||||
|
||||
your company is any legal entity, sole proprietorship, or other kind of
|
||||
organization that you work for, plus all organizations that have control over,
|
||||
are under the control of, or are under common control with that organization.
|
||||
control means ownership of substantially all the assets of an entity, or the
|
||||
power to direct its management and policies by vote, contract, or otherwise.
|
||||
Control can be direct or indirect.
|
||||
|
||||
your licenses are all the licenses granted to you for the software under these
|
||||
terms.
|
||||
|
||||
use means anything you do with the software requiring one of your licenses.
|
||||
|
||||
trademark means trademarks, service marks, and similar rights.
|
||||
600
packages/python/README.md
generated
Normal file
600
packages/python/README.md
generated
Normal file
@@ -0,0 +1,600 @@
|
||||
# Kreuzberg
|
||||
|
||||
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
||||
<a href="https://github.com/kreuzberg-dev/alef">
|
||||
<img src="https://img.shields.io/badge/Bindings-alef%20%D7%90-007ec6" alt="Bindings">
|
||||
</a>
|
||||
<!-- Language Bindings -->
|
||||
<a href="https://crates.io/crates/kreuzberg">
|
||||
<img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
|
||||
</a>
|
||||
<a href="https://pypi.org/project/kreuzberg/">
|
||||
<img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
|
||||
</a>
|
||||
<a href="https://www.npmjs.com/package/@kreuzberg/node">
|
||||
<img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
|
||||
</a>
|
||||
<a href="https://www.npmjs.com/package/@kreuzberg/wasm">
|
||||
<img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
|
||||
</a>
|
||||
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
|
||||
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/go/v5">
|
||||
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v5*" alt="Go">
|
||||
</a>
|
||||
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
||||
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
||||
</a>
|
||||
<a href="https://packagist.org/packages/kreuzberg/kreuzberg">
|
||||
<img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
|
||||
</a>
|
||||
<a href="https://rubygems.org/gems/kreuzberg">
|
||||
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
|
||||
</a>
|
||||
<a href="https://hex.pm/packages/kreuzberg">
|
||||
<img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
|
||||
</a>
|
||||
<a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
|
||||
<img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
|
||||
</a>
|
||||
<a href="https://pub.dev/packages/kreuzberg">
|
||||
<img src="https://img.shields.io/pub/v/kreuzberg?label=Dart&color=007ec6" alt="Dart">
|
||||
</a>
|
||||
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg-android">
|
||||
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg-android?label=Kotlin&color=007ec6" alt="Kotlin">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/swift">
|
||||
<img src="https://img.shields.io/badge/Swift-SPM-007ec6" alt="Swift">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/tree/main/packages/zig">
|
||||
<img src="https://img.shields.io/badge/Zig-package-007ec6" alt="Zig">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
||||
<img src="https://img.shields.io/badge/C-FFI-007ec6" alt="C FFI">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
||||
<img src="https://img.shields.io/badge/Docker-ghcr.io-007ec6?logo=docker&logoColor=white" alt="Docker">
|
||||
</a>
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/charts%2Fkreuzberg">
|
||||
<img src="https://img.shields.io/badge/Helm-ghcr.io-007ec6?logo=helm&logoColor=white" alt="Helm">
|
||||
</a>
|
||||
|
||||
<!-- Project Info -->
|
||||
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
||||
<img src="https://img.shields.io/badge/License-Elastic--2.0-007ec6" alt="License">
|
||||
</a>
|
||||
<a href="https://docs.kreuzberg.dev">
|
||||
<img src="https://img.shields.io/badge/Docs-kreuzberg-007ec6" alt="Documentation">
|
||||
</a>
|
||||
<a href="https://huggingface.co/Kreuzberg">
|
||||
<img src="https://img.shields.io/badge/Hugging%20Face-Kreuzberg-007ec6" alt="Hugging Face">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div align="center" style="margin: 24px 0 0;">
|
||||
<a href="https://kreuzberg.dev">
|
||||
<img alt="Kreuzberg" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div align="center" style="display: flex; flex-wrap: wrap; gap: 12px; justify-content: center; margin: 28px 0 24px;">
|
||||
<a href="https://discord.gg/xt9WY3GnKR">
|
||||
<img height="22" src="https://img.shields.io/badge/Discord-Chat-007ec6?logo=discord&logoColor=white" alt="Join Discord">
|
||||
</a>
|
||||
<a href="https://docs.kreuzberg.dev/demo.html">
|
||||
<img height="22" src="https://img.shields.io/badge/Live%20Demo-Open-007ec6?logo=webassembly&logoColor=white" alt="Live Demo">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
Extract text, tables, images, and metadata from 90+ file formats (including PDF, Office documents, and images) and 300+ programming languages. Native Python bindings with async/await support, multiple OCR backends (Tesseract, EasyOCR, PaddleOCR), and an extensible plugin system.
|
||||
|
||||
## What This Package Provides
|
||||
|
||||
- **Python-native extraction** — sync and async APIs for files, bytes, URLs, and batch ingestion.
|
||||
- **Structured results** — text, tables, images, metadata, language detection, chunks, and warnings in typed Python objects.
|
||||
- **OCR choices** — Tesseract, EasyOCR, PaddleOCR, and VLM OCR where configured.
|
||||
- **Same Rust engine as every binding** — behavior matches the Node.js, Ruby, Go, Java, .NET, PHP, Elixir, R, Dart, Swift, Zig, WASM, and C FFI packages.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install kreuzberg
|
||||
```
|
||||
|
||||
### With OCR Support
|
||||
|
||||
```bash
|
||||
pip install "kreuzberg[easyocr]"
|
||||
pip install "kreuzberg[paddleocr]"
|
||||
```
|
||||
|
||||
### All Features
|
||||
|
||||
```bash
|
||||
pip install "kreuzberg[all]"
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig
|
||||
|
||||
async def main() -> None:
|
||||
config = ExtractionConfig(
|
||||
use_cache=True,
|
||||
enable_quality_processing=True
|
||||
)
|
||||
result = await extract_file("document.pdf", config=config)
|
||||
print(result.content)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Simple Extraction
|
||||
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from kreuzberg import extract_file
|
||||
|
||||
async def main() -> None:
|
||||
file_path: Path = Path("document.pdf")
|
||||
|
||||
result = await extract_file(file_path)
|
||||
|
||||
print(f"Content: {result.content}")
|
||||
print(f"Format: {result.metadata.format.format_type if result.metadata.format else None}")
|
||||
print(f"Tables: {len(result.tables)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Reading Content
|
||||
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file
|
||||
|
||||
async def main() -> None:
|
||||
result = await extract_file("document.pdf")
|
||||
|
||||
content: str = result.content
|
||||
tables: int = len(result.tables)
|
||||
format_type: str | None = result.metadata.format.format_type if result.metadata.format else None
|
||||
|
||||
print(f"Content length: {len(content)} characters")
|
||||
print(f"Tables found: {tables}")
|
||||
print(f"Format: {format_type}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## OCR Support
|
||||
|
||||
### Using OCR
|
||||
|
||||
```python title="Python"
|
||||
import asyncio
|
||||
from kreuzberg import extract_file, ExtractionConfig, OcrConfig, TesseractConfig
|
||||
|
||||
async def main() -> None:
|
||||
config = ExtractionConfig(
|
||||
force_ocr=True,
|
||||
ocr=OcrConfig(
|
||||
backend="tesseract",
|
||||
language="eng",
|
||||
tesseract_config=TesseractConfig(psm=3)
|
||||
)
|
||||
)
|
||||
result = await extract_file("scanned.pdf", config=config)
|
||||
print(result.content)
|
||||
print(f"Detected Languages: {result.detected_languages}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### EasyOCR (GPU-Accelerated)
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
ocr=OcrConfig(backend="easyocr", language="en")
|
||||
)
|
||||
|
||||
result = extract_file_sync(
|
||||
"photo.jpg",
|
||||
config=config,
|
||||
easyocr_kwargs={"use_gpu": True}
|
||||
)
|
||||
```
|
||||
|
||||
### PaddleOCR (Complex Layouts)
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
ocr=OcrConfig(backend="paddleocr", language="ch")
|
||||
)
|
||||
|
||||
result = extract_file_sync(
|
||||
"invoice.pdf",
|
||||
config=config,
|
||||
)
|
||||
```
|
||||
|
||||
## Table Extraction
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig, TesseractConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
ocr=OcrConfig(
|
||||
backend="tesseract",
|
||||
tesseract_config=TesseractConfig(
|
||||
enable_table_detection=True
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = extract_file_sync("invoice.pdf", config=config)
|
||||
|
||||
for table in result.tables:
|
||||
print(table.markdown)
|
||||
print(table.cells)
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Complete Configuration Example
|
||||
|
||||
```python
|
||||
from kreuzberg import (
|
||||
extract_file_sync,
|
||||
ExtractionConfig,
|
||||
OcrConfig,
|
||||
TesseractConfig,
|
||||
ChunkingConfig,
|
||||
ImageExtractionConfig,
|
||||
PdfConfig,
|
||||
TokenReductionConfig,
|
||||
LanguageDetectionConfig,
|
||||
)
|
||||
|
||||
config = ExtractionConfig(
|
||||
use_cache=True,
|
||||
enable_quality_processing=True,
|
||||
ocr=OcrConfig(
|
||||
backend="tesseract",
|
||||
language="eng",
|
||||
tesseract_config=TesseractConfig(
|
||||
psm=6,
|
||||
enable_table_detection=True,
|
||||
min_confidence=50.0,
|
||||
),
|
||||
),
|
||||
force_ocr=False,
|
||||
chunking=ChunkingConfig(
|
||||
max_chars=1000,
|
||||
max_overlap=200,
|
||||
),
|
||||
images=ImageExtractionConfig(
|
||||
extract_images=True,
|
||||
target_dpi=300,
|
||||
max_image_dimension=4096,
|
||||
auto_adjust_dpi=True,
|
||||
),
|
||||
pdf_options=PdfConfig(
|
||||
extract_images=True,
|
||||
passwords=["password1", "password2"],
|
||||
extract_metadata=True,
|
||||
),
|
||||
token_reduction=TokenReductionConfig(
|
||||
mode="moderate",
|
||||
preserve_important_words=True,
|
||||
),
|
||||
language_detection=LanguageDetectionConfig(
|
||||
enabled=True,
|
||||
min_confidence=0.8,
|
||||
detect_multiple=False,
|
||||
),
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
```
|
||||
|
||||
### HTML Conversion Options & Batch Concurrency
|
||||
|
||||
```python
|
||||
from kreuzberg import ExtractionConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
max_concurrent_extractions=8,
|
||||
html_options={
|
||||
"extract_metadata": True,
|
||||
"wrap": True,
|
||||
"wrap_width": 100,
|
||||
"strip_tags": ["script", "style"],
|
||||
"preprocessing": {"enabled": True, "preset": "standard"},
|
||||
},
|
||||
)
|
||||
```
|
||||
|
||||
## Metadata Extraction
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_file_sync
|
||||
|
||||
result = extract_file_sync("document.pdf")
|
||||
|
||||
if result.images:
|
||||
print(f"Extracted {len(result.images)} inline images")
|
||||
|
||||
if result.chunks:
|
||||
print(f"First chunk tokens: {result.chunks[0]['metadata']['token_count']}")
|
||||
|
||||
print(result.metadata.get("pdf", {}))
|
||||
print(result.metadata.get("language"))
|
||||
print(result.metadata.get("format"))
|
||||
|
||||
if "pdf" in result.metadata:
|
||||
pdf_meta = result.metadata["pdf"]
|
||||
print(f"Title: {pdf_meta.get('title')}")
|
||||
print(f"Author: {pdf_meta.get('author')}")
|
||||
print(f"Pages: {pdf_meta.get('page_count')}")
|
||||
print(f"Created: {pdf_meta.get('creation_date')}")
|
||||
```
|
||||
|
||||
## Password-Protected PDFs
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
pdf_options=PdfConfig(
|
||||
passwords=["password1", "password2", "password3"]
|
||||
)
|
||||
)
|
||||
|
||||
result = extract_file_sync("protected.pdf", config=config)
|
||||
```
|
||||
|
||||
## Language Detection
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, LanguageDetectionConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
language_detection=LanguageDetectionConfig(enabled=True)
|
||||
)
|
||||
|
||||
result = extract_file_sync("multilingual.pdf", config=config)
|
||||
print(result.detected_languages)
|
||||
```
|
||||
|
||||
## Text Chunking
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
max_chars=1000,
|
||||
max_overlap=200,
|
||||
)
|
||||
)
|
||||
|
||||
result = extract_file_sync("long_document.pdf", config=config)
|
||||
|
||||
for chunk in result.chunks:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
## Extract from Bytes
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_bytes_sync
|
||||
|
||||
with open("document.pdf", "rb") as f:
|
||||
data = f.read()
|
||||
|
||||
result = extract_bytes_sync(data, "application/pdf")
|
||||
print(result.content)
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
### Extraction Functions
|
||||
|
||||
- `extract_file(file_path, mime_type=None, config=None, **kwargs)` – Async extraction
|
||||
- `extract_file_sync(file_path, mime_type=None, config=None, **kwargs)` – Sync extraction
|
||||
- `extract_bytes(data, mime_type, config=None, **kwargs)` – Async extraction from bytes
|
||||
- `extract_bytes_sync(data, mime_type, config=None, **kwargs)` – Sync extraction from bytes
|
||||
- `batch_extract_files(paths, config=None, **kwargs)` – Async batch extraction
|
||||
- `batch_extract_files_sync(paths, config=None, **kwargs)` – Sync batch extraction
|
||||
- `batch_extract_bytes(data_list, mime_types, config=None, **kwargs)` – Async batch from bytes
|
||||
- `batch_extract_bytes_sync(data_list, mime_types, config=None, **kwargs)` – Sync batch from bytes
|
||||
|
||||
### Configuration Classes
|
||||
|
||||
- `ExtractionConfig` – Main configuration
|
||||
- `OcrConfig` – OCR settings
|
||||
- `TesseractConfig` – Tesseract-specific options
|
||||
- `ChunkingConfig` – Text chunking settings
|
||||
- `ImageExtractionConfig` – Image extraction settings
|
||||
- `PdfConfig` – PDF-specific options
|
||||
- `TokenReductionConfig` – Token reduction settings
|
||||
- `LanguageDetectionConfig` – Language detection settings
|
||||
|
||||
### Result Types
|
||||
|
||||
- `ExtractionResult` – Main result object with `content`, `metadata`, `tables`, `detected_languages`, `chunks`
|
||||
- `Table` – Table with `cells`, `markdown`, `page_number`
|
||||
- `Metadata` – Typed metadata dictionary
|
||||
|
||||
### Exceptions
|
||||
|
||||
- `KreuzbergError` – Base exception
|
||||
- `ValidationError` – Invalid configuration or input
|
||||
- `ParsingError` – Document parsing failure
|
||||
- `OcrError` – OCR processing failure
|
||||
- `MissingDependencyError` – Missing optional dependency
|
||||
|
||||
## Examples
|
||||
|
||||
### Custom Processing
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_file_sync
|
||||
|
||||
result = extract_file_sync("document.pdf")
|
||||
|
||||
text = result.content
|
||||
text = text.lower()
|
||||
text = text.replace("old", "new")
|
||||
|
||||
print(text)
|
||||
```
|
||||
|
||||
### Multiple Files with Progress
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_file_sync
|
||||
from pathlib import Path
|
||||
|
||||
files = list(Path("documents").glob("*.pdf"))
|
||||
results = []
|
||||
|
||||
for i, file in enumerate(files, 1):
|
||||
print(f"Processing {i}/{len(files)}: {file.name}")
|
||||
result = extract_file_sync(str(file))
|
||||
results.append((file.name, result))
|
||||
|
||||
for name, result in results:
|
||||
print(f"{name}: {len(result.content)} characters")
|
||||
```
|
||||
|
||||
### Filter by Language
|
||||
|
||||
```python
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, LanguageDetectionConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
language_detection=LanguageDetectionConfig(enabled=True)
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
if result.detected_languages and "en" in result.detected_languages:
|
||||
print("English document detected")
|
||||
print(result.content)
|
||||
```
|
||||
|
||||
## System Requirements
|
||||
|
||||
### ONNX Runtime (for embeddings)
|
||||
|
||||
If using embeddings functionality, ONNX Runtime version 1.22.x must be installed:
|
||||
|
||||
```bash
|
||||
# macOS
|
||||
brew install onnxruntime
|
||||
|
||||
# Ubuntu/Debian (download from GitHub - Debian packages may have older versions)
|
||||
# Download from https://github.com/microsoft/onnxruntime/releases
|
||||
|
||||
# Windows
|
||||
# Download from https://github.com/microsoft/onnxruntime/releases
|
||||
```
|
||||
|
||||
**Important:** Kreuzberg requires ONNX Runtime version 1.22.x for embeddings.
|
||||
|
||||
Without ONNX Runtime, embeddings will raise `MissingDependencyError` with installation instructions.
|
||||
|
||||
### Tesseract OCR (Required for OCR)
|
||||
|
||||
```bash
|
||||
brew install tesseract
|
||||
```
|
||||
|
||||
```bash
|
||||
sudo apt-get install tesseract-ocr
|
||||
```
|
||||
|
||||
### Pandoc (Optional, for some formats)
|
||||
|
||||
```bash
|
||||
brew install pandoc
|
||||
```
|
||||
|
||||
```bash
|
||||
sudo apt-get install pandoc
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Import Error: No module named '\_kreuzberg'
|
||||
|
||||
This usually means the Rust extension wasn't built correctly. Try:
|
||||
|
||||
```bash
|
||||
pip install --force-reinstall --no-cache-dir kreuzberg
|
||||
```
|
||||
|
||||
### OCR Not Working
|
||||
|
||||
Make sure Tesseract is installed:
|
||||
|
||||
```bash
|
||||
tesseract --version
|
||||
```
|
||||
|
||||
### Memory Issues with Large PDFs
|
||||
|
||||
Use streaming or enable chunking:
|
||||
|
||||
```python
|
||||
config = ExtractionConfig(
|
||||
chunking=ChunkingConfig(max_chars=1000)
|
||||
)
|
||||
```
|
||||
|
||||
## PDFium Integration
|
||||
|
||||
PDF extraction is powered by PDFium, which is automatically bundled with this package. No system installation is required.
|
||||
|
||||
### Platform Support
|
||||
|
||||
| Platform | Status | Notes |
|
||||
| -------------- | ------ | ------- |
|
||||
| Linux x86_64 | ✅ | Bundled |
|
||||
| macOS ARM64 | ✅ | Bundled |
|
||||
| macOS x86_64 | ✅ | Bundled |
|
||||
| Windows x86_64 | ✅ | Bundled |
|
||||
|
||||
### Binary Size Impact
|
||||
|
||||
PDFium adds approximately 8-15 MB to the package size depending on platform. This ensures consistent PDF extraction across all environments without external dependencies.
|
||||
|
||||
## Documentation
|
||||
|
||||
For comprehensive documentation, visit [https://docs.kreuzberg.dev](https://docs.kreuzberg.dev).
|
||||
|
||||
## Part of Kreuzberg.dev
|
||||
|
||||
- [Kreuzberg Cloud](https://github.com/kreuzberg-dev/kreuzberg-cloud) — managed extraction API with SDKs, dashboards, and observability.
|
||||
- [kreuzcrawl](https://github.com/kreuzberg-dev/kreuzcrawl) — web crawling and scraping with HTML→Markdown and headless-Chrome fallback.
|
||||
- [html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown) — fast, lossless HTML→Markdown engine.
|
||||
- [liter-llm](https://github.com/kreuzberg-dev/liter-llm) — universal LLM API client with native bindings for 14 languages and 143 providers.
|
||||
- [tree-sitter-language-pack](https://github.com/kreuzberg-dev/tree-sitter-language-pack) — tree-sitter grammars and code-intelligence primitives.
|
||||
- [alef](https://github.com/kreuzberg-dev/alef) — the polyglot binding generator that produces this README and all per-language bindings.
|
||||
- [Discord](https://discord.gg/xt9WY3GnKR) — community, roadmap, announcements.
|
||||
|
||||
## License
|
||||
|
||||
Licensed under the Elastic License 2.0 (ELv2) — see [LICENSE](https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE) for details.
|
||||
497
packages/python/kreuzberg/__init__.py
generated
Normal file
497
packages/python/kreuzberg/__init__.py
generated
Normal file
@@ -0,0 +1,497 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
"""Public API for _kreuzberg.
|
||||
|
||||
Version: 5.0.0-rc.3
|
||||
"""
|
||||
|
||||
from ._kreuzberg import (
|
||||
AnnotationKind,
|
||||
ArchiveEntry,
|
||||
BatchBytesItem,
|
||||
BatchFileItem,
|
||||
BBox,
|
||||
BlockType,
|
||||
CacheStats,
|
||||
CellChange,
|
||||
Chunk,
|
||||
ChunkerType,
|
||||
ChunkMetadata,
|
||||
ChunkSizing,
|
||||
ChunkType,
|
||||
CodeContentMode,
|
||||
ContentLayer,
|
||||
ContributorRole,
|
||||
DbfFieldInfo,
|
||||
DetectionResult,
|
||||
DetectResponse,
|
||||
DiffHunk,
|
||||
DiffLine,
|
||||
DjotContent,
|
||||
DjotImage,
|
||||
DjotLink,
|
||||
DocumentNode,
|
||||
DocumentRelationship,
|
||||
DocumentRevision,
|
||||
Element,
|
||||
ElementMetadata,
|
||||
ElementType,
|
||||
EmailAttachment,
|
||||
EmailExtractionResult,
|
||||
EmbeddedChanges,
|
||||
EmbeddedDiff,
|
||||
EmbeddedFile,
|
||||
EmbeddingModelType,
|
||||
EmbeddingPreset,
|
||||
ErrorMetadata,
|
||||
ExcelSheet,
|
||||
ExcelWorkbook,
|
||||
ExecutionProviderType,
|
||||
ExtractedImage,
|
||||
ExtractedUri,
|
||||
ExtractionDiff,
|
||||
ExtractionMethod,
|
||||
Footnote,
|
||||
FormatMetadata,
|
||||
FormattedBlock,
|
||||
GridCell,
|
||||
HeaderMetadata,
|
||||
HeadingContext,
|
||||
HeadingLevel,
|
||||
HierarchicalBlock,
|
||||
HtmlTheme,
|
||||
ImageKind,
|
||||
ImageMetadataType,
|
||||
ImagePreprocessingMetadata,
|
||||
ImageType,
|
||||
InlineElement,
|
||||
InlineType,
|
||||
Keyword,
|
||||
KeywordAlgorithm,
|
||||
LayoutClass,
|
||||
LayoutDetection,
|
||||
LinkMetadata,
|
||||
LinkType,
|
||||
ListType,
|
||||
ModelPaths,
|
||||
NodeContent,
|
||||
OcrBackendType,
|
||||
OcrBoundingGeometry,
|
||||
OcrElementLevel,
|
||||
OcrExtractionResult,
|
||||
OcrPipelineConfig,
|
||||
OcrPipelineStage,
|
||||
OcrRotation,
|
||||
OcrTable,
|
||||
OcrTableBoundingBox,
|
||||
OrientationResult,
|
||||
OutputFormat,
|
||||
PaddleLanguage,
|
||||
PageBoundary,
|
||||
PageContent,
|
||||
PageHierarchy,
|
||||
PageInfo,
|
||||
PageStructure,
|
||||
PageUnitType,
|
||||
PdfAnnotation,
|
||||
PdfAnnotationType,
|
||||
PptxExtractionResult,
|
||||
ProcessingStage,
|
||||
ProcessingWarning,
|
||||
PSMMode,
|
||||
RecognizedTable,
|
||||
ReductionLevel,
|
||||
RelationshipKind,
|
||||
ResultFormat,
|
||||
RevisionAnchor,
|
||||
RevisionKind,
|
||||
StructuredData,
|
||||
StructuredDataResult,
|
||||
StructuredDataType,
|
||||
StructuredExtractionConfig,
|
||||
SupportedFormat,
|
||||
TableDiff,
|
||||
TableModel,
|
||||
TextAnnotation,
|
||||
TextDirection,
|
||||
TextExtractionResult,
|
||||
UriKind,
|
||||
XmlExtractionResult,
|
||||
YearRange,
|
||||
)
|
||||
from .api import (
|
||||
batch_extract_bytes,
|
||||
batch_extract_bytes_sync,
|
||||
batch_extract_files,
|
||||
batch_extract_files_sync,
|
||||
clear_document_extractors,
|
||||
clear_embedding_backends,
|
||||
clear_ocr_backends,
|
||||
clear_post_processors,
|
||||
clear_renderers,
|
||||
clear_validators,
|
||||
compare,
|
||||
detect_mime_type,
|
||||
detect_mime_type_from_bytes,
|
||||
embed_texts,
|
||||
embed_texts_async,
|
||||
extract_bytes,
|
||||
extract_bytes_sync,
|
||||
extract_file,
|
||||
extract_file_sync,
|
||||
get_embedding_preset,
|
||||
get_extensions_for_mime,
|
||||
list_document_extractors,
|
||||
list_embedding_backends,
|
||||
list_embedding_presets,
|
||||
list_ocr_backends,
|
||||
list_post_processors,
|
||||
list_renderers,
|
||||
list_validators,
|
||||
register_document_extractor,
|
||||
register_embedding_backend,
|
||||
register_ocr_backend,
|
||||
register_post_processor,
|
||||
register_renderer,
|
||||
register_validator,
|
||||
render_pdf_page_to_png,
|
||||
unregister_document_extractor,
|
||||
unregister_embedding_backend,
|
||||
unregister_ocr_backend,
|
||||
unregister_post_processor,
|
||||
unregister_renderer,
|
||||
unregister_validator,
|
||||
)
|
||||
from .exceptions import (
|
||||
CacheError,
|
||||
CancelledError,
|
||||
EmbeddingError,
|
||||
ImageProcessingError,
|
||||
IoError,
|
||||
KreuzbergError,
|
||||
KreuzbergTimeoutError,
|
||||
LockPoisonedError,
|
||||
MissingDependencyError,
|
||||
OcrError,
|
||||
OtherError,
|
||||
ParsingError,
|
||||
PluginError,
|
||||
SecurityError,
|
||||
SerializationError,
|
||||
UnsupportedFormatError,
|
||||
ValidationError,
|
||||
)
|
||||
from .options import (
|
||||
AccelerationConfig,
|
||||
ArchiveMetadata,
|
||||
BibtexMetadata,
|
||||
BoundingBox,
|
||||
ChunkingConfig,
|
||||
CitationMetadata,
|
||||
ContentFilterConfig,
|
||||
CoreProperties,
|
||||
CsvMetadata,
|
||||
DbfMetadata,
|
||||
DiffOptions,
|
||||
DocumentStructure,
|
||||
DocxAppProperties,
|
||||
DocxMetadata,
|
||||
EmailConfig,
|
||||
EmailMetadata,
|
||||
EmbeddingConfig,
|
||||
EpubMetadata,
|
||||
ExcelMetadata,
|
||||
ExtractionConfig,
|
||||
ExtractionResult,
|
||||
FictionBookMetadata,
|
||||
FileExtractionConfig,
|
||||
HierarchyConfig,
|
||||
HtmlMetadata,
|
||||
HtmlOutputConfig,
|
||||
ImageExtractionConfig,
|
||||
ImageMetadata,
|
||||
ImagePreprocessingConfig,
|
||||
JatsMetadata,
|
||||
KeywordConfig,
|
||||
LanguageDetectionConfig,
|
||||
LayoutDetectionConfig,
|
||||
LayoutRegion,
|
||||
LlmConfig,
|
||||
LlmUsage,
|
||||
Metadata,
|
||||
OcrConfidence,
|
||||
OcrConfig,
|
||||
OcrElement,
|
||||
OcrElementConfig,
|
||||
OcrMetadata,
|
||||
OcrQualityThresholds,
|
||||
PaddleOcrConfig,
|
||||
PageConfig,
|
||||
PdfConfig,
|
||||
PdfMetadata,
|
||||
PostProcessorConfig,
|
||||
PptxAppProperties,
|
||||
PptxMetadata,
|
||||
PstMetadata,
|
||||
RakeParams,
|
||||
RevisionDelta,
|
||||
SecurityLimits,
|
||||
ServerConfig,
|
||||
Table,
|
||||
TableCell,
|
||||
TableGrid,
|
||||
TesseractConfig,
|
||||
TextMetadata,
|
||||
TokenReductionConfig,
|
||||
TokenReductionOptions,
|
||||
TreeSitterConfig,
|
||||
TreeSitterProcessConfig,
|
||||
XlsxAppProperties,
|
||||
XmlMetadata,
|
||||
YakeParams,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"AccelerationConfig",
|
||||
"AnnotationKind",
|
||||
"ArchiveEntry",
|
||||
"ArchiveMetadata",
|
||||
"BBox",
|
||||
"BatchBytesItem",
|
||||
"BatchFileItem",
|
||||
"BibtexMetadata",
|
||||
"BlockType",
|
||||
"BoundingBox",
|
||||
"CacheError",
|
||||
"CacheStats",
|
||||
"CancelledError",
|
||||
"CellChange",
|
||||
"Chunk",
|
||||
"ChunkMetadata",
|
||||
"ChunkSizing",
|
||||
"ChunkType",
|
||||
"ChunkerType",
|
||||
"ChunkingConfig",
|
||||
"CitationMetadata",
|
||||
"CodeContentMode",
|
||||
"ContentFilterConfig",
|
||||
"ContentLayer",
|
||||
"ContributorRole",
|
||||
"CoreProperties",
|
||||
"CsvMetadata",
|
||||
"DbfFieldInfo",
|
||||
"DbfMetadata",
|
||||
"DetectResponse",
|
||||
"DetectionResult",
|
||||
"DiffHunk",
|
||||
"DiffLine",
|
||||
"DiffOptions",
|
||||
"DjotContent",
|
||||
"DjotImage",
|
||||
"DjotLink",
|
||||
"DocumentNode",
|
||||
"DocumentRelationship",
|
||||
"DocumentRevision",
|
||||
"DocumentStructure",
|
||||
"DocxAppProperties",
|
||||
"DocxMetadata",
|
||||
"Element",
|
||||
"ElementMetadata",
|
||||
"ElementType",
|
||||
"EmailAttachment",
|
||||
"EmailConfig",
|
||||
"EmailExtractionResult",
|
||||
"EmailMetadata",
|
||||
"EmbeddedChanges",
|
||||
"EmbeddedDiff",
|
||||
"EmbeddedFile",
|
||||
"EmbeddingConfig",
|
||||
"EmbeddingError",
|
||||
"EmbeddingModelType",
|
||||
"EmbeddingPreset",
|
||||
"EpubMetadata",
|
||||
"ErrorMetadata",
|
||||
"ExcelMetadata",
|
||||
"ExcelSheet",
|
||||
"ExcelWorkbook",
|
||||
"ExecutionProviderType",
|
||||
"ExtractedImage",
|
||||
"ExtractedUri",
|
||||
"ExtractionConfig",
|
||||
"ExtractionDiff",
|
||||
"ExtractionMethod",
|
||||
"ExtractionResult",
|
||||
"FictionBookMetadata",
|
||||
"FileExtractionConfig",
|
||||
"Footnote",
|
||||
"FormatMetadata",
|
||||
"FormattedBlock",
|
||||
"GridCell",
|
||||
"HeaderMetadata",
|
||||
"HeadingContext",
|
||||
"HeadingLevel",
|
||||
"HierarchicalBlock",
|
||||
"HierarchyConfig",
|
||||
"HtmlMetadata",
|
||||
"HtmlOutputConfig",
|
||||
"HtmlTheme",
|
||||
"ImageExtractionConfig",
|
||||
"ImageKind",
|
||||
"ImageMetadata",
|
||||
"ImageMetadataType",
|
||||
"ImagePreprocessingConfig",
|
||||
"ImagePreprocessingMetadata",
|
||||
"ImageProcessingError",
|
||||
"ImageType",
|
||||
"InlineElement",
|
||||
"InlineType",
|
||||
"IoError",
|
||||
"JatsMetadata",
|
||||
"Keyword",
|
||||
"KeywordAlgorithm",
|
||||
"KeywordConfig",
|
||||
"KreuzbergError",
|
||||
"KreuzbergTimeoutError",
|
||||
"LanguageDetectionConfig",
|
||||
"LayoutClass",
|
||||
"LayoutDetection",
|
||||
"LayoutDetectionConfig",
|
||||
"LayoutRegion",
|
||||
"LinkMetadata",
|
||||
"LinkType",
|
||||
"ListType",
|
||||
"LlmConfig",
|
||||
"LlmUsage",
|
||||
"LockPoisonedError",
|
||||
"Metadata",
|
||||
"MissingDependencyError",
|
||||
"ModelPaths",
|
||||
"NodeContent",
|
||||
"OcrBackendType",
|
||||
"OcrBoundingGeometry",
|
||||
"OcrConfidence",
|
||||
"OcrConfig",
|
||||
"OcrElement",
|
||||
"OcrElementConfig",
|
||||
"OcrElementLevel",
|
||||
"OcrError",
|
||||
"OcrExtractionResult",
|
||||
"OcrMetadata",
|
||||
"OcrPipelineConfig",
|
||||
"OcrPipelineStage",
|
||||
"OcrQualityThresholds",
|
||||
"OcrRotation",
|
||||
"OcrTable",
|
||||
"OcrTableBoundingBox",
|
||||
"OrientationResult",
|
||||
"OtherError",
|
||||
"OutputFormat",
|
||||
"PSMMode",
|
||||
"PaddleLanguage",
|
||||
"PaddleOcrConfig",
|
||||
"PageBoundary",
|
||||
"PageConfig",
|
||||
"PageContent",
|
||||
"PageHierarchy",
|
||||
"PageInfo",
|
||||
"PageStructure",
|
||||
"PageUnitType",
|
||||
"ParsingError",
|
||||
"PdfAnnotation",
|
||||
"PdfAnnotationType",
|
||||
"PdfConfig",
|
||||
"PdfMetadata",
|
||||
"PluginError",
|
||||
"PostProcessorConfig",
|
||||
"PptxAppProperties",
|
||||
"PptxExtractionResult",
|
||||
"PptxMetadata",
|
||||
"ProcessingStage",
|
||||
"ProcessingWarning",
|
||||
"PstMetadata",
|
||||
"RakeParams",
|
||||
"RecognizedTable",
|
||||
"ReductionLevel",
|
||||
"RelationshipKind",
|
||||
"ResultFormat",
|
||||
"RevisionAnchor",
|
||||
"RevisionDelta",
|
||||
"RevisionKind",
|
||||
"SecurityError",
|
||||
"SecurityLimits",
|
||||
"SerializationError",
|
||||
"ServerConfig",
|
||||
"StructuredData",
|
||||
"StructuredDataResult",
|
||||
"StructuredDataType",
|
||||
"StructuredExtractionConfig",
|
||||
"SupportedFormat",
|
||||
"Table",
|
||||
"TableCell",
|
||||
"TableDiff",
|
||||
"TableGrid",
|
||||
"TableModel",
|
||||
"TesseractConfig",
|
||||
"TextAnnotation",
|
||||
"TextDirection",
|
||||
"TextExtractionResult",
|
||||
"TextMetadata",
|
||||
"TokenReductionConfig",
|
||||
"TokenReductionOptions",
|
||||
"TreeSitterConfig",
|
||||
"TreeSitterProcessConfig",
|
||||
"UnsupportedFormatError",
|
||||
"UriKind",
|
||||
"ValidationError",
|
||||
"XlsxAppProperties",
|
||||
"XmlExtractionResult",
|
||||
"XmlMetadata",
|
||||
"YakeParams",
|
||||
"YearRange",
|
||||
"batch_extract_bytes",
|
||||
"batch_extract_bytes_sync",
|
||||
"batch_extract_files",
|
||||
"batch_extract_files_sync",
|
||||
"clear_document_extractors",
|
||||
"clear_embedding_backends",
|
||||
"clear_ocr_backends",
|
||||
"clear_post_processors",
|
||||
"clear_renderers",
|
||||
"clear_validators",
|
||||
"compare",
|
||||
"detect_mime_type",
|
||||
"detect_mime_type_from_bytes",
|
||||
"embed_texts",
|
||||
"embed_texts_async",
|
||||
"extract_bytes",
|
||||
"extract_bytes_sync",
|
||||
"extract_file",
|
||||
"extract_file_sync",
|
||||
"get_embedding_preset",
|
||||
"get_extensions_for_mime",
|
||||
"list_document_extractors",
|
||||
"list_embedding_backends",
|
||||
"list_embedding_presets",
|
||||
"list_ocr_backends",
|
||||
"list_post_processors",
|
||||
"list_renderers",
|
||||
"list_validators",
|
||||
"register_document_extractor",
|
||||
"register_embedding_backend",
|
||||
"register_ocr_backend",
|
||||
"register_post_processor",
|
||||
"register_renderer",
|
||||
"register_validator",
|
||||
"render_pdf_page_to_png",
|
||||
"unregister_document_extractor",
|
||||
"unregister_embedding_backend",
|
||||
"unregister_ocr_backend",
|
||||
"unregister_post_processor",
|
||||
"unregister_renderer",
|
||||
"unregister_validator",
|
||||
]
|
||||
|
||||
# PEP 440 normalized form; matches `version` in pyproject.toml so that
# `kreuzberg.__version__` equals the installed distribution's version string.
__version__ = "5.0.0rc3"
|
||||
3057
packages/python/kreuzberg/_kreuzberg.pyi
generated
Normal file
3057
packages/python/kreuzberg/_kreuzberg.pyi
generated
Normal file
File diff suppressed because it is too large
Load Diff
1171
packages/python/kreuzberg/api.py
generated
Normal file
1171
packages/python/kreuzberg/api.py
generated
Normal file
File diff suppressed because it is too large
Load Diff
74
packages/python/kreuzberg/exceptions.py
generated
Normal file
74
packages/python/kreuzberg/exceptions.py
generated
Normal file
@@ -0,0 +1,74 @@
|
||||
# This file is auto-generated by alef — DO NOT EDIT.
|
||||
# alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||||
# To regenerate: alef generate
|
||||
# To verify freshness: alef verify --exit-code
|
||||
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
||||
"""Exception hierarchy."""
|
||||
|
||||
|
||||
class KreuzbergError(Exception):
    """Main error type for all Kreuzberg operations.

    Every other exception class defined in this module subclasses it, so
    catching ``KreuzbergError`` catches all of them.
    """
|
||||
|
||||
|
||||
class IoError(KreuzbergError):
    """I/O error."""
|
||||
|
||||
|
||||
class ParsingError(KreuzbergError):
|
||||
"""Parsing error."""
|
||||
|
||||
|
||||
class OcrError(KreuzbergError):
    """OCR error."""
|
||||
|
||||
|
||||
class ValidationError(KreuzbergError):
|
||||
"""Validation error."""
|
||||
|
||||
|
||||
class CacheError(KreuzbergError):
|
||||
"""Cache error."""
|
||||
|
||||
|
||||
class ImageProcessingError(KreuzbergError):
|
||||
"""Image processing error."""
|
||||
|
||||
|
||||
class SerializationError(KreuzbergError):
|
||||
"""Serialization error."""
|
||||
|
||||
|
||||
class MissingDependencyError(KreuzbergError):
|
||||
"""Missing dependency error."""
|
||||
|
||||
|
||||
class PluginError(KreuzbergError):
|
||||
"""Plugin error."""
|
||||
|
||||
|
||||
class LockPoisonedError(KreuzbergError):
|
||||
"""Lock poisoned error."""
|
||||
|
||||
|
||||
class UnsupportedFormatError(KreuzbergError):
|
||||
"""Unsupported format error."""
|
||||
|
||||
|
||||
class EmbeddingError(KreuzbergError):
|
||||
"""Embedding error."""
|
||||
|
||||
|
||||
class KreuzbergTimeoutError(KreuzbergError):
|
||||
"""Kreuzberg timeout error."""
|
||||
|
||||
|
||||
class CancelledError(KreuzbergError):
|
||||
"""Cancelled error."""
|
||||
|
||||
|
||||
class SecurityError(KreuzbergError):
|
||||
"""Security error."""
|
||||
|
||||
|
||||
class OtherError(KreuzbergError):
|
||||
"""Other error."""
|
||||
2671
packages/python/kreuzberg/options.py
generated
Normal file
2671
packages/python/kreuzberg/options.py
generated
Normal file
File diff suppressed because it is too large
Load Diff
0
packages/python/kreuzberg/py.typed
generated
Normal file
0
packages/python/kreuzberg/py.typed
generated
Normal file
98
packages/python/pyproject.toml
generated
Normal file
98
packages/python/pyproject.toml
generated
Normal file
@@ -0,0 +1,98 @@
|
||||
[build-system]
|
||||
build-backend = "maturin"
|
||||
requires = [ "maturin>=1,<2" ]
|
||||
|
||||
[project]
|
||||
name = "kreuzberg"
|
||||
version = "5.0.0rc3"
|
||||
description = "High-performance document intelligence library"
|
||||
keywords = [ "document", "extraction", "ocr", "pdf", "text" ]
|
||||
license = "Elastic-2.0"
|
||||
license-files = [ "LICENSE" ]
|
||||
authors = [ { name = "Na'aman Hirschfeld", email = "naaman@kreuzberg.dev" } ]
|
||||
requires-python = ">=3.10"
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Programming Language :: Python :: 3.14",
|
||||
]
|
||||
urls.repository = "https://github.com/kreuzberg-dev/kreuzberg"
|
||||
urls.homepage = "https://kreuzberg.dev"
|
||||
|
||||
[dependency-groups]
|
||||
dev = [ "mypy>=1.19", "ruff>=0.14.8" ]
|
||||
|
||||
[tool.maturin]
|
||||
module-name = "kreuzberg._kreuzberg"
|
||||
manifest-path = "../../crates/kreuzberg-py/Cargo.toml"
|
||||
# abi3-py310 produces a single wheel per platform that loads on Python 3.10+,
|
||||
# avoiding a per-Python-version build matrix.
|
||||
features = [ "pyo3/extension-module", "pyo3/abi3-py310" ]
|
||||
python-packages = [ "kreuzberg" ]
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py310"
|
||||
line-length = 120
|
||||
format.docstring-code-line-length = 120
|
||||
format.docstring-code-format = true
|
||||
lint.select = [ "ALL" ]
|
||||
lint.ignore = [
|
||||
"ANN401",
|
||||
"ASYNC109",
|
||||
"ASYNC110",
|
||||
"BLE001",
|
||||
"COM812",
|
||||
"D100",
|
||||
"D104",
|
||||
"D107",
|
||||
"D205",
|
||||
"E501",
|
||||
"EM",
|
||||
"FBT",
|
||||
"FIX",
|
||||
"ISC001",
|
||||
"PD011",
|
||||
"PGH003",
|
||||
"PLR2004",
|
||||
"PLW0603",
|
||||
"S104",
|
||||
"S110",
|
||||
"S603",
|
||||
"TD",
|
||||
"TRY",
|
||||
]
|
||||
lint.per-file-ignores."kreuzberg/__init__.py" = [ "I001" ]
|
||||
# The alef Python codegen still emits cosmetic warnings on the wrapper
|
||||
# modules: api.py keeps the legacy `from typing import AsyncIterator` and a
|
||||
# single-line import block, options.py carries # noqa: TC001 / F401 markers
|
||||
# that turn out unused on every regen, and the __init__.py import blocks re-sort with
|
||||
# a different convention. Silence these specific rules on the wrappers until
|
||||
# the codegen is updated to emit ruff-clean output.
|
||||
lint.per-file-ignores."kreuzberg/api.py" = [ "F401", "I001", "UP035" ]
|
||||
lint.per-file-ignores."kreuzberg/options.py" = [ "F401", "RUF100" ]
|
||||
lint.per-file-ignores."tests/**" = [ "ANN", "D103", "PLR2004", "S101" ]
|
||||
lint.mccabe.max-complexity = 15
|
||||
lint.pydocstyle.convention = "google"
|
||||
lint.pylint.max-args = 10
|
||||
lint.pylint.max-branches = 15
|
||||
lint.pylint.max-returns = 10
|
||||
|
||||
[tool.mypy]
|
||||
python_version = "3.10"
|
||||
strict = true
|
||||
show_error_codes = true
|
||||
implicit_reexport = false
|
||||
namespace_packages = true
|
||||
overrides = [
|
||||
# The alef-emitted `api.py` wrapper has a structural mismatch between its
|
||||
# `options.*` dataclass signatures and the `_internal_bindings.*` pyclass
|
||||
# types pyo3 accepts/returns at runtime. pyo3 reconciles them dynamically via
|
||||
# FromPyObject — the Python e2e suite exercises the runtime path — but mypy
|
||||
# sees only the static-type discrepancy. Disable the four error codes the
|
||||
# discrepancy raises until the codegen emits matching `_to_rust_*` calls and
|
||||
# casts the return values.
|
||||
{ module = "kreuzberg.api", disable_error_code = [ "call-arg", "arg-type", "return-value", "attr-defined" ] },
|
||||
]
|
||||
Reference in New Issue
Block a user