Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
```python title="Python"
from kreuzberg import ExtractionConfig, LanguageDetectionConfig

# Language detection settings: enabled, 0.9 confidence floor, and
# multiple-language detection switched on.
detection = LanguageDetectionConfig(
    enabled=True,
    min_confidence=0.9,
    detect_multiple=True,
)

# Attach the detection settings to the overall extraction configuration.
config = ExtractionConfig(language_detection=detection)

print(config.language_detection)
```

View File

@@ -0,0 +1,16 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, LanguageDetectionConfig

# Enable language detection with a 0.8 confidence floor and allow more than
# one language to be reported.
config = ExtractionConfig(
    language_detection=LanguageDetectionConfig(
        enabled=True,
        min_confidence=0.8,
        detect_multiple=True,
    ),
)

result = extract_file_sync("multilingual_document.pdf", config=config)

# detected_languages may be empty or None; only print when something was found.
if result.detected_languages:
    print(f"Detected languages: {', '.join(result.detected_languages)}")
```

View File

@@ -0,0 +1,80 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig

# ---------------------------------------------------------------------------
# PDF example
# ---------------------------------------------------------------------------
result = extract_file_sync("document.pdf", config=ExtractionConfig())

# Metadata is flat — format-specific fields are at the top level
metadata = result.metadata
if metadata.get("page_count"):
    print(f"Pages: {metadata['page_count']}")
if metadata.get("title"):
    print(f"Title: {metadata['title']}")
if metadata.get("authors"):
    print(f"Authors: {', '.join(metadata['authors'])}")

# ---------------------------------------------------------------------------
# HTML example
# ---------------------------------------------------------------------------
result = extract_file_sync("page.html", config=ExtractionConfig())
metadata = result.metadata
if metadata.get("title"):
    print(f"Title: {metadata['title']}")
if metadata.get("description"):
    print(f"Description: {metadata['description']}")

# Access keywords as array
keywords = metadata.get('keywords', [])
if keywords:
    print(f"Keywords: {', '.join(keywords)}")

# Access canonical URL (renamed from canonical)
canonical_url = metadata.get('canonical_url')
if canonical_url:
    print(f"Canonical URL: {canonical_url}")

# Access Open Graph fields from map
open_graph = metadata.get('open_graph', {})
if open_graph:
    if 'image' in open_graph:
        print(f"Open Graph Image: {open_graph['image']}")
    if 'title' in open_graph:
        print(f"Open Graph Title: {open_graph['title']}")
    if 'type' in open_graph:
        print(f"Open Graph Type: {open_graph['type']}")

# Access Twitter Card fields from map
twitter_card = metadata.get('twitter_card', {})
if twitter_card:
    if 'card' in twitter_card:
        print(f"Twitter Card Type: {twitter_card['card']}")
    if 'creator' in twitter_card:
        print(f"Twitter Creator: {twitter_card['creator']}")

# Access new fields
language = metadata.get('language')
if language:
    print(f"Language: {language}")
text_direction = metadata.get('text_direction')
if text_direction:
    print(f"Text Direction: {text_direction}")

# Access headers (each entry is indexed by its 'text' key)
headers = metadata.get('headers', [])
if headers:
    print(f"Headers: {', '.join(h['text'] for h in headers)}")

# Access links
links = metadata.get('links', [])
if links:
    for link in links:
        print(f"Link: {link.get('href')} ({link.get('text')})")

# Access images
images = metadata.get('images', [])
if images:
    for image in images:
        print(f"Image: {image.get('src')}")

# Access structured data
structured_data = metadata.get('structured_data', [])
if structured_data:
    print(f"Structured data items: {len(structured_data)}")
```

View File

@@ -0,0 +1,17 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig

result = extract_file_sync("document.pdf", config=ExtractionConfig())

# NOTE(review): this example reads result.metadata via attributes, whereas
# mapping-style access (.get / []) is also seen for metadata — confirm which
# form the installed Kreuzberg version supports.
if result.metadata.pages and result.metadata.pages.boundaries:
    boundaries = result.metadata.pages.boundaries

    # Boundaries are byte offsets, so slice the UTF-8 encoded content rather
    # than the str (character indices differ for non-ASCII text).
    content_bytes = result.content.encode("utf-8")

    # Show only the first three pages.
    for boundary in boundaries[:3]:
        page_bytes = content_bytes[boundary.byte_start:boundary.byte_end]
        page_text = page_bytes.decode("utf-8")
        print(f"Page {boundary.page_number}:")
        print(f" Byte range: {boundary.byte_start}-{boundary.byte_end}")
        print(f" Preview: {page_text[:100]}...")
```

View File

@@ -0,0 +1,14 @@
from kreuzberg import extract_file_sync, ExtractionConfig, PageConfig

# Ask the extractor to return per-page results.
config = ExtractionConfig(
    pages=PageConfig(extract_pages=True)
)

result = extract_file_sync("document.pdf", config=config)

# result.pages may be empty or None; only iterate when pages are present.
if result.pages:
    for page in result.pages:
        print(f"Page {page.page_number}:")
        print(f" Content: {len(page.content)} chars")
        print(f" Tables: {len(page.tables)}")
        print(f" Images: {len(page.images)}")

View File

@@ -0,0 +1,39 @@
```python title="Python"
from kreuzberg import register_post_processor, ExtractionResult
import logging
# Logger named after this module; used by initialize() and shutdown() below.
logger = logging.getLogger(__name__)
class PdfMetadataExtractor:
    """Post-processor that tags PDF extraction results and counts them.

    Each processed result gets ``metadata["pdf_processed"] = True``; the
    running total is kept in ``processed_count`` and logged on shutdown.
    """

    def __init__(self):
        """Start with an empty processed-results counter."""
        # Number of results passed through process() so far.
        self.processed_count: int = 0

    def name(self) -> str:
        """Return the identifier string of this processor."""
        return "pdf_metadata_extractor"

    def version(self) -> str:
        """Return the version string of this processor."""
        return "1.0.0"

    def description(self) -> str:
        """Return a short human-readable description."""
        return "Extracts and enriches PDF metadata"

    def processing_stage(self) -> str:
        """Return the stage label for this processor (``"early"``)."""
        return "early"

    def should_process(self, result: ExtractionResult) -> bool:
        """Return True only when the result's MIME type is ``application/pdf``."""
        return result.mime_type == "application/pdf"

    def process(self, result: ExtractionResult) -> ExtractionResult:
        """Mark the result as processed and return the same object.

        Increments ``processed_count`` and sets
        ``result.metadata["pdf_processed"]`` to True (mutates ``result``).
        """
        self.processed_count += 1
        result.metadata["pdf_processed"] = True
        return result

    def initialize(self) -> None:
        """Log that the processor has been initialized."""
        logger.info("PDF metadata extractor initialized")

    def shutdown(self) -> None:
        """Log how many results were processed."""
        logger.info(f"Processed {self.processed_count} PDFs")
# Create a single processor instance and register it with Kreuzberg.
processor = PdfMetadataExtractor()
register_post_processor(processor)
```

View File

@@ -0,0 +1,35 @@
```python title="Python"
from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig, EmbeddingConfig

# Chunk the document (512 characters max, 50 overlap) and request an
# embedding for each chunk using the "balanced" preset.
config = ExtractionConfig(
    chunking=ChunkingConfig(
        max_characters=512,
        overlap=50,
        embedding=EmbeddingConfig(
            normalize=True,
            batch_size=32,
            preset="balanced",
        ),
    ),
)

result = extract_file_sync("document.pdf", config=config)

# One record per chunk that actually carries an embedding.
records: list[dict] = []
if result.chunks:
    for index, chunk in enumerate(result.chunks):
        # Skip chunks without an embedding; note that `index` still reflects
        # the chunk's position in result.chunks, so ids may have gaps.
        if chunk.embedding is None:
            continue
        records.append({
            "id": f"document_chunk_{index}",
            "content": chunk.content,
            "embedding": chunk.embedding,
            "metadata": {
                "document_id": "document.pdf",
                "chunk_index": index,
                "content_length": len(chunk.content),
            },
        })

print(f"Prepared {len(records)} vector records")
```