This commit is contained in:
13
docs/snippets/python/metadata/language_detection.md
Normal file
13
docs/snippets/python/metadata/language_detection.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import ExtractionConfig, LanguageDetectionConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
language_detection=LanguageDetectionConfig(
|
||||
enabled=True,
|
||||
min_confidence=0.9,
|
||||
detect_multiple=True,
|
||||
),
|
||||
)
|
||||
|
||||
print(config.language_detection)
|
||||
```
|
||||
@@ -0,0 +1,16 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, LanguageDetectionConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
language_detection=LanguageDetectionConfig(
|
||||
enabled=True,
|
||||
min_confidence=0.8,
|
||||
detect_multiple=True,
|
||||
),
|
||||
)
|
||||
|
||||
result = extract_file_sync("multilingual_document.pdf", config=config)
|
||||
|
||||
if result.detected_languages:
|
||||
print(f"Detected languages: {', '.join(result.detected_languages)}")
|
||||
```
|
||||
80
docs/snippets/python/metadata/metadata.md
Normal file
80
docs/snippets/python/metadata/metadata.md
Normal file
@@ -0,0 +1,80 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig
|
||||
|
||||
result = extract_file_sync("document.pdf", config=ExtractionConfig())
|
||||
|
||||
# Metadata is flat — format-specific fields are at the top level
|
||||
metadata = result.metadata
|
||||
if metadata.get("page_count"):
|
||||
print(f"Pages: {metadata['page_count']}")
|
||||
if metadata.get("title"):
|
||||
print(f"Title: {metadata['title']}")
|
||||
if metadata.get("authors"):
|
||||
print(f"Authors: {', '.join(metadata['authors'])}")
|
||||
|
||||
result = extract_file_sync("page.html", config=ExtractionConfig())
|
||||
metadata = result.metadata
|
||||
if metadata.get("title"):
|
||||
print(f"Title: {metadata['title']}")
|
||||
if metadata.get("description"):
|
||||
print(f"Description: {metadata['description']}")
|
||||
|
||||
# Access keywords as array
|
||||
keywords = metadata.get('keywords', [])
|
||||
if keywords:
|
||||
print(f"Keywords: {', '.join(keywords)}")
|
||||
|
||||
# Access canonical URL (renamed from canonical)
|
||||
canonical_url = metadata.get('canonical_url')
|
||||
if canonical_url:
|
||||
print(f"Canonical URL: {canonical_url}")
|
||||
|
||||
# Access Open Graph fields from map
|
||||
open_graph = metadata.get('open_graph', {})
|
||||
if open_graph:
|
||||
if 'image' in open_graph:
|
||||
print(f"Open Graph Image: {open_graph['image']}")
|
||||
if 'title' in open_graph:
|
||||
print(f"Open Graph Title: {open_graph['title']}")
|
||||
if 'type' in open_graph:
|
||||
print(f"Open Graph Type: {open_graph['type']}")
|
||||
|
||||
# Access Twitter Card fields from map
|
||||
twitter_card = metadata.get('twitter_card', {})
|
||||
if twitter_card:
|
||||
if 'card' in twitter_card:
|
||||
print(f"Twitter Card Type: {twitter_card['card']}")
|
||||
if 'creator' in twitter_card:
|
||||
print(f"Twitter Creator: {twitter_card['creator']}")
|
||||
|
||||
# Access new fields
|
||||
language = metadata.get('language')
|
||||
if language:
|
||||
print(f"Language: {language}")
|
||||
|
||||
text_direction = metadata.get('text_direction')
|
||||
if text_direction:
|
||||
print(f"Text Direction: {text_direction}")
|
||||
|
||||
# Access headers
|
||||
headers = metadata.get('headers', [])
|
||||
if headers:
|
||||
print(f"Headers: {', '.join([h['text'] for h in headers])}")
|
||||
|
||||
# Access links
|
||||
links = metadata.get('links', [])
|
||||
if links:
|
||||
for link in links:
|
||||
print(f"Link: {link.get('href')} ({link.get('text')})")
|
||||
|
||||
# Access images
|
||||
images = metadata.get('images', [])
|
||||
if images:
|
||||
for image in images:
|
||||
print(f"Image: {image.get('src')}")
|
||||
|
||||
# Access structured data
|
||||
structured_data = metadata.get('structured_data', [])
|
||||
if structured_data:
|
||||
print(f"Structured data items: {len(structured_data)}")
|
||||
```
|
||||
17
docs/snippets/python/metadata/page_boundaries.md
Normal file
17
docs/snippets/python/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig
|
||||
|
||||
result = extract_file_sync("document.pdf", config=ExtractionConfig())
|
||||
|
||||
if result.metadata.pages and result.metadata.pages.boundaries:
|
||||
boundaries = result.metadata.pages.boundaries
|
||||
content_bytes = result.content.encode("utf-8")
|
||||
|
||||
for boundary in boundaries[:3]:
|
||||
page_bytes = content_bytes[boundary.byte_start:boundary.byte_end]
|
||||
page_text = page_bytes.decode("utf-8")
|
||||
|
||||
print(f"Page {boundary.page_number}:")
|
||||
print(f" Byte range: {boundary.byte_start}-{boundary.byte_end}")
|
||||
print(f" Preview: {page_text[:100]}...")
|
||||
```
|
||||
14
docs/snippets/python/metadata/page_tracking_basic.md
Normal file
14
docs/snippets/python/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,14 @@
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, PageConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
pages=PageConfig(extract_pages=True)
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
if result.pages:
|
||||
for page in result.pages:
|
||||
print(f"Page {page.page_number}:")
|
||||
print(f" Content: {len(page.content)} chars")
|
||||
print(f" Tables: {len(page.tables)}")
|
||||
print(f" Images: {len(page.images)}")
|
||||
39
docs/snippets/python/metadata/pdf_metadata_extractor.md
Normal file
39
docs/snippets/python/metadata/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import register_post_processor, ExtractionResult
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class PdfMetadataExtractor:
|
||||
def __init__(self):
|
||||
self.processed_count: int = 0
|
||||
|
||||
def name(self) -> str:
|
||||
return "pdf_metadata_extractor"
|
||||
|
||||
def version(self) -> str:
|
||||
return "1.0.0"
|
||||
|
||||
def description(self) -> str:
|
||||
return "Extracts and enriches PDF metadata"
|
||||
|
||||
def processing_stage(self) -> str:
|
||||
return "early"
|
||||
|
||||
def should_process(self, result: ExtractionResult) -> bool:
|
||||
return result.mime_type == "application/pdf"
|
||||
|
||||
def process(self, result: ExtractionResult) -> ExtractionResult:
|
||||
self.processed_count += 1
|
||||
result.metadata["pdf_processed"] = True
|
||||
return result
|
||||
|
||||
def initialize(self) -> None:
|
||||
logger.info("PDF metadata extractor initialized")
|
||||
|
||||
def shutdown(self) -> None:
|
||||
logger.info(f"Processed {self.processed_count} PDFs")
|
||||
|
||||
processor: PdfMetadataExtractor = PdfMetadataExtractor()
|
||||
register_post_processor(processor)
|
||||
```
|
||||
35
docs/snippets/python/metadata/vector_database_integration.md
Normal file
35
docs/snippets/python/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```python title="Python"
|
||||
from kreuzberg import extract_file_sync, ExtractionConfig, ChunkingConfig, EmbeddingConfig
|
||||
|
||||
config = ExtractionConfig(
|
||||
chunking=ChunkingConfig(
|
||||
max_characters=512,
|
||||
overlap=50,
|
||||
embedding=EmbeddingConfig(
|
||||
normalize=True,
|
||||
batch_size=32,
|
||||
preset="balanced",
|
||||
),
|
||||
),
|
||||
)
|
||||
|
||||
result = extract_file_sync("document.pdf", config=config)
|
||||
|
||||
records: list[dict] = []
|
||||
if result.chunks:
|
||||
for index, chunk in enumerate(result.chunks):
|
||||
if chunk.embedding is None:
|
||||
continue
|
||||
records.append({
|
||||
"id": f"document_chunk_{index}",
|
||||
"content": chunk.content,
|
||||
"embedding": chunk.embedding,
|
||||
"metadata": {
|
||||
"document_id": "document.pdf",
|
||||
"chunk_index": index,
|
||||
"content_length": len(chunk.content),
|
||||
},
|
||||
})
|
||||
|
||||
print(f"Prepared {len(records)} vector records")
|
||||
```
|
||||
Reference in New Issue
Block a user