60 lines
1.4 KiB
Markdown
60 lines
1.4 KiB
Markdown
```swift title="Swift"
|
|
import Foundation
|
|
import Kreuzberg
|
|
import RustBridge
|
|
|
|
// Build a fully-featured `ExtractionConfig` via JSON. ExtractionConfig has
|
|
// 30+ fields, so JSON is the ergonomic path for non-trivial configs.
|
|
let configJson = """
|
|
{
|
|
"use_cache": true,
|
|
"enable_quality_processing": true,
|
|
"ocr": {
|
|
"backend": "tesseract",
|
|
"language": "eng"
|
|
},
|
|
"chunking": {
|
|
"max_characters": 1000,
|
|
"overlap": 200,
|
|
"embedding": {
|
|
"model": {"preset": {"name": "balanced"}},
|
|
"batch_size": 32,
|
|
"normalize": true,
|
|
"show_download_progress": false
|
|
}
|
|
},
|
|
"language_detection": {
|
|
"enabled": true,
|
|
"min_confidence": 0.8,
|
|
"detect_multiple": false
|
|
},
|
|
"keywords": {
|
|
"algorithm": "yake",
|
|
"max_keywords": 10,
|
|
"min_score": 0.1,
|
|
"ngram_range": [1, 3],
|
|
"language": "en"
|
|
},
|
|
"token_reduction": {
|
|
"mode": "moderate",
|
|
"preserve_important_words": true
|
|
},
|
|
"postprocessor": {
|
|
"enabled": true
|
|
}
|
|
}
|
|
"""
|
|
|
|
let config = try extractionConfigFromJson(configJson)
|
|
let result = try extractFileSync("document.pdf", nil, config)
|
|
|
|
print("Content: \(result.content().toString())")
|
|
if let languages = result.detected_languages() {
|
|
let langs = languages.map { $0.toString() }
|
|
print("Languages: \(langs)")
|
|
}
|
|
if let chunks = result.chunks() {
|
|
print("Chunks: \(chunks.count)")
|
|
}
|
|
```
|