This commit is contained in:
59
docs/snippets/swift/config/advanced_config.md
Normal file
59
docs/snippets/swift/config/advanced_config.md
Normal file
@@ -0,0 +1,59 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Build a fully featured `ExtractionConfig` via JSON. `ExtractionConfig` has
|
||||
// 30+ fields, so JSON is the ergonomic path for non-trivial configs.
|
||||
let configJson = """
|
||||
{
|
||||
"use_cache": true,
|
||||
"enable_quality_processing": true,
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng"
|
||||
},
|
||||
"chunking": {
|
||||
"max_characters": 1000,
|
||||
"overlap": 200,
|
||||
"embedding": {
|
||||
"model": {"preset": {"name": "balanced"}},
|
||||
"batch_size": 32,
|
||||
"normalize": true,
|
||||
"show_download_progress": false
|
||||
}
|
||||
},
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"min_confidence": 0.8,
|
||||
"detect_multiple": false
|
||||
},
|
||||
"keywords": {
|
||||
"algorithm": "yake",
|
||||
"max_keywords": 10,
|
||||
"min_score": 0.1,
|
||||
"ngram_range": [1, 3],
|
||||
"language": "en"
|
||||
},
|
||||
"token_reduction": {
|
||||
"mode": "moderate",
|
||||
"preserve_important_words": true
|
||||
},
|
||||
"postprocessor": {
|
||||
"enabled": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content: \(result.content().toString())")
|
||||
if let languages = result.detected_languages() {
|
||||
let langs = languages.map { $0.toString() }
|
||||
print("Languages: \(langs)")
|
||||
}
|
||||
if let chunks = result.chunks() {
|
||||
print("Chunks: \(chunks.count)")
|
||||
}
|
||||
```
|
||||
27
docs/snippets/swift/config/chunking_config.md
Normal file
27
docs/snippets/swift/config/chunking_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 1000,
|
||||
"overlap": 100,
|
||||
"chunker_type": "markdown",
|
||||
"prepend_heading_context": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.md", nil, config)
|
||||
|
||||
if let chunks = result.chunks() {
|
||||
print("Chunks: \(chunks.count)")
|
||||
for chunk in chunks {
|
||||
let content = chunk.content().toString()
|
||||
print("Length: \(content.count)")
|
||||
}
|
||||
}
|
||||
```
|
||||
17
docs/snippets/swift/config/config_basic.md
Normal file
17
docs/snippets/swift/config/config_basic.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"use_cache": true,
|
||||
"enable_quality_processing": true
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
```
|
||||
21
docs/snippets/swift/config/config_discover.md
Normal file
21
docs/snippets/swift/config/config_discover.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Swift bindings build configs via JSON. To honor an on-disk
|
||||
// `kreuzberg.{toml,yaml,json}`, load the file and pass its JSON
|
||||
// representation to `extractionConfigFromJson`. Other formats (TOML, YAML)
|
||||
// can be normalized to JSON on the caller side. This example reads only `kreuzberg.json` and falls back to `{}` when it cannot be read.
|
||||
let configJson: String
|
||||
if let data = try? Data(contentsOf: URL(fileURLWithPath: "kreuzberg.json")),
|
||||
let text = String(data: data, encoding: .utf8) {
|
||||
configJson = text
|
||||
} else {
|
||||
configJson = "{}"
|
||||
}
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
print(result.content().toString())
|
||||
```
|
||||
20
docs/snippets/swift/config/config_ocr.md
Normal file
20
docs/snippets/swift/config/config_ocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("scanned.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
print("Tables detected: \(result.tables().count)")
|
||||
```
|
||||
28
docs/snippets/swift/config/config_programmatic.md
Normal file
28
docs/snippets/swift/config/config_programmatic.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"use_cache": true,
|
||||
"enable_quality_processing": true,
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng+deu",
|
||||
"tesseract_config": {
|
||||
"psm": 6
|
||||
}
|
||||
},
|
||||
"chunking": {
|
||||
"max_characters": 1000,
|
||||
"overlap": 200
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
```
|
||||
18
docs/snippets/swift/config/document_structure_config.md
Normal file
18
docs/snippets/swift/config/document_structure_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"include_document_structure": true
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let document = result.document() {
|
||||
print("Document nodes: \(document.nodes().count)")
|
||||
}
|
||||
```
|
||||
22
docs/snippets/swift/config/element_based_output.md
Normal file
22
docs/snippets/swift/config/element_based_output.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"result_format": "element_based"
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let elements = result.elements() {
|
||||
print("Elements: \(elements.count)")
|
||||
for element in elements {
|
||||
print("Type: \(element.element_type().toString())")
|
||||
print("Text: \(element.text().toString().prefix(100))")
|
||||
}
|
||||
}
|
||||
```
|
||||
27
docs/snippets/swift/config/embedding_config.md
Normal file
27
docs/snippets/swift/config/embedding_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 1000,
|
||||
"overlap": 200,
|
||||
"embedding": {
|
||||
"model": {"preset": {"name": "balanced"}},
|
||||
"batch_size": 16,
|
||||
"normalize": true,
|
||||
"show_download_progress": true
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let chunks = result.chunks() {
|
||||
print("Chunks with embeddings: \(chunks.count)")
|
||||
}
|
||||
```
|
||||
19
docs/snippets/swift/config/html_output.md
Normal file
19
docs/snippets/swift/config/html_output.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"output_format": "html",
|
||||
"html_output": {
|
||||
"theme": "github"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString()) // HTML with kb-* classes
|
||||
```
|
||||
23
docs/snippets/swift/config/keyword_extraction_config.md
Normal file
23
docs/snippets/swift/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"keywords": {
|
||||
"algorithm": "yake",
|
||||
"max_keywords": 10,
|
||||
"min_score": 0.1,
|
||||
"ngram_range": [1, 3],
|
||||
"language": "en"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Keywords extracted from document")
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
```
|
||||
23
docs/snippets/swift/config/language_detection_config.md
Normal file
23
docs/snippets/swift/config/language_detection_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"min_confidence": 0.8,
|
||||
"detect_multiple": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let languages = result.detected_languages() {
|
||||
let langs = languages.map { $0.toString() }
|
||||
print("Detected languages: \(langs)")
|
||||
}
|
||||
```
|
||||
23
docs/snippets/swift/config/ocr_dpi_config.md
Normal file
23
docs/snippets/swift/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"images": {
|
||||
"extract_images": true,
|
||||
"target_dpi": 300,
|
||||
"max_image_dimension": 4096,
|
||||
"auto_adjust_dpi": true,
|
||||
"min_dpi": 150,
|
||||
"max_dpi": 600
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
```
|
||||
20
docs/snippets/swift/config/pdf_config.md
Normal file
20
docs/snippets/swift/config/pdf_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"pdf_options": {
|
||||
"extract_images": true,
|
||||
"passwords": ["password123"],
|
||||
"extract_metadata": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("encrypted.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
```
|
||||
24
docs/snippets/swift/config/pdf_hierarchy_config.md
Normal file
24
docs/snippets/swift/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"pdf_options": {
|
||||
"hierarchy": {
|
||||
"enabled": true,
|
||||
"detection_threshold": 0.75,
|
||||
"ocr_coverage_threshold": 0.8,
|
||||
"min_level": 1,
|
||||
"max_level": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
```
|
||||
19
docs/snippets/swift/config/postprocessor_config.md
Normal file
19
docs/snippets/swift/config/postprocessor_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"postprocessor": {
|
||||
"enabled": true,
|
||||
"enabled_processors": ["whitespace_normalizer", "unicode_normalizer"]
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Processed content: \(result.content().toString())")
|
||||
```
|
||||
18
docs/snippets/swift/config/quality_processing_config.md
Normal file
18
docs/snippets/swift/config/quality_processing_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"enable_quality_processing": true,
|
||||
"use_cache": true
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
print("Tables: \(result.tables().count)")
|
||||
```
|
||||
23
docs/snippets/swift/config/tesseract_config.md
Normal file
23
docs/snippets/swift/config/tesseract_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng+deu",
|
||||
"tesseract_config": {
|
||||
"psm": 6,
|
||||
"oem": 3
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("scanned.pdf", nil, config)
|
||||
|
||||
print("OCR text: \(result.content().toString())")
|
||||
```
|
||||
19
docs/snippets/swift/config/token_reduction_config.md
Normal file
19
docs/snippets/swift/config/token_reduction_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"token_reduction": {
|
||||
"mode": "moderate",
|
||||
"preserve_important_words": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Reduced content length: \(result.content().toString().count)")
|
||||
```
|
||||
Reference in New Issue
Block a user