This commit is contained in:
34
docs/snippets/swift/advanced/chunk_page_mapping.md
Normal file
34
docs/snippets/swift/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 500,
|
||||
"overlap": 50
|
||||
},
|
||||
"pages": {
|
||||
"extract_pages": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let chunks = result.chunks() {
|
||||
for chunk in chunks {
|
||||
let metadata = chunk.metadata()
|
||||
let content = chunk.content().toString()
|
||||
let preview = String(content.prefix(50))
|
||||
if let first = metadata.first_page(), let last = metadata.last_page() {
|
||||
let pageRange = first == last ? "Page \(first)" : "Pages \(first)-\(last)"
|
||||
print("Chunk: \(preview)... (\(pageRange))")
|
||||
} else {
|
||||
print("Chunk: \(preview)... (no page info)")
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
25
docs/snippets/swift/advanced/chunking_config.md
Normal file
25
docs/snippets/swift/advanced/chunking_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 1000,
|
||||
"overlap": 200
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let chunks = result.chunks() {
|
||||
print("Chunks: \(chunks.count)")
|
||||
for chunk in chunks {
|
||||
let metadata = chunk.metadata()
|
||||
print("Chunk \(metadata.chunk_index() + 1)/\(metadata.total_chunks())")
|
||||
}
|
||||
}
|
||||
```
|
||||
35
docs/snippets/swift/advanced/chunking_rag.md
Normal file
35
docs/snippets/swift/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 500,
|
||||
"overlap": 50,
|
||||
"embedding": {
|
||||
"model": {"preset": {"name": "balanced"}},
|
||||
"normalize": true
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("research_paper.pdf", nil, config)
|
||||
|
||||
if let chunks = result.chunks() {
|
||||
for chunk in chunks {
|
||||
let metadata = chunk.metadata()
|
||||
let content = chunk.content().toString()
|
||||
let preview = String(content.prefix(100))
|
||||
print("Chunk \(metadata.chunk_index() + 1)/\(metadata.total_chunks())")
|
||||
print("Position: \(metadata.byte_start())-\(metadata.byte_end())")
|
||||
print("Content: \(preview)...")
|
||||
if let embedding = chunk.embedding() {
|
||||
print("Embedding: \(embedding.count) dimensions")
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
32
docs/snippets/swift/advanced/embedding_with_chunking.md
Normal file
32
docs/snippets/swift/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 1024,
|
||||
"overlap": 100,
|
||||
"embedding": {
|
||||
"model": {"preset": {"name": "balanced"}},
|
||||
"normalize": true,
|
||||
"batch_size": 32,
|
||||
"show_download_progress": false
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let chunks = result.chunks() {
|
||||
print("Generated \(chunks.count) chunks")
|
||||
for chunk in chunks {
|
||||
if let embedding = chunk.embedding() {
|
||||
print("Chunk \(chunk.metadata().chunk_index()) -> \(embedding.count)-dim embedding")
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
24
docs/snippets/swift/advanced/keyword_extraction_config.md
Normal file
24
docs/snippets/swift/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"keywords": {
|
||||
"algorithm": "yake",
|
||||
"max_keywords": 10,
|
||||
"min_score": 0.3,
|
||||
"ngram_range": [1, 3],
|
||||
"language": "en"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let keywords = result.extracted_keywords() {
|
||||
print("Extracted \(keywords.count) keywords")
|
||||
}
|
||||
```
|
||||
26
docs/snippets/swift/advanced/keyword_extraction_example.md
Normal file
26
docs/snippets/swift/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"keywords": {
|
||||
"algorithm": "yake",
|
||||
"max_keywords": 10,
|
||||
"min_score": 0.3
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("research_paper.pdf", nil, config)
|
||||
|
||||
if let keywords = result.extracted_keywords() {
|
||||
for keyword in keywords {
|
||||
let text = keyword.text().toString()
|
||||
let score = keyword.score()
|
||||
print("\(text) (score: \(score))")
|
||||
}
|
||||
}
|
||||
```
|
||||
23
docs/snippets/swift/advanced/language_detection_config.md
Normal file
23
docs/snippets/swift/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"min_confidence": 0.8,
|
||||
"detect_multiple": false
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let languages = result.detected_languages() {
|
||||
let langs = languages.map { $0.toString() }
|
||||
print("Detected: \(langs)")
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"min_confidence": 0.8,
|
||||
"detect_multiple": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("multilingual_document.pdf", nil, config)
|
||||
|
||||
if let languages = result.detected_languages() {
|
||||
let langs = languages.map { $0.toString() }
|
||||
print("Detected languages: \(langs)")
|
||||
}
|
||||
```
|
||||
18
docs/snippets/swift/advanced/quality_processing_config.md
Normal file
18
docs/snippets/swift/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"enable_quality_processing": true
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let score = result.quality_score() {
|
||||
print(String(format: "Quality score: %.2f", score))
|
||||
}
|
||||
```
|
||||
22
docs/snippets/swift/advanced/quality_processing_example.md
Normal file
22
docs/snippets/swift/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"enable_quality_processing": true
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("scanned_document.pdf", nil, config)
|
||||
|
||||
if let score = result.quality_score() {
|
||||
if score < 0.5 {
|
||||
print(String(format: "Warning: Low quality extraction (%.2f)", score))
|
||||
} else {
|
||||
print(String(format: "Quality score: %.2f", score))
|
||||
}
|
||||
}
|
||||
```
|
||||
21
docs/snippets/swift/advanced/token_reduction_config.md
Normal file
21
docs/snippets/swift/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"token_reduction": {
|
||||
"mode": "moderate",
|
||||
"preserve_markdown": true,
|
||||
"preserve_code": true,
|
||||
"language_hint": "eng"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Reduced content length: \(result.content().toString().count)")
|
||||
```
|
||||
23
docs/snippets/swift/advanced/token_reduction_example.md
Normal file
23
docs/snippets/swift/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"token_reduction": {
|
||||
"mode": "moderate",
|
||||
"preserve_markdown": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("verbose_document.pdf", nil, config)
|
||||
|
||||
let content = result.content().toString()
|
||||
print("Reduced content length: \(content.count)")
|
||||
for warning in result.processing_warnings() {
|
||||
print("Warning [\(warning.source().toString())]: \(warning.message().toString())")
|
||||
}
|
||||
```
|
||||
51
docs/snippets/swift/advanced/vector_database_integration.md
Normal file
51
docs/snippets/swift/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,51 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
/// A single chunk prepared for insertion into a vector store.
struct VectorRecord {
    /// Key identifying this record.
    let id: String

    /// Text of the chunk.
    let content: String

    /// Embedding values associated with `content`.
    let embedding: [Float]

    /// String key/value pairs stored alongside the embedding.
    let metadata: [String: String]
}
|
||||
|
||||
/// Extracts `documentPath` with chunking and embeddings configured, and turns
/// every chunk that carries an embedding into a `VectorRecord`.
///
/// - Parameters:
///   - documentPath: Path handed to `extractFileSync`.
///   - documentId: Identifier used in each record's `id` and in its
///     `document_id` metadata entry.
/// - Returns: One record per chunk that has an embedding; an empty array when
///   the result has no chunks.
/// - Throws: Any error raised by `extractionConfigFromJson` or `extractFileSync`.
func extractAndVectorize(documentPath: String, documentId: String) throws -> [VectorRecord] {
    // JSON lines are aligned with the closing delimiter so the literal's
    // contents stay exactly as written.
    let settingsJson = """
    {
    "chunking": {
    "max_characters": 512,
    "overlap": 50,
    "embedding": {
    "model": {"preset": {"name": "balanced"}},
    "normalize": true,
    "batch_size": 32
    }
    }
    }
    """

    let settings = try extractionConfigFromJson(settingsJson)
    let extraction = try extractFileSync(documentPath, nil, settings)

    guard let chunks = extraction.chunks() else { return [] }

    return chunks.enumerated().compactMap { entry -> VectorRecord? in
        let (position, chunk) = entry
        // Chunks without an embedding produce no record; `position` still
        // counts them, so ids follow the chunk's place in the result.
        guard let vector = chunk.embedding() else { return nil }

        let text = chunk.content().toString()
        return VectorRecord(
            id: "\(documentId)_chunk_\(position)",
            content: text,
            embedding: vector.map { $0 },
            metadata: [
                "document_id": documentId,
                "chunk_index": String(position),
                "content_length": String(text.count),
            ]
        )
    }
}
|
||||
```
|
||||
31
docs/snippets/swift/api/batch_extract_bytes_sync.md
Normal file
31
docs/snippets/swift/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// `BatchBytesItem` is an opaque swift-bridge class with no public Swift
|
||||
// constructor — build items from JSON via `batchBytesItemFromJson`.
|
||||
// `content` must be encoded as a JSON byte array.
|
||||
/// Renders `bytes` as a JSON array of decimal numbers, e.g. `[72,105]`.
///
/// - Parameter bytes: Raw bytes to encode.
/// - Returns: The values separated by commas and wrapped in square brackets,
///   with no whitespace; `[]` for empty input.
func encodeBytesAsJsonArray(_ bytes: [UInt8]) -> String {
    let numbers = bytes.map { byte in "\(byte)" }
    return "[\(numbers.joined(separator: ","))]"
}
|
||||
|
||||
let items = RustVec<BatchBytesItem>()
|
||||
|
||||
let first = Array("Hello, world!".utf8)
|
||||
items.push(value: try batchBytesItemFromJson(
|
||||
"{\"content\": \(encodeBytesAsJsonArray(first)), \"mime_type\": \"text/plain\"}"
|
||||
))
|
||||
|
||||
let second = Array("# Heading\n\nParagraph text.".utf8)
|
||||
items.push(value: try batchBytesItemFromJson(
|
||||
"{\"content\": \(encodeBytesAsJsonArray(second)), \"mime_type\": \"text/markdown\"}"
|
||||
))
|
||||
|
||||
let config = try extractionConfigFromJson("{}")
|
||||
let results = try batchExtractBytesSync(items, config)
|
||||
|
||||
for (index, result) in results.enumerated() {
|
||||
print("Item \(index): \(result.content().toString().count) chars")
|
||||
}
|
||||
```
|
||||
20
docs/snippets/swift/api/batch_extract_files_sync.md
Normal file
20
docs/snippets/swift/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// `BatchFileItem` is an opaque swift-bridge class with no public Swift
|
||||
// constructor — build items from JSON via `batchFileItemFromJson`.
|
||||
// Encode each path with `JSONEncoder` instead of string interpolation so that
// quotes, backslashes, and control characters in a path are escaped and the
// item JSON stays valid.
let encoder = JSONEncoder()
let items = RustVec<BatchFileItem>()
for path in ["doc1.pdf", "doc2.docx", "report.pdf"] {
    let json = String(decoding: try encoder.encode(["path": path]), as: UTF8.self)
    items.push(value: try batchFileItemFromJson(json))
}

let config = try extractionConfigFromJson("{}")
let results = try batchExtractFilesSync(items, config)

// Print the extracted character count for each result, by position.
for (index, result) in results.enumerated() {
    print("File \(index): \(result.content().toString().count) chars")
}
|
||||
```
|
||||
44
docs/snippets/swift/api/client_chunk_text.md
Normal file
44
docs/snippets/swift/api/client_chunk_text.md
Normal file
@@ -0,0 +1,44 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
#if canImport(FoundationNetworking)
|
||||
import FoundationNetworking
|
||||
#endif
|
||||
|
||||
@main
struct App {
    /// Posts a chunking request to `http://localhost:8000/chunk` and prints the
    /// chunk count plus a 50-character preview of every returned chunk.
    ///
    /// - Throws: Serialization and transport errors, or an `NSError` in the
    ///   `kreuzberg` domain when the response is not an HTTP 2xx.
    static func main() async throws {
        let chunkingOptions: [String: Any] = [
            "max_characters": 1000,
            "overlap": 50,
            "trim": true,
        ]
        let requestBody: [String: Any] = [
            "text": "Your long text content here...",
            "chunker_type": "text",
            "config": chunkingOptions,
        ]

        var request = URLRequest(url: URL(string: "http://localhost:8000/chunk")!)
        request.httpMethod = "POST"
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
        request.httpBody = try JSONSerialization.data(withJSONObject: requestBody)

        let (responseData, response) = try await URLSession.shared.data(for: request)
        // Anything other than an HTTP 2xx response is treated as a failure.
        guard let http = response as? HTTPURLResponse, 200..<300 ~= http.statusCode else {
            throw NSError(domain: "kreuzberg", code: 1)
        }

        // A body that is not a JSON object is handled as an empty reply.
        let decoded = try JSONSerialization.jsonObject(with: responseData)
        let reply = decoded as? [String: Any] ?? [:]

        let chunkCount = reply["chunk_count"] as? Int ?? 0
        print("Created \(chunkCount) chunks")

        let chunks = reply["chunks"] as? [[String: Any]] ?? []
        for chunk in chunks {
            let text = chunk["content"] as? String ?? ""
            let position = chunk["chunk_index"] as? Int ?? 0
            print("Chunk \(position): \(String(text.prefix(50)))...")
        }
    }
}
|
||||
```
|
||||
42
docs/snippets/swift/api/client_extract_single_file.md
Normal file
42
docs/snippets/swift/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
#if canImport(FoundationNetworking)
|
||||
import FoundationNetworking
|
||||
#endif
|
||||
|
||||
@main
struct App {
    /// Uploads `document.pdf` to `http://localhost:8000/extract` as multipart
    /// form data and prints the `content` field of the JSON response.
    ///
    /// - Throws: File-read, transport, and JSON errors, or an `NSError` in the
    ///   `kreuzberg` domain when the response is not an HTTP 2xx.
    static func main() async throws {
        let fileURL = URL(fileURLWithPath: "document.pdf")
        let fileData = try Data(contentsOf: fileURL)
        let fileName = fileURL.lastPathComponent

        let boundary = "Boundary-\(UUID().uuidString)"
        var request = URLRequest(url: URL(string: "http://localhost:8000/extract")!)
        request.httpMethod = "POST"
        request.setValue(
            "multipart/form-data; boundary=\(boundary)",
            forHTTPHeaderField: "Content-Type"
        )

        // `Data(string.utf8)` cannot fail, so the body is assembled without
        // the force-unwraps that `data(using:)` would require.
        var body = Data()
        body.append(Data("--\(boundary)\r\n".utf8))
        body.append(Data(
            "Content-Disposition: form-data; name=\"file\"; filename=\"\(fileName)\"\r\n".utf8
        ))
        body.append(Data("Content-Type: application/pdf\r\n\r\n".utf8))
        body.append(fileData)
        body.append(Data("\r\n--\(boundary)--\r\n".utf8))
        request.httpBody = body

        let (data, response) = try await URLSession.shared.data(for: request)
        // Anything other than an HTTP 2xx response is treated as a failure.
        guard let http = response as? HTTPURLResponse, (200..<300).contains(http.statusCode) else {
            throw NSError(domain: "kreuzberg", code: 1)
        }

        // A body that is not a JSON object, or has no string `content`, prints
        // an empty line.
        let json = try JSONSerialization.jsonObject(with: data) as? [String: Any] ?? [:]
        print(json["content"] as? String ?? "")
    }
}
|
||||
```
|
||||
53
docs/snippets/swift/api/combining_all_features.md
Normal file
53
docs/snippets/swift/api/combining_all_features.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Build a fully-featured `ExtractionConfig` via JSON. The opaque swift-bridge
|
||||
// initializer takes 30+ positional parameters, so JSON is the ergonomic path
|
||||
// for non-trivial configs.
|
||||
let configJson = """
|
||||
{
|
||||
"use_cache": true,
|
||||
"enable_quality_processing": true,
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng"
|
||||
},
|
||||
"force_ocr": false,
|
||||
"chunking": {
|
||||
"max_characters": 800,
|
||||
"overlap": 100,
|
||||
"chunker_type": "markdown",
|
||||
"prepend_heading_context": true
|
||||
},
|
||||
"images": {
|
||||
"extract_images": true
|
||||
},
|
||||
"output_format": "markdown",
|
||||
"include_document_structure": true
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("report.pdf", nil, config)
|
||||
|
||||
let content = result.content().toString()
|
||||
print("Content (\(content.count) chars):")
|
||||
let preview = String(content.prefix(200))
|
||||
print(preview)
|
||||
|
||||
if let chunks = result.chunks() {
|
||||
print("\nChunks: \(chunks.count)")
|
||||
}
|
||||
print("Tables: \(result.tables().count)")
|
||||
|
||||
if let languages = result.detected_languages() {
|
||||
let langs = languages.map { $0.toString() }
|
||||
print("Languages: \(langs)")
|
||||
}
|
||||
|
||||
if let method = result.extraction_method() {
|
||||
print("Extraction method: \(method)")
|
||||
}
|
||||
```
|
||||
31
docs/snippets/swift/api/error_handling.md
Normal file
31
docs/snippets/swift/api/error_handling.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// The Swift binding throws `RustString` (not `KreuzbergError`) for every
|
||||
// failure surfaced from the Rust core. The string preserves the original
|
||||
// error variant name and message (e.g. "UnsupportedFormat: ...",
|
||||
// "MissingDependency: ...", "Parsing: ...") so callers can pattern-match
|
||||
// on the prefix or simply print the message.
|
||||
do {
    let config = try extractionConfigFromJson("{}")
    let result = try extractFileSync("document.pdf", nil, config)
    print(result.content().toString())
} catch let bridgeError as RustString {
    let message = bridgeError.toString()
    // Markers are checked in this order; the first one found in the message
    // selects the label printed in front of it.
    let labels: [(marker: String, label: String)] = [
        (marker: "UnsupportedFormat", label: "Unsupported format"),
        (marker: "MissingDependency", label: "Install the required dependency"),
        (marker: "Parsing", label: "Corrupt or invalid document"),
        (marker: "Io", label: "File error"),
    ]
    let matched = labels.first(where: { message.contains($0.marker) })
    let label = matched?.label ?? "Extraction failed"
    print("\(label): \(message)")
} catch {
    print("Unexpected error: \(error)")
}
|
||||
```
|
||||
32
docs/snippets/swift/api/error_handling_extract.md
Normal file
32
docs/snippets/swift/api/error_handling_extract.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
/// Extracts text from an in-memory document using an empty (`{}`) config.
///
/// - Parameters:
///   - bytes: Document bytes, copied into a `RustVec<UInt8>` for the bridge.
///   - mimeType: MIME type passed to `extractBytesSync`.
/// - Returns: The extracted content as a Swift `String`.
/// - Throws: Any error raised by `extractionConfigFromJson` or `extractBytesSync`.
func extractText(bytes: [UInt8], mimeType: String) throws -> String {
    let buffer = RustVec<UInt8>()
    bytes.forEach { buffer.push(value: $0) }

    let settings = try extractionConfigFromJson("{}")
    let extraction = try extractBytesSync(buffer, mimeType, settings)
    return extraction.content().toString()
}
|
||||
|
||||
do {
    // Read the file inside the `do` block so a missing or unreadable file is
    // reported, rather than being replaced by empty data and surfacing later
    // as an unrelated extraction error.
    let data = try Data(contentsOf: URL(fileURLWithPath: "document.pdf"))
    let text = try extractText(bytes: Array(data), mimeType: "application/pdf")
    print("Extracted \(text.count) chars")
} catch let error as RustString {
    // Errors thrown by the bridge arrive as `RustString`; branch on the
    // variant name contained in the message.
    let message = error.toString()
    if message.contains("UnsupportedFormat") {
        print("Format not supported: \(message)")
    } else if message.contains("Ocr") {
        print("OCR failed: \(message)")
    } else {
        print("Error: \(message)")
    }
} catch {
    // Everything else, including a failure to read the file.
    print("Unexpected error: \(error)")
}
|
||||
```
|
||||
20
docs/snippets/swift/api/extract_bytes_async.md
Normal file
20
docs/snippets/swift/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
@main
struct App {
    /// Reads `document.pdf` into memory, extracts it with `extractBytes`, and
    /// prints the content followed by the number of tables.
    static func main() async throws {
        let fileBytes = try Data(contentsOf: URL(fileURLWithPath: "document.pdf"))

        // Copy the bytes into a `RustVec<UInt8>` for the bridge call.
        let buffer = RustVec<UInt8>()
        fileBytes.forEach { buffer.push(value: $0) }

        let settings = try extractionConfigFromJson("{}")
        let extraction = try await extractBytes(buffer, "application/pdf", settings)

        print(extraction.content().toString())
        print("Tables: \(extraction.tables().count)")
    }
}
|
||||
```
|
||||
15
docs/snippets/swift/api/extract_bytes_sync.md
Normal file
15
docs/snippets/swift/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,15 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let data = try Data(contentsOf: URL(fileURLWithPath: "document.pdf"))
|
||||
let content = RustVec<UInt8>()
|
||||
for byte in data { content.push(value: byte) }
|
||||
|
||||
let config = try extractionConfigFromJson("{}")
|
||||
let result = try extractBytesSync(content, "application/pdf", config)
|
||||
|
||||
print(result.content().toString())
|
||||
print("Tables: \(result.tables().count)")
|
||||
```
|
||||
20
docs/snippets/swift/api/extract_file_async.md
Normal file
20
docs/snippets/swift/api/extract_file_async.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
@main
struct App {
    /// Extracts `document.pdf` with `extractFile` and prints its content, MIME
    /// type, and table count.
    static func main() async throws {
        let settings = try extractionConfigFromJson("{}")

        // The bridge call is synchronous internally, but the binding exposes
        // an async-compatible entrypoint, so it can be awaited from Swift
        // Concurrency code.
        let extraction = try await extractFile("document.pdf", nil, settings)

        let text = extraction.content().toString()
        let mimeType = extraction.mime_type().toString()
        let tableCount = extraction.tables().count

        print(text)
        print("MIME type: \(mimeType)")
        print("Tables: \(tableCount)")
    }
}
|
||||
```
|
||||
12
docs/snippets/swift/api/extract_file_sync.md
Normal file
12
docs/snippets/swift/api/extract_file_sync.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let config = try extractionConfigFromJson("{}")
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
print("MIME type: \(result.mime_type().toString())")
|
||||
print("Tables: \(result.tables().count)")
|
||||
```
|
||||
59
docs/snippets/swift/config/advanced_config.md
Normal file
59
docs/snippets/swift/config/advanced_config.md
Normal file
@@ -0,0 +1,59 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Build a fully-featured `ExtractionConfig` via JSON. ExtractionConfig has
|
||||
// 30+ fields, so JSON is the ergonomic path for non-trivial configs.
|
||||
let configJson = """
|
||||
{
|
||||
"use_cache": true,
|
||||
"enable_quality_processing": true,
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng"
|
||||
},
|
||||
"chunking": {
|
||||
"max_characters": 1000,
|
||||
"overlap": 200,
|
||||
"embedding": {
|
||||
"model": {"preset": {"name": "balanced"}},
|
||||
"batch_size": 32,
|
||||
"normalize": true,
|
||||
"show_download_progress": false
|
||||
}
|
||||
},
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"min_confidence": 0.8,
|
||||
"detect_multiple": false
|
||||
},
|
||||
"keywords": {
|
||||
"algorithm": "yake",
|
||||
"max_keywords": 10,
|
||||
"min_score": 0.1,
|
||||
"ngram_range": [1, 3],
|
||||
"language": "en"
|
||||
},
|
||||
"token_reduction": {
|
||||
"mode": "moderate",
|
||||
"preserve_important_words": true
|
||||
},
|
||||
"postprocessor": {
|
||||
"enabled": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content: \(result.content().toString())")
|
||||
if let languages = result.detected_languages() {
|
||||
let langs = languages.map { $0.toString() }
|
||||
print("Languages: \(langs)")
|
||||
}
|
||||
if let chunks = result.chunks() {
|
||||
print("Chunks: \(chunks.count)")
|
||||
}
|
||||
```
|
||||
27
docs/snippets/swift/config/chunking_config.md
Normal file
27
docs/snippets/swift/config/chunking_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 1000,
|
||||
"overlap": 100,
|
||||
"chunker_type": "markdown",
|
||||
"prepend_heading_context": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.md", nil, config)
|
||||
|
||||
if let chunks = result.chunks() {
|
||||
print("Chunks: \(chunks.count)")
|
||||
for chunk in chunks {
|
||||
let content = chunk.content().toString()
|
||||
print("Length: \(content.count)")
|
||||
}
|
||||
}
|
||||
```
|
||||
17
docs/snippets/swift/config/config_basic.md
Normal file
17
docs/snippets/swift/config/config_basic.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"use_cache": true,
|
||||
"enable_quality_processing": true
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
```
|
||||
21
docs/snippets/swift/config/config_discover.md
Normal file
21
docs/snippets/swift/config/config_discover.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Swift bindings build configs via JSON. To honor an on-disk
|
||||
// `kreuzberg.{toml,yaml,json}`, load the file and pass its JSON
|
||||
// representation to `extractionConfigFromJson`. Unknown formats
|
||||
// can be normalized to JSON on the caller side.
|
||||
let configJson: String
|
||||
if let data = try? Data(contentsOf: URL(fileURLWithPath: "kreuzberg.json")),
|
||||
let text = String(data: data, encoding: .utf8) {
|
||||
configJson = text
|
||||
} else {
|
||||
configJson = "{}"
|
||||
}
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
print(result.content().toString())
|
||||
```
|
||||
20
docs/snippets/swift/config/config_ocr.md
Normal file
20
docs/snippets/swift/config/config_ocr.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("scanned.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
print("Tables detected: \(result.tables().count)")
|
||||
```
|
||||
28
docs/snippets/swift/config/config_programmatic.md
Normal file
28
docs/snippets/swift/config/config_programmatic.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"use_cache": true,
|
||||
"enable_quality_processing": true,
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng+deu",
|
||||
"tesseract_config": {
|
||||
"psm": 6
|
||||
}
|
||||
},
|
||||
"chunking": {
|
||||
"max_characters": 1000,
|
||||
"overlap": 200
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
```
|
||||
18
docs/snippets/swift/config/document_structure_config.md
Normal file
18
docs/snippets/swift/config/document_structure_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"include_document_structure": true
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let document = result.document() {
|
||||
print("Document nodes: \(document.nodes().count)")
|
||||
}
|
||||
```
|
||||
22
docs/snippets/swift/config/element_based_output.md
Normal file
22
docs/snippets/swift/config/element_based_output.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"result_format": "element_based"
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let elements = result.elements() {
|
||||
print("Elements: \(elements.count)")
|
||||
for element in elements {
|
||||
print("Type: \(element.element_type().toString())")
|
||||
print("Text: \(element.text().toString().prefix(100))")
|
||||
}
|
||||
}
|
||||
```
|
||||
27
docs/snippets/swift/config/embedding_config.md
Normal file
27
docs/snippets/swift/config/embedding_config.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 1000,
|
||||
"overlap": 200,
|
||||
"embedding": {
|
||||
"model": {"preset": {"name": "balanced"}},
|
||||
"batch_size": 16,
|
||||
"normalize": true,
|
||||
"show_download_progress": true
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let chunks = result.chunks() {
|
||||
print("Chunks with embeddings: \(chunks.count)")
|
||||
}
|
||||
```
|
||||
19
docs/snippets/swift/config/html_output.md
Normal file
19
docs/snippets/swift/config/html_output.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"output_format": "html",
|
||||
"html_output": {
|
||||
"theme": "github"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString()) // HTML with kb-* classes
|
||||
```
|
||||
23
docs/snippets/swift/config/keyword_extraction_config.md
Normal file
23
docs/snippets/swift/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"keywords": {
|
||||
"algorithm": "yake",
|
||||
"max_keywords": 10,
|
||||
"min_score": 0.1,
|
||||
"ngram_range": [1, 3],
|
||||
"language": "en"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Keywords extracted from document")
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
```
|
||||
23
docs/snippets/swift/config/language_detection_config.md
Normal file
23
docs/snippets/swift/config/language_detection_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"min_confidence": 0.8,
|
||||
"detect_multiple": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let languages = result.detected_languages() {
|
||||
let langs = languages.map { $0.toString() }
|
||||
print("Detected languages: \(langs)")
|
||||
}
|
||||
```
|
||||
23
docs/snippets/swift/config/ocr_dpi_config.md
Normal file
23
docs/snippets/swift/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"images": {
|
||||
"extract_images": true,
|
||||
"target_dpi": 300,
|
||||
"max_image_dimension": 4096,
|
||||
"auto_adjust_dpi": true,
|
||||
"min_dpi": 150,
|
||||
"max_dpi": 600
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
```
|
||||
20
docs/snippets/swift/config/pdf_config.md
Normal file
20
docs/snippets/swift/config/pdf_config.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"pdf_options": {
|
||||
"extract_images": true,
|
||||
"passwords": ["password123"],
|
||||
"extract_metadata": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("encrypted.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
```
|
||||
24
docs/snippets/swift/config/pdf_hierarchy_config.md
Normal file
24
docs/snippets/swift/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"pdf_options": {
|
||||
"hierarchy": {
|
||||
"enabled": true,
|
||||
"detection_threshold": 0.75,
|
||||
"ocr_coverage_threshold": 0.8,
|
||||
"min_level": 1,
|
||||
"max_level": 5
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
```
|
||||
19
docs/snippets/swift/config/postprocessor_config.md
Normal file
19
docs/snippets/swift/config/postprocessor_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"postprocessor": {
|
||||
"enabled": true,
|
||||
"enabled_processors": ["whitespace_normalizer", "unicode_normalizer"]
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Processed content: \(result.content().toString())")
|
||||
```
|
||||
18
docs/snippets/swift/config/quality_processing_config.md
Normal file
18
docs/snippets/swift/config/quality_processing_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"enable_quality_processing": true,
|
||||
"use_cache": true
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content length: \(result.content().toString().count)")
|
||||
print("Tables: \(result.tables().count)")
|
||||
```
|
||||
23
docs/snippets/swift/config/tesseract_config.md
Normal file
23
docs/snippets/swift/config/tesseract_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng+deu",
|
||||
"tesseract_config": {
|
||||
"psm": 6,
|
||||
"oem": 3
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("scanned.pdf", nil, config)
|
||||
|
||||
print("OCR text: \(result.content().toString())")
|
||||
```
|
||||
19
docs/snippets/swift/config/token_reduction_config.md
Normal file
19
docs/snippets/swift/config/token_reduction_config.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"token_reduction": {
|
||||
"mode": "moderate",
|
||||
"preserve_important_words": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Reduced content length: \(result.content().toString().count)")
|
||||
```
|
||||
11
docs/snippets/swift/getting-started/basic_usage.md
Normal file
11
docs/snippets/swift/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,11 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let config = try extractionConfigFromJson("{}")
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
print("MIME type: \(result.mime_type().toString())")
|
||||
```
|
||||
12
docs/snippets/swift/getting-started/extract_file.md
Normal file
12
docs/snippets/swift/getting-started/extract_file.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let config = try extractionConfigFromJson("{}")
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("Content: \(result.content().toString())")
|
||||
print("MIME type: \(result.mime_type().toString())")
|
||||
print("Tables: \(result.tables().count)")
|
||||
```
|
||||
21
docs/snippets/swift/getting-started/extract_with_ocr.md
Normal file
21
docs/snippets/swift/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"force_ocr": true,
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("scanned.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
print("MIME type: \(result.mime_type().toString())")
|
||||
```
|
||||
12
docs/snippets/swift/getting-started/hello_world.md
Normal file
12
docs/snippets/swift/getting-started/hello_world.md
Normal file
@@ -0,0 +1,12 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
print("Hello")
|
||||
|
||||
let config = try extractionConfigFromJson("{}")
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print("MIME type: \(result.mime_type().toString())")
|
||||
```
|
||||
9
docs/snippets/swift/getting-started/install_verify.md
Normal file
9
docs/snippets/swift/getting-started/install_verify.md
Normal file
@@ -0,0 +1,9 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let config = try extractionConfigFromJson("{}")
|
||||
print("Kreuzberg Swift binding loaded successfully")
|
||||
print("Default config built: \(config)")
|
||||
```
|
||||
31
docs/snippets/swift/getting-started/read_content.md
Normal file
31
docs/snippets/swift/getting-started/read_content.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 800,
|
||||
"overlap": 100,
|
||||
"chunker_type": "markdown"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
let tables = result.tables()
|
||||
print("Tables: \(tables.count)")
|
||||
for (index, _) in tables.enumerated() {
|
||||
print("Table \(index)")
|
||||
}
|
||||
|
||||
if let chunks = result.chunks() {
|
||||
print("Chunks: \(chunks.count)")
|
||||
for (index, _) in chunks.enumerated() {
|
||||
print("Chunk \(index)")
|
||||
}
|
||||
}
|
||||
```
|
||||
33
docs/snippets/swift/llm/structured_extraction.md
Normal file
33
docs/snippets/swift/llm/structured_extraction.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"structured_extraction": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": { "type": "string" },
|
||||
"authors": { "type": "array", "items": { "type": "string" } },
|
||||
"date": { "type": "string" }
|
||||
},
|
||||
"required": ["title", "authors", "date"],
|
||||
"additionalProperties": false
|
||||
},
|
||||
"llm": {
|
||||
"model": "openai/gpt-4o-mini"
|
||||
},
|
||||
"strict": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("paper.pdf", nil, config)
|
||||
|
||||
if let structured = result.structured_output() {
|
||||
print(structured.toString())
|
||||
}
|
||||
```
|
||||
39
docs/snippets/swift/mcp/mcp_custom_client.md
Normal file
39
docs/snippets/swift/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,39 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
|
||||
// Launch the kreuzberg CLI in MCP mode and talk to it over stdio.
let process = Process()
process.executableURL = URL(fileURLWithPath: "/usr/bin/env")
process.arguments = ["kreuzberg", "mcp"]

let inputPipe = Pipe()
let outputPipe = Pipe()
process.standardInput = inputPipe
process.standardOutput = outputPipe

try process.run()

// MCP messages are JSON-RPC 2.0: every request needs the protocol version
// and an `id`. A message without an `id` is a notification and gets no reply.
// NOTE: the MCP lifecycle expects an `initialize` request (followed by a
// `notifications/initialized` notification) before `tools/call`; that
// handshake is omitted here to keep the example short.
let request: [String: Any] = [
    "jsonrpc": "2.0",
    "id": 1,
    "method": "tools/call",
    "params": [
        "name": "extract_file",
        "arguments": [
            "path": "document.pdf",
            "async": true,
        ],
    ],
]

// The stdio transport is newline-delimited: one JSON object per line.
var message = try JSONSerialization.data(withJSONObject: request)
message.append(Data("\n".utf8))
inputPipe.fileHandleForWriting.write(message)
try inputPipe.fileHandleForWriting.close()

// Read whatever the server has written so far and print it.
let data = outputPipe.fileHandleForReading.availableData
if let line = String(data: data, encoding: .utf8) {
    print(line)
}

process.waitUntilExit()
|
||||
```
|
||||
15
docs/snippets/swift/mcp/mcp_server_start.md
Normal file
15
docs/snippets/swift/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,15 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
|
||||
// Start the kreuzberg MCP server as a subprocess.
|
||||
// The Swift bindings do not expose an in-process MCP server; use the
|
||||
// kreuzberg CLI binary which provides the MCP transport over stdio.
|
||||
let process = Process()
|
||||
process.executableURL = URL(fileURLWithPath: "/usr/bin/env")
|
||||
process.arguments = ["kreuzberg", "mcp"]
|
||||
|
||||
try process.run()
|
||||
process.waitUntilExit()
|
||||
```
|
||||
25
docs/snippets/swift/metadata/language_detection.md
Normal file
25
docs/snippets/swift/metadata/language_detection.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"min_confidence": 0.8,
|
||||
"detect_multiple": false
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let languages = result.detected_languages() {
|
||||
let langs = languages.map { $0.toString() }
|
||||
print("Detected languages: \(langs)")
|
||||
} else {
|
||||
print("No languages detected")
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"min_confidence": 0.8,
|
||||
"detect_multiple": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("multilingual_document.pdf", nil, config)
|
||||
|
||||
if let languages = result.detected_languages() {
|
||||
let langs = languages.map { $0.toString() }
|
||||
print("Detected languages: \(langs)")
|
||||
}
|
||||
```
|
||||
43
docs/snippets/swift/metadata/metadata.md
Normal file
43
docs/snippets/swift/metadata/metadata.md
Normal file
@@ -0,0 +1,43 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let config = try extractionConfigFromJson("{}")
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
let metadata = result.metadata()
|
||||
|
||||
if let title = metadata.title() {
|
||||
print("Title: \(title.toString())")
|
||||
}
|
||||
if let subject = metadata.subject() {
|
||||
print("Subject: \(subject.toString())")
|
||||
}
|
||||
if let language = metadata.language() {
|
||||
print("Language: \(language.toString())")
|
||||
}
|
||||
if let createdAt = metadata.created_at() {
|
||||
print("Created at: \(createdAt.toString())")
|
||||
}
|
||||
if let modifiedAt = metadata.modified_at() {
|
||||
print("Modified at: \(modifiedAt.toString())")
|
||||
}
|
||||
if let createdBy = metadata.created_by() {
|
||||
print("Created by: \(createdBy.toString())")
|
||||
}
|
||||
if let authors = metadata.authors() {
|
||||
let names = authors.map { $0.toString() }
|
||||
print("Authors: \(names)")
|
||||
}
|
||||
if let keywords = metadata.keywords() {
|
||||
let words = keywords.map { $0.toString() }
|
||||
print("Keywords: \(words)")
|
||||
}
|
||||
if let duration = metadata.extraction_duration_ms() {
|
||||
print("Extraction duration (ms): \(duration)")
|
||||
}
|
||||
if let pages = metadata.pages() {
|
||||
print("Page count: \(pages.total_count())")
|
||||
}
|
||||
```
|
||||
35
docs/snippets/swift/metadata/page_boundaries.md
Normal file
35
docs/snippets/swift/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let config = try extractionConfigFromJson("{}")
let result = try extractFileSync("document.pdf", nil, config)

// Boundaries are used as offsets into the UTF-8 bytes of the content,
// so slice the byte array rather than the String's characters.
let content = result.content().toString()
let utf8 = Array(content.utf8)

guard let pageStructure = result.metadata().pages() else {
    print("No page structure available")
    exit(0)
}
guard let boundaries = pageStructure.boundaries() else {
    print("No page boundaries available")
    exit(0)
}

// Show at most the first three pages.
for boundary in boundaries.prefix(3) {
    let byteStart = boundary.byte_start()
    let byteEnd = boundary.byte_end()

    // Clamp to the bytes that actually exist so an out-of-range or inverted
    // boundary cannot trap on the slice below.
    let upper = min(Int(byteEnd), utf8.count)
    let lower = min(Int(byteStart), upper)

    // Lossy decoding keeps the page text even if a boundary happens to split
    // a multi-byte character, instead of collapsing the whole page to "".
    let pageText = String(decoding: utf8[lower..<upper], as: UTF8.self)
    // `prefix` already stops at the end of the string; no manual min needed.
    let preview = String(pageText.prefix(100))

    print("Page \(boundary.page_number()):")
    print(" Byte range: \(byteStart)-\(byteEnd)")
    print(" Preview: \(preview)...")
}
|
||||
```
|
||||
28
docs/snippets/swift/metadata/page_tracking_basic.md
Normal file
28
docs/snippets/swift/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"pages": {
|
||||
"extract_pages": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
if let pages = result.pages() {
|
||||
for page in pages {
|
||||
let pageContent = page.content().toString()
|
||||
print("Page \(page.page_number()):")
|
||||
print(" Content: \(pageContent.count) chars")
|
||||
print(" Tables: \(page.tables().count)")
|
||||
print(" Images: \(page.images().count)")
|
||||
}
|
||||
} else {
|
||||
print("No per-page content available")
|
||||
}
|
||||
```
|
||||
20
docs/snippets/swift/metadata/tables.md
Normal file
20
docs/snippets/swift/metadata/tables.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let config = try extractionConfigFromJson("{}")
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
let tables = result.tables()
|
||||
print("Tables: \(tables.count)")
|
||||
|
||||
for (index, table) in tables.enumerated() {
|
||||
print("Table \(index) on page \(table.page_number())")
|
||||
print(table.markdown().toString())
|
||||
|
||||
if let bbox = table.bounding_box() {
|
||||
print(" Bounding box: \(bbox.toString())")
|
||||
}
|
||||
}
|
||||
```
|
||||
55
docs/snippets/swift/metadata/vector_database_integration.md
Normal file
55
docs/snippets/swift/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
struct VectorRecord {
|
||||
let id: String
|
||||
let content: String
|
||||
let embedding: [Float]
|
||||
let metadata: [String: String]
|
||||
}
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 512,
|
||||
"overlap": 50,
|
||||
"embedding": {
|
||||
"model": {"preset": {"name": "balanced"}},
|
||||
"batch_size": 32,
|
||||
"normalize": true
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let documentId = "doc_001"
let config = try extractionConfigFromJson(configJson)
let result = try extractFileSync("document.pdf", nil, config)

// One record per chunk that carries an embedding; chunks without one are skipped.
var records: [VectorRecord] = []

if let chunks = result.chunks() {
    for (position, chunk) in chunks.enumerated() {
        guard let embedding = chunk.embedding() else { continue }

        let text = chunk.content().toString()

        // Metadata values are stored as strings alongside the vector.
        let attributes: [String: String] = [
            "document_id": documentId,
            "chunk_index": String(position),
            "content_length": String(text.count),
        ]

        let record = VectorRecord(
            id: "\(documentId)_chunk_\(position)",
            content: text,
            embedding: Array(embedding),
            metadata: attributes
        )
        records.append(record)
    }
}

print("Generated \(records.count) vector records")
|
||||
```
|
||||
22
docs/snippets/swift/ocr/cloud_ocr_backend.md
Normal file
22
docs/snippets/swift/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Custom/cloud OCR backends are registered via the Rust plugin system.
|
||||
// From Swift, select a registered custom backend by name through the
|
||||
// JSON configuration:
|
||||
let configJson = """
|
||||
{
|
||||
"ocr": {
|
||||
"backend": "custom",
|
||||
"language": "eng"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("scanned.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
```
|
||||
18
docs/snippets/swift/ocr/image_extraction.md
Normal file
18
docs/snippets/swift/ocr/image_extraction.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"images": {
|
||||
"extract_images": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
```
|
||||
20
docs/snippets/swift/ocr/image_preprocessing.md
Normal file
20
docs/snippets/swift/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"images": {
|
||||
"extract_images": true,
|
||||
"target_dpi": 300,
|
||||
"max_image_dimension": 2000
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
```
|
||||
19
docs/snippets/swift/ocr/ocr_easyocr.md
Normal file
19
docs/snippets/swift/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"ocr": {
|
||||
"backend": "easyocr",
|
||||
"language": "en"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
```
|
||||
26
docs/snippets/swift/ocr/ocr_elements.md
Normal file
26
docs/snippets/swift/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"ocr": {
|
||||
"backend": "paddleocr",
|
||||
"language": "en",
|
||||
"element_config": {
|
||||
"include_elements": true
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("scanned.pdf", nil, config)
|
||||
|
||||
if let elements = result.ocr_elements() {
|
||||
for element in elements {
|
||||
print("Text: \(element.text().toString())")
|
||||
}
|
||||
}
|
||||
```
|
||||
19
docs/snippets/swift/ocr/ocr_extraction.md
Normal file
19
docs/snippets/swift/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("scanned.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
```
|
||||
20
docs/snippets/swift/ocr/ocr_force_all_pages.md
Normal file
20
docs/snippets/swift/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"force_ocr": true,
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
```
|
||||
19
docs/snippets/swift/ocr/ocr_multi_language.md
Normal file
19
docs/snippets/swift/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"ocr": {
|
||||
"backend": "tesseract",
|
||||
"language": "eng+deu+fra"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("multilingual.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
```
|
||||
19
docs/snippets/swift/ocr/ocr_paddleocr.md
Normal file
19
docs/snippets/swift/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"ocr": {
|
||||
"backend": "paddleocr",
|
||||
"language": "en"
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
let config = try extractionConfigFromJson(configJson)
|
||||
let result = try extractFileSync("document.pdf", nil, config)
|
||||
|
||||
print(result.content().toString())
|
||||
```
|
||||
13
docs/snippets/swift/plugins/clear_plugins.md
Normal file
13
docs/snippets/swift/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
|
||||
// Clear all registered plugins in each registry
|
||||
try Kreuzberg.clearDocumentExtractors()
|
||||
try Kreuzberg.clearRenderers()
|
||||
try Kreuzberg.clearOcrBackends()
|
||||
try Kreuzberg.clearPostProcessors()
|
||||
try Kreuzberg.clearValidators()
|
||||
try Kreuzberg.clearEmbeddingBackends()
|
||||
|
||||
print("All plugins cleared")
|
||||
```
|
||||
63
docs/snippets/swift/plugins/embedding_backend.md
Normal file
63
docs/snippets/swift/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
import Kreuzberg
|
||||
|
||||
// Wrap a custom embedder (e.g., CoreML, ONNX, API-based).
|
||||
// The Swift class must implement the EmbeddingBackend protocol.
|
||||
/// Example embedding backend that returns placeholder vectors.
///
/// Each hook that can fail returns a JSON-encoded result envelope:
/// `{"ok": <value>}` on success or `{"err": "<message>"}` on failure.
final class MyEmbedder: EmbeddingBackend {
    /// Fixed dimensionality of every vector this backend produces.
    private static let vectorDimensions = 768

    private let modelUrl: URL

    init(modelUrl: URL) {
        self.modelUrl = modelUrl
    }

    // Plugin trait hooks
    func name() -> String {
        "my-embedder"
    }

    func version() -> String {
        "1.0.0"
    }

    func initialize() -> String { // Returns JSON-encoded Result
        // Warm-up logic here. If the warm-up can throw, wrap it in do/catch
        // and return `Self.errorEnvelope(...)` from the catch branch.
        "{\"ok\": null}"
    }

    func shutdown() -> String { // Returns JSON-encoded Result
        "{\"ok\": null}"
    }

    // EmbeddingBackend hooks
    func dimensions() -> UInt {
        UInt(Self.vectorDimensions)
    }

    func embed(texts: [String]) -> String { // Returns JSON-encoded Vec<Vec<f32>>
        do {
            // Embed texts using your backend (e.g., CoreML inference)
            let embeddings: [[Float]] = texts.map { _ in
                Array(repeating: 0.5, count: Self.vectorDimensions) // Placeholder
            }
            let data = try JSONEncoder().encode(embeddings)
            return "{\"ok\": \(String(decoding: data, as: UTF8.self))}"
        } catch {
            return Self.errorEnvelope("Embedding failed: \(error)")
        }
    }

    /// Builds an `{"err": ...}` envelope with the message properly JSON-escaped,
    /// so quotes or backslashes in an error description cannot break the payload.
    private static func errorEnvelope(_ message: String) -> String {
        guard let data = try? JSONEncoder().encode(["err": message]) else {
            return "{\"err\": \"unknown error\"}"
        }
        return String(decoding: data, as: UTF8.self)
    }
}
|
||||
|
||||
// Register once at startup
|
||||
let embedder = MyEmbedder(modelUrl: URL(fileURLWithPath: "/path/to/model"))
|
||||
try Kreuzberg.registerEmbeddingBackend(embedder)
|
||||
|
||||
print("Embedding backend 'my-embedder' registered")
|
||||
// The registered backend can now be referenced by name in EmbeddingConfig
|
||||
// via the plugin selection mechanism once alef supports it
|
||||
```
|
||||
14
docs/snippets/swift/plugins/extractor_registration.md
Normal file
14
docs/snippets/swift/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,14 @@
|
||||
<!-- snippet:skip reason="swift-bridge does not generate Swift-side protocol constructors for plugin registration. The Rust-side FFI defines SwiftDocumentExtractorBox as an opaque extern \"Swift\" type, but swift-bridge does not surface the protocol definition or constructor in the generated Swift package. Custom implementations must be written in Rust." -->
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
|
||||
// Custom DocumentExtractor registration is not available from Swift.
|
||||
//
|
||||
// The Rust FFI defines SwiftDocumentExtractorBox as an opaque extern "Swift" type
|
||||
// (packages/swift/rust/src/lib.rs, lines 2710-2722), but the swift-bridge code
|
||||
// generator does not emit a Swift-side protocol definition or factory to construct
|
||||
// and register instances.
|
||||
//
|
||||
// Workaround: Implement DocumentExtractor in Rust and register via a Rust FFI shim,
|
||||
// or use the built-in extractors (PDF, DOCX, HTML, etc.) which are pre-registered.
|
||||
```
|
||||
17
docs/snippets/swift/plugins/list_plugins.md
Normal file
17
docs/snippets/swift/plugins/list_plugins.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
|
||||
// Query every plugin registry first, so that a failing lookup throws before
// anything is printed.
let extractors = try Kreuzberg.listDocumentExtractors()
let renderers = try Kreuzberg.listRenderers()
let processors = try Kreuzberg.listPostProcessors()
let ocrBackends = try Kreuzberg.listOcrBackends()
let validators = try Kreuzberg.listValidators()
let embeddingBackends = try Kreuzberg.listEmbeddingBackends()

// One report line per registry, in the same order the registries were queried.
let reportLines = [
    "Extractors: \(extractors)",
    "Renderers: \(renderers)",
    "Processors: \(processors)",
    "OCR backends: \(ocrBackends)",
    "Validators: \(validators)",
    "Embedding backends: \(embeddingBackends)",
]
for line in reportLines {
    print(line)
}
|
||||
```
|
||||
48
docs/snippets/swift/plugins/min_length_validator.md
Normal file
48
docs/snippets/swift/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,48 @@
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
|
||||
/// Validator that rejects extraction results whose content is shorter than a
/// configurable minimum.
final class MinLengthValidator: Validator {
    /// Smallest `result.content().count` that passes validation.
    let minLength: Int

    /// - Parameter minLength: Minimum accepted content length (defaults to 100).
    init(minLength: Int = 100) {
        self.minLength = minLength
    }

    // MARK: - Plugin identity

    func name() -> String { "min_length_validator" }

    func version() -> String { "1.0.0" }

    func priority() -> Int32 { 100 }

    // MARK: - Lifecycle

    func initialize() -> String { "{\"ok\": null}" }

    func shutdown() -> String { "{\"ok\": null}" }

    // MARK: - Validation

    /// This validator applies to every result.
    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        true
    }

    /// Checks the content length against `minLength`.
    ///
    /// - Returns: A JSON-encoded `Result<(), String>`: `{"ok": null}` when the
    ///   content is long enough, otherwise `{"err": ...}` with both lengths.
    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        let observedLength = result.content().count
        guard observedLength < minLength else {
            return "{\"ok\": null}"
        }
        return "{\"err\": \"Content too short: \(observedLength) < \(minLength)\"}"
    }
}

// Register an instance that requires at least 100 units of content.
let validator = MinLengthValidator(minLength: 100)
try Kreuzberg.registerValidator(validator)
|
||||
```
|
||||
13
docs/snippets/swift/plugins/pdf_metadata_extractor.md
Normal file
13
docs/snippets/swift/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,13 @@
|
||||
<!-- snippet:skip reason="swift-bridge 0.1.59 does not expose SwiftDocumentExtractorBox constructor or protocol definition in generated Swift code. Custom extractors must be implemented in Rust and registered via FFI shim." -->
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
|
||||
// Custom DocumentExtractor registration is not available from Swift.
|
||||
//
|
||||
// The FFI defines SwiftDocumentExtractorBox opaque type (packages/swift/rust/src/lib.rs),
|
||||
// but swift-bridge's Swift code generator does not emit the protocol definition or
|
||||
// factory required to construct and register instances from Swift.
|
||||
//
|
||||
// Workaround: Augment PDF extraction results by implementing a PostProcessor in Rust,
|
||||
// or post-process ExtractionResult.metadata in Swift after extraction.
|
||||
```
|
||||
46
docs/snippets/swift/plugins/pdf_only_processor.md
Normal file
46
docs/snippets/swift/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
|
||||
/// Post-processor that opts in only for results whose MIME type is
/// `application/pdf`. The processing step itself is a no-op.
final class PdfOnlyProcessor: PostProcessor {
    // MARK: - Plugin identity

    func name() -> String { "pdf-only-processor" }

    func version() -> String { "1.0.0" }

    // MARK: - Scheduling

    /// Name of the `ProcessingStage` enum case this processor runs in.
    func processingStage() -> String { "middle" }

    /// Default priority.
    func priority() -> Int32 { 50 }

    /// No processing overhead.
    func estimatedDurationMs(result: ExtractionResult) -> UInt64 { 0 }

    // MARK: - Lifecycle

    func initialize() -> String { "{\"ok\": null}" }

    func shutdown() -> String { "{\"ok\": null}" }

    // MARK: - Processing

    /// Accepts a result only when its MIME type is exactly `application/pdf`.
    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        let mimeType = result.mimeType()
        return mimeType == "application/pdf"
    }

    /// No-op post-processing step for PDF-only processing.
    ///
    /// - Returns: A JSON-encoded `Result<(), String>`; always `{"ok": null}`.
    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        "{\"ok\": null}"
    }
}

let processor = PdfOnlyProcessor()
try Kreuzberg.registerPostProcessor(processor)
|
||||
```
|
||||
13
docs/snippets/swift/plugins/plugin_extractor.md
Normal file
13
docs/snippets/swift/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,13 @@
|
||||
<!-- snippet:skip reason="swift-bridge 0.1.59 does not expose SwiftDocumentExtractorBox constructor or protocol definition in generated Swift code. Custom extractors must be implemented in Rust and registered via FFI shim." -->
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
|
||||
// Custom DocumentExtractor registration is not available from Swift.
|
||||
//
|
||||
// The Rust FFI (packages/swift/rust/src/lib.rs) accepts SwiftDocumentExtractorBox,
|
||||
// but swift-bridge does not generate the Swift-side protocol definition or
|
||||
// constructor required to implement and register instances.
|
||||
//
|
||||
// Solution: Implement DocumentExtractor in Rust and wrap it in a Rust FFI shim
|
||||
// that links both `kreuzberg` and the `kreuzberg-swift` package.
|
||||
```
|
||||
56
docs/snippets/swift/plugins/plugin_logging.md
Normal file
56
docs/snippets/swift/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,56 @@
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
import os.log
|
||||
|
||||
let logger = Logger(subsystem: "com.example.plugins", category: "MyPlugin")
|
||||
|
||||
/// Post-processor that demonstrates logging from plugin lifecycle and
/// processing hooks through the file-level `logger`.
final class MyPlugin: PostProcessor {
    func name() -> String {
        "my-plugin"
    }

    func version() -> String {
        "1.0.0"
    }

    /// Logs that the plugin is starting and reports success.
    func initialize() -> String {
        logger.info("Initializing plugin: my-plugin")
        return "{\"ok\": null}"
    }

    /// Logs that the plugin is stopping and reports success.
    func shutdown() -> String {
        logger.info("Shutting down plugin: my-plugin")
        return "{\"ok\": null}"
    }

    /// Logs the MIME type and content length of each result, and warns when
    /// the content is empty. The result itself is left untouched.
    ///
    /// - Returns: A JSON-encoded result; always `{"ok": null}`.
    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        // `count` is an element count, not a byte count, so the log message
        // labels it as characters.
        // NOTE(review): assumes content() is a Swift String - confirm.
        let contentLen = result.content().count
        logger.info("Processing \(result.mimeType()) (\(contentLen) characters)")

        if contentLen == 0 {
            logger.warning("Processing resulted in empty content")
        }

        return "{\"ok\": null}"
    }

    /// This plugin processes every result.
    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        true
    }

    func processingStage() -> String {
        "early"
    }

    func priority() -> Int32 {
        50
    }

    func estimatedDurationMs(result: ExtractionResult) -> UInt64 {
        10
    }
}

let plugin = MyPlugin()
try Kreuzberg.registerPostProcessor(plugin)
|
||||
```
|
||||
46
docs/snippets/swift/plugins/plugin_testing.md
Normal file
46
docs/snippets/swift/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
import Testing
|
||||
|
||||
// Unit test a Swift Validator implementation
|
||||
/// Validator under test: rejects results whose content is shorter than
/// `minLength`.
final class MinLengthValidator: Validator {
    /// Smallest `result.content().count` that passes validation.
    let minLength: Int

    init(minLength: Int = 100) {
        self.minLength = minLength
    }

    func name() -> String { "test-validator" }
    func version() -> String { "1.0.0" }
    func priority() -> Int32 { 50 }
    func initialize() -> String { "{\"ok\": null}" }
    func shutdown() -> String { "{\"ok\": null}" }

    /// - Returns: `{"ok": null}` when the content is long enough, otherwise
    ///   `{"err": ...}` describing the shortfall.
    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        let contentLength = result.content().count
        if contentLength < minLength {
            return "{\"err\": \"Content too short: \(contentLength) < \(minLength)\"}"
        }
        return "{\"ok\": null}"
    }

    /// This validator applies to every result.
    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        true
    }
}

// Integration test: exercise the validator in-pipeline during extraction.
// The validator has to be registered first; an unregistered instance is
// never invoked by the extraction pipeline.
let validator = MinLengthValidator(minLength: 100)
try Kreuzberg.registerValidator(validator)

// Create the extraction config via the binding.
let configJson = "{\"use_cache\": false}"
let config = try extractionConfigFromJson(configJson)

// Extract a document. The registered validator's validate() method is invoked
// in-pipeline; if it rejects the result, the extraction throws an error.
let result = try extractFile(path: "test.txt", mimeType: "text/plain", config: config)
|
||||
```
|
||||
55
docs/snippets/swift/plugins/plugin_validator.md
Normal file
55
docs/snippets/swift/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
|
||||
/// Validator that rejects results whose content is shorter than 50.
final class MinLengthValidator: Validator {
    // MARK: - Plugin identity

    func name() -> String { "min_length" }

    func version() -> String { "1.0.0" }

    func priority() -> Int32 { 50 }

    // MARK: - Lifecycle

    func initialize() -> String { "{\"ok\": null}" }

    func shutdown() -> String { "{\"ok\": null}" }

    // MARK: - Validation

    /// This validator applies to every result.
    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        true
    }

    /// - Returns: `{"ok": null}` when the content length is at least 50,
    ///   otherwise `{"err": ...}` carrying the observed length.
    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        let observedLength = result.content().count
        guard observedLength < 50 else {
            return "{\"ok\": null}"
        }
        return "{\"err\": \"Content too short: \(observedLength)\"}"
    }
}

let validator = MinLengthValidator()
try Kreuzberg.registerValidator(validator)

// The registered validator runs in-pipeline while the file is extracted.
let extractionConfig = ExtractionConfig(
    useCache: false,
    enableQualityProcessing: false,
    resultFormat: .unified,
    outputFormat: .markdown
)
let extraction = try extractFileSync(
    path: "document.pdf",
    mimeType: nil,
    config: extractionConfig
)
let extractedLength = extraction.content().count
print("Content length: \(extractedLength)")
|
||||
```
|
||||
54
docs/snippets/swift/plugins/quality_score_validator.md
Normal file
54
docs/snippets/swift/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,54 @@
|
||||
```swift title="Swift"
|
||||
import Foundation  // String(format:) is a Foundation API
import Kreuzberg

/// Validator that rejects results whose `quality_score` metadata entry is
/// below `threshold`.
final class QualityValidator: Validator {
    /// Minimum acceptable quality score.
    let threshold: Double = 0.5

    func name() -> String {
        "quality-validator"
    }

    func version() -> String {
        "1.0.0"
    }

    func priority() -> Int32 {
        75
    }

    /// Reads `quality_score` from the result metadata and compares it with
    /// `threshold`. A missing or unparseable score is treated as `0.0`.
    ///
    /// - Returns: `{"ok": null}` when the score meets the threshold, otherwise
    ///   `{"err": ...}` with the score formatted to two decimals.
    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        let metadata = result.metadata()
        let qualityScore: Double

        if let scoreStr = metadata["quality_score"] as? String,
           let score = Double(scoreStr) {
            qualityScore = score
        } else {
            qualityScore = 0.0
        }

        if qualityScore < threshold {
            let message = "Quality score too low: \(String(format: "%.2f", qualityScore))"
            return "{\"err\": \"\(message)\"}"
        }
        return "{\"ok\": null}"
    }

    /// Only validate if quality processing was enabled in the config.
    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        config.enableQualityProcessing()
    }

    func initialize() -> String {
        "{\"ok\": null}"
    }

    func shutdown() -> String {
        "{\"ok\": null}"
    }
}

let validator = QualityValidator()
try Kreuzberg.registerValidator(validator)
|
||||
```
|
||||
66
docs/snippets/swift/plugins/stateful_plugin.md
Normal file
66
docs/snippets/swift/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,66 @@
|
||||
```swift title="Swift"
|
||||
import Foundation  // NSLock is a Foundation type
import Kreuzberg

/// Post-processor that keeps mutable state (a call counter and a small cache)
/// and guards every access to it with a lock.
final class StatefulPlugin: PostProcessor {
    /// Guards `callCount` and `cache`. The lock is never reassigned, so it is
    /// a constant.
    private let lock = NSLock()
    private var callCount: Int = 0
    private var cache: [String: String] = [:]

    func name() -> String {
        "stateful-plugin"
    }

    func version() -> String {
        "1.0.0"
    }

    func processingStage() -> String {
        "middle"
    }

    func priority() -> Int32 {
        50
    }

    /// Increments the call counter and records the MIME type of the latest
    /// result and the running count in the cache.
    ///
    /// - Returns: A JSON-encoded result; always `{"ok": null}`.
    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        lock.lock()
        defer { lock.unlock() }

        callCount += 1
        cache["last_mime"] = result.mimeType()
        cache["call_count"] = String(callCount)

        return "{\"ok\": null}"
    }

    /// This plugin processes every result.
    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        true
    }

    /// Minimal overhead.
    func estimatedDurationMs(result: ExtractionResult) -> UInt64 {
        1
    }

    /// Resets the counter and empties the cache.
    func initialize() -> String {
        lock.lock()
        defer { lock.unlock() }
        callCount = 0
        cache.removeAll()
        return "{\"ok\": null}"
    }

    /// Empties the cache and prints how many extractions were processed.
    func shutdown() -> String {
        lock.lock()
        defer { lock.unlock() }
        let finalCount = callCount
        cache.removeAll()
        let message = "Processed \(finalCount) extractions"
        print(message)
        return "{\"ok\": null}"
    }
}

let plugin = StatefulPlugin()
try Kreuzberg.registerPostProcessor(plugin)
|
||||
```
|
||||
17
docs/snippets/swift/plugins/unregister_plugins.md
Normal file
17
docs/snippets/swift/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,17 @@
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
|
||||
// Names under which the plugins were registered, one per plugin kind.
let extractorName = "custom-json-extractor"
let postProcessorName = "word_count"
let ocrBackendName = "cloud-ocr"
let validatorName = "min_length_validator"

// Unregister in a fixed order; the first failure throws and skips the rest.
try Kreuzberg.unregisterDocumentExtractor(extractorName)
try Kreuzberg.unregisterPostProcessor(postProcessorName)
try Kreuzberg.unregisterOcrBackend(ocrBackendName)
try Kreuzberg.unregisterValidator(validatorName)

print("Plugins unregistered")
|
||||
```
|
||||
49
docs/snippets/swift/plugins/word_count_processor.md
Normal file
49
docs/snippets/swift/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,49 @@
|
||||
```swift title="Swift"
|
||||
import Kreuzberg
|
||||
|
||||
/// Post-processor that counts the words in the extracted content.
final class WordCountProcessor: PostProcessor {
    func name() -> String {
        "word_count"
    }

    func version() -> String {
        "1.0.0"
    }

    func processingStage() -> String {
        "early"
    }

    func priority() -> Int32 {
        50
    }

    /// Counts whitespace-separated words and reports the count on stdout.
    ///
    /// - Returns: A JSON-encoded result; always `{"ok": null}`.
    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        // Split on any whitespace (spaces, tabs, newlines), not just a single
        // space, so multi-line content is counted correctly.
        let wordCount = result.content()
            .split(whereSeparator: { $0.isWhitespace })
            .count

        // Metadata is not directly mutable via the FFI, so the count is
        // reported through a side channel (stdout) instead of being dropped.
        print("word_count: \(wordCount)")
        return "{\"ok\": null}"
    }

    /// Skips results that have no content.
    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        !result.content().isEmpty
    }

    func estimatedDurationMs(result: ExtractionResult) -> UInt64 {
        5
    }

    func initialize() -> String {
        "{\"ok\": null}"
    }

    func shutdown() -> String {
        "{\"ok\": null}"
    }
}

let processor = WordCountProcessor()
try Kreuzberg.registerPostProcessor(processor)
|
||||
```
|
||||
25
docs/snippets/swift/utils/chunking.md
Normal file
25
docs/snippets/swift/utils/chunking.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Chunking settings: max_characters 1000 with an overlap of 200.
let chunkingConfigJson = """
{
  "chunking": {
    "max_characters": 1000,
    "overlap": 200
  }
}
"""

let extractionConfig = try extractionConfigFromJson(chunkingConfigJson)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)

// `chunks()` is optional; nothing is printed when it is nil.
if let documentChunks = extraction.chunks() {
    print("Chunks: \(documentChunks.count)")
    for documentChunk in documentChunks {
        let chunkInfo = documentChunk.metadata()
        // chunk_index() is shifted by one for display.
        let position = chunkInfo.chunk_index() + 1
        let total = chunkInfo.total_chunks()
        print("Chunk \(position)/\(total)")
    }
}
|
||||
```
|
||||
35
docs/snippets/swift/utils/chunking_rag.md
Normal file
35
docs/snippets/swift/utils/chunking_rag.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Chunking with per-chunk embeddings from the "balanced" preset, normalized.
let ragConfigJson = """
{
  "chunking": {
    "max_characters": 500,
    "overlap": 50,
    "embedding": {
      "model": {"preset": {"name": "balanced"}},
      "normalize": true
    }
  }
}
"""

let extractionConfig = try extractionConfigFromJson(ragConfigJson)
let extraction = try extractFileSync("research_paper.pdf", nil, extractionConfig)

if let documentChunks = extraction.chunks() {
    for documentChunk in documentChunks {
        let chunkInfo = documentChunk.metadata()
        // Show only the first 100 characters of each chunk.
        let text = documentChunk.content().toString()
        let excerpt = String(text.prefix(100))

        print("Chunk \(chunkInfo.chunk_index() + 1)/\(chunkInfo.total_chunks())")
        print("Position: \(chunkInfo.byte_start())-\(chunkInfo.byte_end())")
        print("Content: \(excerpt)...")

        // The embedding is optional; report its size only when present.
        if let vector = documentChunk.embedding() {
            print("Embedding: \(vector.count) dimensions")
        }
    }
}
|
||||
```
|
||||
32
docs/snippets/swift/utils/embedding_with_chunking.md
Normal file
32
docs/snippets/swift/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Chunking plus embedding generation in a single extraction pass.
let embeddingConfigJson = """
{
  "chunking": {
    "max_characters": 1024,
    "overlap": 100,
    "embedding": {
      "model": {"preset": {"name": "balanced"}},
      "normalize": true,
      "batch_size": 32,
      "show_download_progress": false
    }
  }
}
"""

let extractionConfig = try extractionConfigFromJson(embeddingConfigJson)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)

if let documentChunks = extraction.chunks() {
    print("Generated \(documentChunks.count) chunks")
    for documentChunk in documentChunks {
        // Chunks without an embedding are skipped silently.
        if let vector = documentChunk.embedding() {
            let chunkIndex = documentChunk.metadata().chunk_index()
            print("Chunk \(chunkIndex) -> \(vector.count)-dim embedding")
        }
    }
}
|
||||
```
|
||||
26
docs/snippets/swift/utils/keyword_extraction_example.md
Normal file
26
docs/snippets/swift/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Keyword extraction settings: YAKE, max_keywords 10, min_score 0.3.
let keywordConfigJson = """
{
  "keywords": {
    "algorithm": "yake",
    "max_keywords": 10,
    "min_score": 0.3
  }
}
"""

let extractionConfig = try extractionConfigFromJson(keywordConfigJson)
let extraction = try extractFileSync("research_paper.pdf", nil, extractionConfig)

// `extracted_keywords()` is optional; nothing is printed when it is nil.
if let extractedKeywords = extraction.extracted_keywords() {
    for entry in extractedKeywords {
        print("\(entry.text().toString()) (score: \(entry.score()))")
    }
}
|
||||
```
|
||||
22
docs/snippets/swift/utils/quality_processing_example.md
Normal file
22
docs/snippets/swift/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Turn on quality processing for the extraction.
let qualityConfigJson = """
{
  "enable_quality_processing": true
}
"""

let extractionConfig = try extractionConfigFromJson(qualityConfigJson)
let extraction = try extractFileSync("scanned_document.pdf", nil, extractionConfig)

// `quality_score()` is optional; scores below 0.5 are reported as a warning.
if let qualityScore = extraction.quality_score() {
    let template = qualityScore < 0.5
        ? "Warning: Low quality extraction (%.2f)"
        : "Quality score: %.2f"
    print(String(format: template, qualityScore))
}
|
||||
```
|
||||
21
docs/snippets/swift/utils/standalone_embed.md
Normal file
21
docs/snippets/swift/utils/standalone_embed.md
Normal file
@@ -0,0 +1,21 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Standalone embedding needs an EmbeddingConfig that is constructed directly.
// The Swift bindings expose `embedTexts` / `embedTextsAsync`, but
// EmbeddingConfig is an opaque proxy class with no JSON-config decoding.
// Either build it via the generated initializer, or use chunking-time
// embedding through `extractionConfigFromJson` (see embedding_with_chunking).
let texts = RustVec<RustString>()
for sample in ["Hello, world!", "Kreuzberg is fast"] {
    texts.push(value: sample.intoRustString())
}

// `config` stands for a fully-constructed EmbeddingConfig built via the
// generated initializer in RustBridge.
let embeddings = try embedTexts(texts, config)
print(embeddings.toString())
|
||||
```
|
||||
21
docs/snippets/swift/utils/token_reduction.md
Normal file
21
docs/snippets/swift/utils/token_reduction.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Token reduction in "moderate" mode, preserving markdown and code.
let reductionConfigJson = """
{
  "token_reduction": {
    "mode": "moderate",
    "preserve_markdown": true,
    "preserve_code": true,
    "language_hint": "eng"
  }
}
"""

let extractionConfig = try extractionConfigFromJson(reductionConfigJson)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)

let reducedText = extraction.content().toString()
print("Reduced content length: \(reducedText.count)")
|
||||
```
|
||||
23
docs/snippets/swift/utils/token_reduction_example.md
Normal file
23
docs/snippets/swift/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Token reduction in "moderate" mode, preserving markdown.
let reductionConfigJson = """
{
  "token_reduction": {
    "mode": "moderate",
    "preserve_markdown": true
  }
}
"""

let extractionConfig = try extractionConfigFromJson(reductionConfigJson)
let extraction = try extractFileSync("verbose_document.pdf", nil, extractionConfig)

let reducedText = extraction.content().toString()
print("Reduced content length: \(reducedText.count)")

// Print each processing warning with its source and message.
for issue in extraction.processing_warnings() {
    let origin = issue.source().toString()
    let detail = issue.message().toString()
    print("Warning [\(origin)]: \(detail)")
}
|
||||
```
|
||||
51
docs/snippets/swift/utils/vector_database_integration.md
Normal file
51
docs/snippets/swift/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,51 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
/// One row destined for a vector database: a chunk of text, its embedding,
/// and string metadata describing where it came from.
struct VectorRecord {
    let id: String
    let content: String
    let embedding: [Float]
    let metadata: [String: String]
}

/// Extracts a document with chunking and embeddings enabled, and converts
/// every chunk that carries an embedding into a `VectorRecord`.
///
/// - Parameters:
///   - documentPath: Path of the file passed to `extractFileSync`.
///   - documentId: Identifier used in record IDs and record metadata.
/// - Returns: One record per embedded chunk; empty when the result has no chunks.
/// - Throws: Whatever `extractionConfigFromJson` or `extractFileSync` throws.
func extractAndVectorize(documentPath: String, documentId: String) throws -> [VectorRecord] {
    let configJson = """
    {
      "chunking": {
        "max_characters": 512,
        "overlap": 50,
        "embedding": {
          "model": {"preset": {"name": "balanced"}},
          "normalize": true,
          "batch_size": 32
        }
      }
    }
    """

    let config = try extractionConfigFromJson(configJson)
    let result = try extractFileSync(documentPath, nil, config)

    guard let chunks = result.chunks() else {
        return []
    }

    // The enumeration position, not the chunk's own metadata, supplies the
    // index; chunks without an embedding are dropped but still consume one.
    return chunks.enumerated().compactMap { (position, chunk) -> VectorRecord? in
        guard let vector = chunk.embedding() else { return nil }
        let text = chunk.content().toString()
        return VectorRecord(
            id: "\(documentId)_chunk_\(position)",
            content: text,
            embedding: Array(vector),
            metadata: [
                "document_id": documentId,
                "chunk_index": String(position),
                "content_length": String(text.count),
            ]
        )
    }
}
|
||||
```
|
||||
Reference in New Issue
Block a user