Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Enable chunking together with page extraction.
let settingsJSON = """
{
"chunking": {
"max_characters": 500,
"overlap": 50
},
"pages": {
"extract_pages": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(settingsJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
if case let pieces? = extraction.chunks() {
    for piece in pieces {
        let info = piece.metadata()
        let snippet = String(piece.content().toString().prefix(50))
        // Both page bounds must be present to print a page range.
        guard let firstPage = info.first_page(), let lastPage = info.last_page() else {
            print("Chunk: \(snippet)... (no page info)")
            continue
        }
        let pageLabel: String
        if firstPage == lastPage {
            pageLabel = "Page \(firstPage)"
        } else {
            pageLabel = "Pages \(firstPage)-\(lastPage)"
        }
        print("Chunk: \(snippet)... (\(pageLabel))")
    }
}
```

View File

@@ -0,0 +1,25 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Chunking-only configuration.
let chunkingJSON = """
{
"chunking": {
"max_characters": 1000,
"overlap": 200
}
}
"""
let extractionConfig = try extractionConfigFromJson(chunkingJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
if let pieces = extraction.chunks() {
    print("Chunks: \(pieces.count)")
    pieces.forEach { piece in
        let info = piece.metadata()
        // `chunk_index()` is shifted by one for display.
        let ordinal = info.chunk_index() + 1
        print("Chunk \(ordinal)/\(info.total_chunks())")
    }
}
```

View File

@@ -0,0 +1,35 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Chunking with an embedding model preset configured per chunk.
let embeddingJSON = """
{
"chunking": {
"max_characters": 500,
"overlap": 50,
"embedding": {
"model": {"preset": {"name": "balanced"}},
"normalize": true
}
}
}
"""
let extractionConfig = try extractionConfigFromJson(embeddingJSON)
let extraction = try extractFileSync("research_paper.pdf", nil, extractionConfig)
if case let pieces? = extraction.chunks() {
    for piece in pieces {
        let info = piece.metadata()
        let text = piece.content().toString()
        let ordinal = info.chunk_index() + 1
        print("Chunk \(ordinal)/\(info.total_chunks())")
        print("Position: \(info.byte_start())-\(info.byte_end())")
        print("Content: \(text.prefix(100))...")
        // Only chunks that carry an embedding report its size.
        if let vector = piece.embedding() {
            print("Embedding: \(vector.count) dimensions")
        }
    }
}
```

View File

@@ -0,0 +1,32 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Chunking with embedding generation, including batch size and progress options.
let embeddingJSON = """
{
"chunking": {
"max_characters": 1024,
"overlap": 100,
"embedding": {
"model": {"preset": {"name": "balanced"}},
"normalize": true,
"batch_size": 32,
"show_download_progress": false
}
}
}
"""
let extractionConfig = try extractionConfigFromJson(embeddingJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
if let pieces = extraction.chunks() {
    print("Generated \(pieces.count) chunks")
    for piece in pieces {
        // Chunks without an embedding are skipped silently.
        guard let vector = piece.embedding() else { continue }
        print("Chunk \(piece.metadata().chunk_index()) -> \(vector.count)-dim embedding")
    }
}
```

View File

@@ -0,0 +1,24 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Keyword extraction settings.
let keywordJSON = """
{
"keywords": {
"algorithm": "yake",
"max_keywords": 10,
"min_score": 0.3,
"ngram_range": [1, 3],
"language": "en"
}
}
"""
let extractionConfig = try extractionConfigFromJson(keywordJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
// Report the count only when the result carries keywords.
if case let found? = extraction.extracted_keywords() {
    print("Extracted \(found.count) keywords")
}
```

View File

@@ -0,0 +1,26 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Keyword extraction settings.
let keywordJSON = """
{
"keywords": {
"algorithm": "yake",
"max_keywords": 10,
"min_score": 0.3
}
}
"""
let extractionConfig = try extractionConfigFromJson(keywordJSON)
let extraction = try extractFileSync("research_paper.pdf", nil, extractionConfig)
if let found = extraction.extracted_keywords() {
    // Print each keyword's text next to its score.
    found.forEach { entry in
        print("\(entry.text().toString()) (score: \(entry.score()))")
    }
}
```

View File

@@ -0,0 +1,23 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Language detection settings (`detect_multiple` is off).
let detectionJSON = """
{
"language_detection": {
"enabled": true,
"min_confidence": 0.8,
"detect_multiple": false
}
}
"""
let extractionConfig = try extractionConfigFromJson(detectionJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
if case let detected? = extraction.detected_languages() {
    // Convert the bridged strings to native Swift strings before printing.
    var codes: [String] = []
    for language in detected {
        codes.append(language.toString())
    }
    print("Detected: \(codes)")
}
```

View File

@@ -0,0 +1,23 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Language detection settings (`detect_multiple` is on).
let detectionJSON = """
{
"language_detection": {
"enabled": true,
"min_confidence": 0.8,
"detect_multiple": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(detectionJSON)
let extraction = try extractFileSync("multilingual_document.pdf", nil, extractionConfig)
if let detected = extraction.detected_languages() {
    print("Detected languages: \(detected.map { $0.toString() })")
}
```

View File

@@ -0,0 +1,18 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Turn on quality processing.
let qualityJSON = """
{
"enable_quality_processing": true
}
"""
let extractionConfig = try extractionConfigFromJson(qualityJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
// The score is optional; print it with two decimals when present.
if case let quality? = extraction.quality_score() {
    let line = String(format: "Quality score: %.2f", quality)
    print(line)
}
```

View File

@@ -0,0 +1,22 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Turn on quality processing.
let qualityJSON = """
{
"enable_quality_processing": true
}
"""
let extractionConfig = try extractionConfigFromJson(qualityJSON)
let extraction = try extractFileSync("scanned_document.pdf", nil, extractionConfig)
if let quality = extraction.quality_score() {
    // Scores under 0.5 are reported as a warning instead of a plain score.
    let template = quality < 0.5
        ? "Warning: Low quality extraction (%.2f)"
        : "Quality score: %.2f"
    print(String(format: template, quality))
}
```

View File

@@ -0,0 +1,21 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Token reduction settings.
let reductionJSON = """
{
"token_reduction": {
"mode": "moderate",
"preserve_markdown": true,
"preserve_code": true,
"language_hint": "eng"
}
}
"""
let extractionConfig = try extractionConfigFromJson(reductionJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let reducedText = extraction.content().toString()
print("Reduced content length: \(reducedText.count)")
```

View File

@@ -0,0 +1,23 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Token reduction settings.
let reductionJSON = """
{
"token_reduction": {
"mode": "moderate",
"preserve_markdown": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(reductionJSON)
let extraction = try extractFileSync("verbose_document.pdf", nil, extractionConfig)
print("Reduced content length: \(extraction.content().toString().count)")
// Print every processing warning with its source and message.
extraction.processing_warnings().forEach { warning in
    let origin = warning.source().toString()
    let detail = warning.message().toString()
    print("Warning [\(origin)]: \(detail)")
}
```

View File

@@ -0,0 +1,51 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
/// One chunk's text, its embedding vector and string metadata, keyed by an identifier.
struct VectorRecord {
    /// Identifier of this record.
    let id: String
    /// Text content of the chunk.
    let content: String
    /// Embedding values for `content`.
    let embedding: [Float]
    /// Additional string key/value pairs attached to the record.
    let metadata: [String: String]
}
/// Extracts `documentPath` with chunking and embeddings enabled and returns
/// one `VectorRecord` per chunk that carries an embedding.
///
/// - Parameters:
///   - documentPath: Path of the file handed to `extractFileSync`.
///   - documentId: Identifier used in each record's `id` and metadata.
/// - Returns: Records for the embedded chunks; empty when the result has no chunks.
/// - Throws: Whatever `extractionConfigFromJson` or `extractFileSync` throw.
func extractAndVectorize(documentPath: String, documentId: String) throws -> [VectorRecord] {
    let settingsJSON = """
    {
    "chunking": {
    "max_characters": 512,
    "overlap": 50,
    "embedding": {
    "model": {"preset": {"name": "balanced"}},
    "normalize": true,
    "batch_size": 32
    }
    }
    }
    """
    let extractionConfig = try extractionConfigFromJson(settingsJSON)
    let extraction = try extractFileSync(documentPath, nil, extractionConfig)
    guard let pieces = extraction.chunks() else {
        return []
    }
    // The enumeration offset counts every chunk, so skipped chunks leave gaps
    // in the record numbering.
    return pieces.enumerated().compactMap { position, piece -> VectorRecord? in
        guard let vector = piece.embedding() else { return nil }
        let text = piece.content().toString()
        return VectorRecord(
            id: "\(documentId)_chunk_\(position)",
            content: text,
            embedding: vector.map { $0 },
            metadata: [
                "document_id": documentId,
                "chunk_index": String(position),
                "content_length": String(text.count),
            ]
        )
    }
}
```

View File

@@ -0,0 +1,31 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// `BatchBytesItem` is an opaque swift-bridge class with no public Swift
// constructor — build items from JSON via `batchBytesItemFromJson`.
// `content` must be encoded as a JSON byte array.
/// Renders `bytes` as a JSON array of decimal numbers with no spaces,
/// e.g. `[72,105]`; an empty input yields `[]`.
func encodeBytesAsJsonArray(_ bytes: [UInt8]) -> String {
    var numbers: [String] = []
    numbers.reserveCapacity(bytes.count)
    for byte in bytes {
        numbers.append(String(byte))
    }
    return "[\(numbers.joined(separator: ","))]"
}
// Each input is turned into a `BatchBytesItem` by encoding its UTF-8 bytes
// as a JSON array and pairing them with a MIME type.
let inputs: [(text: String, mimeType: String)] = [
    ("Hello, world!", "text/plain"),
    ("# Heading\n\nParagraph text.", "text/markdown"),
]
let batch = RustVec<BatchBytesItem>()
for input in inputs {
    let encodedBytes = encodeBytesAsJsonArray(Array(input.text.utf8))
    let itemJSON = "{\"content\": \(encodedBytes), \"mime_type\": \"\(input.mimeType)\"}"
    batch.push(value: try batchBytesItemFromJson(itemJSON))
}
let batchConfig = try extractionConfigFromJson("{}")
let extractions = try batchExtractBytesSync(batch, batchConfig)
extractions.enumerated().forEach { entry in
    let length = entry.element.content().toString().count
    print("Item \(entry.offset): \(length) chars")
}
```

View File

@@ -0,0 +1,20 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// `BatchFileItem` is an opaque swift-bridge class with no public Swift
// constructor — build items from JSON via `batchFileItemFromJson`.
// The JSON is produced with `JSONEncoder` rather than string interpolation so
// that paths containing quotes or backslashes are escaped correctly.
let encoder = JSONEncoder()
let items = RustVec<BatchFileItem>()
for path in ["doc1.pdf", "doc2.docx", "report.pdf"] {
    let payload = try encoder.encode(["path": path])
    let json = String(decoding: payload, as: UTF8.self)
    items.push(value: try batchFileItemFromJson(json))
}
let config = try extractionConfigFromJson("{}")
let results = try batchExtractFilesSync(items, config)
// Results are reported by their position in the returned collection.
for (index, result) in results.enumerated() {
    print("File \(index): \(result.content().toString().count) chars")
}
```

View File

@@ -0,0 +1,44 @@
```swift title="Swift"
import Foundation
#if canImport(FoundationNetworking)
import FoundationNetworking
#endif
/// Posts a text payload to the local `/chunk` endpoint and prints the chunks
/// found in the JSON response.
@main
struct App {
    static func main() async throws {
        let chunkerOptions: [String: Any] = [
            "max_characters": 1000,
            "overlap": 50,
            "trim": true,
        ]
        let requestBody: [String: Any] = [
            "text": "Your long text content here...",
            "chunker_type": "text",
            "config": chunkerOptions,
        ]

        var request = URLRequest(url: URL(string: "http://localhost:8000/chunk")!)
        request.httpMethod = "POST"
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
        request.httpBody = try JSONSerialization.data(withJSONObject: requestBody)

        let (responseData, response) = try await URLSession.shared.data(for: request)
        // Anything other than an HTTP 2xx response is treated as a failure.
        guard let http = response as? HTTPURLResponse, 200...299 ~= http.statusCode else {
            throw NSError(domain: "kreuzberg", code: 1)
        }

        // A response that is not a JSON object is handled as an empty one.
        let decoded = try JSONSerialization.jsonObject(with: responseData)
        let reply = decoded as? [String: Any] ?? [:]
        let total = reply["chunk_count"] as? Int ?? 0
        print("Created \(total) chunks")

        guard let chunks = reply["chunks"] as? [[String: Any]] else {
            return
        }
        for chunk in chunks {
            let text = chunk["content"] as? String ?? ""
            let position = chunk["chunk_index"] as? Int ?? 0
            print("Chunk \(position): \(text.prefix(50))...")
        }
    }
}
```

View File

@@ -0,0 +1,42 @@
```swift title="Swift"
import Foundation
#if canImport(FoundationNetworking)
import FoundationNetworking
#endif
/// Uploads `document.pdf` to the local `/extract` endpoint as a
/// `multipart/form-data` request and prints the `content` field of the reply.
@main
struct App {
    static func main() async throws {
        let fileURL = URL(fileURLWithPath: "document.pdf")
        let fileData = try Data(contentsOf: fileURL)
        let fileName = fileURL.lastPathComponent
        // A fresh UUID is used as the multipart boundary.
        let boundary = "Boundary-\(UUID().uuidString)"

        var request = URLRequest(url: URL(string: "http://localhost:8000/extract")!)
        request.httpMethod = "POST"
        request.setValue(
            "multipart/form-data; boundary=\(boundary)",
            forHTTPHeaderField: "Content-Type"
        )

        // `Data(_.utf8)` cannot fail, unlike `data(using:)`, so no force-unwraps
        // are needed while assembling the body.
        var body = Data()
        body.append(Data("--\(boundary)\r\n".utf8))
        body.append(
            Data("Content-Disposition: form-data; name=\"file\"; filename=\"\(fileName)\"\r\n".utf8)
        )
        body.append(Data("Content-Type: application/pdf\r\n\r\n".utf8))
        body.append(fileData)
        body.append(Data("\r\n--\(boundary)--\r\n".utf8))
        request.httpBody = body

        let (data, response) = try await URLSession.shared.data(for: request)
        // Anything other than an HTTP 2xx response is treated as a failure.
        guard let http = response as? HTTPURLResponse, (200..<300).contains(http.statusCode) else {
            throw NSError(domain: "kreuzberg", code: 1)
        }
        // A response that is not a JSON object is handled as an empty one.
        let json = try JSONSerialization.jsonObject(with: data) as? [String: Any] ?? [:]
        print(json["content"] as? String ?? "")
    }
}
```

View File

@@ -0,0 +1,53 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// A feature-rich `ExtractionConfig` is described in JSON here: the opaque
// swift-bridge initializer takes 30+ positional parameters, which makes JSON
// the more ergonomic route for non-trivial configurations.
let settingsJSON = """
{
"use_cache": true,
"enable_quality_processing": true,
"ocr": {
"backend": "tesseract",
"language": "eng"
},
"force_ocr": false,
"chunking": {
"max_characters": 800,
"overlap": 100,
"chunker_type": "markdown",
"prepend_heading_context": true
},
"images": {
"extract_images": true
},
"output_format": "markdown",
"include_document_structure": true
}
"""
let extractionConfig = try extractionConfigFromJson(settingsJSON)
let extraction = try extractFileSync("report.pdf", nil, extractionConfig)

// Print the total length followed by the first 200 characters.
let text = extraction.content().toString()
print("Content (\(text.count) chars):")
print(String(text.prefix(200)))

if case let pieces? = extraction.chunks() {
    print("\nChunks: \(pieces.count)")
}
print("Tables: \(extraction.tables().count)")
if case let detected? = extraction.detected_languages() {
    print("Languages: \(detected.map { $0.toString() })")
}
if case let method? = extraction.extraction_method() {
    print("Extraction method: \(method)")
}
```

View File

@@ -0,0 +1,31 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// The Swift binding throws `RustString` (not `KreuzbergError`) for every
// failure surfaced from the Rust core. The string preserves the original
// error variant name and message (e.g. "UnsupportedFormat: ...",
// "MissingDependency: ...", "Parsing: ...") so callers can pattern-match
// on the prefix or simply print the message.
//
// Matching uses `hasPrefix` rather than `contains`: a short token such as
// "Io" could otherwise match text anywhere inside an unrelated message.
do {
    let config = try extractionConfigFromJson("{}")
    let result = try extractFileSync("document.pdf", nil, config)
    print(result.content().toString())
} catch let error as RustString {
    let message = error.toString()
    if message.hasPrefix("UnsupportedFormat") {
        print("Unsupported format: \(message)")
    } else if message.hasPrefix("MissingDependency") {
        print("Install the required dependency: \(message)")
    } else if message.hasPrefix("Parsing") {
        print("Corrupt or invalid document: \(message)")
    } else if message.hasPrefix("Io") {
        print("File error: \(message)")
    } else {
        // Any other variant, or a message in an unexpected shape, lands here.
        print("Extraction failed: \(message)")
    }
} catch {
    // Errors that are not `RustString` values.
    print("Unexpected error: \(error)")
}
```

View File

@@ -0,0 +1,32 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
/// Extracts text from in-memory `bytes` using an empty (`{}`) configuration.
///
/// - Parameters:
///   - bytes: Raw document bytes, copied into a `RustVec<UInt8>` for the bridge.
///   - mimeType: MIME type passed through to `extractBytesSync`.
/// - Returns: The extracted content as a Swift `String`.
/// - Throws: Whatever `extractionConfigFromJson` or `extractBytesSync` throw.
func extractText(bytes: [UInt8], mimeType: String) throws -> String {
    let buffer = RustVec<UInt8>()
    bytes.forEach { buffer.push(value: $0) }
    let defaultConfig = try extractionConfigFromJson("{}")
    return try extractBytesSync(buffer, mimeType, defaultConfig).content().toString()
}
do {
    // The file is read inside the `do` block so that a missing or unreadable
    // file is reported by the catch-all below, instead of being swallowed and
    // replaced with empty data that is then handed to the extractor.
    let data = try Data(contentsOf: URL(fileURLWithPath: "document.pdf"))
    let text = try extractText(bytes: Array(data), mimeType: "application/pdf")
    print("Extracted \(text.count) chars")
} catch let error as RustString {
    // Failures from the binding arrive as `RustString` messages.
    let message = error.toString()
    if message.contains("UnsupportedFormat") {
        print("Format not supported: \(message)")
    } else if message.contains("Ocr") {
        print("OCR failed: \(message)")
    } else {
        print("Error: \(message)")
    }
} catch {
    // Everything else, including file-read errors from `Data(contentsOf:)`.
    print("Unexpected error: \(error)")
}
```

View File

@@ -0,0 +1,20 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
/// Reads `document.pdf` into memory and extracts it through the async
/// `extractBytes` entry point.
@main
struct App {
    static func main() async throws {
        let fileBytes = try Data(contentsOf: URL(fileURLWithPath: "document.pdf"))
        // Copy the bytes into the bridge's vector type.
        let buffer = RustVec<UInt8>()
        fileBytes.forEach { buffer.push(value: $0) }

        let defaultConfig = try extractionConfigFromJson("{}")
        let extraction = try await extractBytes(buffer, "application/pdf", defaultConfig)
        print(extraction.content().toString())
        print("Tables: \(extraction.tables().count)")
    }
}
```

View File

@@ -0,0 +1,15 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Load the file and copy its bytes into the bridge's vector type.
let fileBytes = try Data(contentsOf: URL(fileURLWithPath: "document.pdf"))
let buffer = RustVec<UInt8>()
fileBytes.forEach { buffer.push(value: $0) }

let defaultConfig = try extractionConfigFromJson("{}")
let extraction = try extractBytesSync(buffer, "application/pdf", defaultConfig)
print(extraction.content().toString())
print("Tables: \(extraction.tables().count)")
```

View File

@@ -0,0 +1,20 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
/// Extracts `document.pdf` through the async `extractFile` entry point.
@main
struct App {
    static func main() async throws {
        let defaultConfig = try extractionConfigFromJson("{}")
        // The binding's entry points are async-compatible: the bridge calls
        // are synchronous internally, yet they may be awaited so they fit
        // into Swift Concurrency code.
        let extraction = try await extractFile("document.pdf", nil, defaultConfig)
        let text = extraction.content().toString()
        print(text)
        let mimeType = extraction.mime_type().toString()
        print("MIME type: \(mimeType)")
        print("Tables: \(extraction.tables().count)")
    }
}
```

View File

@@ -0,0 +1,12 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Extract with an empty (`{}`) configuration and print the basics.
let defaultConfig = try extractionConfigFromJson("{}")
let extraction = try extractFileSync("document.pdf", nil, defaultConfig)
let text = extraction.content().toString()
print(text)
let mimeType = extraction.mime_type().toString()
print("MIME type: \(mimeType)")
let tableCount = extraction.tables().count
print("Tables: \(tableCount)")
```

View File

@@ -0,0 +1,59 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// A feature-rich `ExtractionConfig` described in JSON. With 30+ fields on
// ExtractionConfig, JSON is the ergonomic route for non-trivial configs.
let settingsJSON = """
{
"use_cache": true,
"enable_quality_processing": true,
"ocr": {
"backend": "tesseract",
"language": "eng"
},
"chunking": {
"max_characters": 1000,
"overlap": 200,
"embedding": {
"model": {"preset": {"name": "balanced"}},
"batch_size": 32,
"normalize": true,
"show_download_progress": false
}
},
"language_detection": {
"enabled": true,
"min_confidence": 0.8,
"detect_multiple": false
},
"keywords": {
"algorithm": "yake",
"max_keywords": 10,
"min_score": 0.1,
"ngram_range": [1, 3],
"language": "en"
},
"token_reduction": {
"mode": "moderate",
"preserve_important_words": true
},
"postprocessor": {
"enabled": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(settingsJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print("Content: \(text)")
// Languages and chunks are optional on the result; print them when present.
if case let detected? = extraction.detected_languages() {
    print("Languages: \(detected.map { $0.toString() })")
}
if case let pieces? = extraction.chunks() {
    print("Chunks: \(pieces.count)")
}
```

View File

@@ -0,0 +1,27 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Chunking with the `markdown` chunker type and heading context enabled.
let chunkingJSON = """
{
"chunking": {
"max_characters": 1000,
"overlap": 100,
"chunker_type": "markdown",
"prepend_heading_context": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(chunkingJSON)
let extraction = try extractFileSync("document.md", nil, extractionConfig)
if case let pieces? = extraction.chunks() {
    print("Chunks: \(pieces.count)")
    // Report the character count of every chunk.
    pieces.forEach { piece in
        print("Length: \(piece.content().toString().count)")
    }
}
```

View File

@@ -0,0 +1,17 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Enable the cache and quality processing.
let settingsJSON = """
{
"use_cache": true,
"enable_quality_processing": true
}
"""
let extractionConfig = try extractionConfigFromJson(settingsJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print(text)
```

View File

@@ -0,0 +1,21 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Configs are built from JSON in the Swift bindings. To honor an on-disk
// `kreuzberg.{toml,yaml,json}`, read the file and hand its JSON form to
// `extractionConfigFromJson`; other formats can be converted to JSON by the
// caller first.
let onDiskJSON = (try? Data(contentsOf: URL(fileURLWithPath: "kreuzberg.json")))
    .flatMap { String(data: $0, encoding: .utf8) }
// An unreadable or non-UTF-8 file falls back to an empty configuration.
let settingsJSON = onDiskJSON ?? "{}"
let extractionConfig = try extractionConfigFromJson(settingsJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
print(extraction.content().toString())
```

View File

@@ -0,0 +1,20 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// OCR settings: tesseract backend, language `eng`.
let ocrJSON = """
{
"ocr": {
"backend": "tesseract",
"language": "eng"
}
}
"""
let extractionConfig = try extractionConfigFromJson(ocrJSON)
let extraction = try extractFileSync("scanned.pdf", nil, extractionConfig)
let textLength = extraction.content().toString().count
print("Content length: \(textLength)")
let tableCount = extraction.tables().count
print("Tables detected: \(tableCount)")
```

View File

@@ -0,0 +1,28 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Cache, quality processing, OCR and chunking combined in one configuration.
let settingsJSON = """
{
"use_cache": true,
"enable_quality_processing": true,
"ocr": {
"backend": "tesseract",
"language": "eng+deu",
"tesseract_config": {
"psm": 6
}
},
"chunking": {
"max_characters": 1000,
"overlap": 200
}
}
"""
let extractionConfig = try extractionConfigFromJson(settingsJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print("Content length: \(text.count)")
```

View File

@@ -0,0 +1,18 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Ask for the document structure to be included in the result.
let structureJSON = """
{
"include_document_structure": true
}
"""
let extractionConfig = try extractionConfigFromJson(structureJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
// The document is optional on the result; report its node count when present.
if case let tree? = extraction.document() {
    let nodeCount = tree.nodes().count
    print("Document nodes: \(nodeCount)")
}
```

View File

@@ -0,0 +1,22 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Request the `element_based` result format.
let formatJSON = """
{
"result_format": "element_based"
}
"""
let extractionConfig = try extractionConfigFromJson(formatJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
if case let items? = extraction.elements() {
    print("Elements: \(items.count)")
    for item in items {
        let kind = item.element_type().toString()
        print("Type: \(kind)")
        // Only the first 100 characters of the text are printed.
        let text = item.text().toString()
        print("Text: \(text.prefix(100))")
    }
}
```

View File

@@ -0,0 +1,27 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Chunking with embedding generation configured.
let embeddingJSON = """
{
"chunking": {
"max_characters": 1000,
"overlap": 200,
"embedding": {
"model": {"preset": {"name": "balanced"}},
"batch_size": 16,
"normalize": true,
"show_download_progress": true
}
}
}
"""
let extractionConfig = try extractionConfigFromJson(embeddingJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
if case let pieces? = extraction.chunks() {
    print("Chunks with embeddings: \(pieces.count)")
}
```

View File

@@ -0,0 +1,19 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// HTML output with the `github` theme.
let htmlJSON = """
{
"output_format": "html",
"html_output": {
"theme": "github"
}
}
"""
let extractionConfig = try extractionConfigFromJson(htmlJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
// HTML with kb-* classes
let html = extraction.content().toString()
print(html)
```

View File

@@ -0,0 +1,23 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Keyword extraction settings.
let keywordJSON = """
{
"keywords": {
"algorithm": "yake",
"max_keywords": 10,
"min_score": 0.1,
"ngram_range": [1, 3],
"language": "en"
}
}
"""
let extractionConfig = try extractionConfigFromJson(keywordJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
print("Keywords extracted from document")
let textLength = extraction.content().toString().count
print("Content length: \(textLength)")
```

View File

@@ -0,0 +1,23 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Language detection settings (`detect_multiple` is on).
let detectionJSON = """
{
"language_detection": {
"enabled": true,
"min_confidence": 0.8,
"detect_multiple": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(detectionJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
if case let detected? = extraction.detected_languages() {
    // Convert the bridged strings to native Swift strings before printing.
    var codes: [String] = []
    for language in detected {
        codes.append(language.toString())
    }
    print("Detected languages: \(codes)")
}
```

View File

@@ -0,0 +1,23 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Image extraction settings, including DPI limits.
let imageJSON = """
{
"images": {
"extract_images": true,
"target_dpi": 300,
"max_image_dimension": 4096,
"auto_adjust_dpi": true,
"min_dpi": 150,
"max_dpi": 600
}
}
"""
let extractionConfig = try extractionConfigFromJson(imageJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print("Content length: \(text.count)")
```

View File

@@ -0,0 +1,20 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// PDF options: image and metadata extraction plus a password list.
let pdfJSON = """
{
"pdf_options": {
"extract_images": true,
"passwords": ["password123"],
"extract_metadata": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(pdfJSON)
let extraction = try extractFileSync("encrypted.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print("Content length: \(text.count)")
```

View File

@@ -0,0 +1,24 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// PDF hierarchy settings.
let hierarchyJSON = """
{
"pdf_options": {
"hierarchy": {
"enabled": true,
"detection_threshold": 0.75,
"ocr_coverage_threshold": 0.8,
"min_level": 1,
"max_level": 5
}
}
}
"""
let extractionConfig = try extractionConfigFromJson(hierarchyJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print("Content length: \(text.count)")
```

View File

@@ -0,0 +1,19 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Enable the postprocessor with two named processors.
let postprocessorJSON = """
{
"postprocessor": {
"enabled": true,
"enabled_processors": ["whitespace_normalizer", "unicode_normalizer"]
}
}
"""
let extractionConfig = try extractionConfigFromJson(postprocessorJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let processedText = extraction.content().toString()
print("Processed content: \(processedText)")
```

View File

@@ -0,0 +1,18 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Enable quality processing and the cache.
let settingsJSON = """
{
"enable_quality_processing": true,
"use_cache": true
}
"""
let extractionConfig = try extractionConfigFromJson(settingsJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let textLength = extraction.content().toString().count
print("Content length: \(textLength)")
let tableCount = extraction.tables().count
print("Tables: \(tableCount)")
```

View File

@@ -0,0 +1,23 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// OCR settings with tesseract-specific `psm` and `oem` values.
let ocrJSON = """
{
"ocr": {
"backend": "tesseract",
"language": "eng+deu",
"tesseract_config": {
"psm": 6,
"oem": 3
}
}
}
"""
let extractionConfig = try extractionConfigFromJson(ocrJSON)
let extraction = try extractFileSync("scanned.pdf", nil, extractionConfig)
let recognizedText = extraction.content().toString()
print("OCR text: \(recognizedText)")
```

View File

@@ -0,0 +1,19 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Token reduction settings.
let reductionJSON = """
{
"token_reduction": {
"mode": "moderate",
"preserve_important_words": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(reductionJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let reducedText = extraction.content().toString()
print("Reduced content length: \(reducedText.count)")
```

View File

@@ -0,0 +1,11 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Extract with an empty (`{}`) configuration.
let defaultConfig = try extractionConfigFromJson("{}")
let extraction = try extractFileSync("document.pdf", nil, defaultConfig)
let text = extraction.content().toString()
print(text)
let mimeType = extraction.mime_type().toString()
print("MIME type: \(mimeType)")
```

View File

@@ -0,0 +1,12 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Extract with an empty (`{}`) configuration and print content, MIME type
// and table count.
let defaultConfig = try extractionConfigFromJson("{}")
let extraction = try extractFileSync("document.pdf", nil, defaultConfig)
let text = extraction.content().toString()
print("Content: \(text)")
let mimeType = extraction.mime_type().toString()
print("MIME type: \(mimeType)")
let tableCount = extraction.tables().count
print("Tables: \(tableCount)")
```

View File

@@ -0,0 +1,21 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Set `force_ocr` and select the tesseract backend.
let ocrJSON = """
{
"force_ocr": true,
"ocr": {
"backend": "tesseract",
"language": "eng"
}
}
"""
let extractionConfig = try extractionConfigFromJson(ocrJSON)
let extraction = try extractFileSync("scanned.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print(text)
let mimeType = extraction.mime_type().toString()
print("MIME type: \(mimeType)")
```

View File

@@ -0,0 +1,12 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Greeting first, then an extraction with an empty (`{}`) configuration.
print("Hello")
let defaultConfig = try extractionConfigFromJson("{}")
let extraction = try extractFileSync("document.pdf", nil, defaultConfig)
let mimeType = extraction.mime_type().toString()
print("MIME type: \(mimeType)")
```

View File

@@ -0,0 +1,9 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Building a config from `{}` exercises the binding without touching a file.
let defaultConfig = try extractionConfigFromJson("{}")
print("Kreuzberg Swift binding loaded successfully")
print("Default config built: \(defaultConfig)")
```

View File

@@ -0,0 +1,31 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Chunking with the `markdown` chunker type.
let chunkingJSON = """
{
"chunking": {
"max_characters": 800,
"overlap": 100,
"chunker_type": "markdown"
}
}
"""
let extractionConfig = try extractionConfigFromJson(chunkingJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)

// List tables by position; only the offsets are printed.
let foundTables = extraction.tables()
print("Tables: \(foundTables.count)")
foundTables.enumerated().forEach { print("Table \($0.offset)") }

// Same listing for chunks, when the result carries any.
if case let pieces? = extraction.chunks() {
    print("Chunks: \(pieces.count)")
    pieces.enumerated().forEach { print("Chunk \($0.offset)") }
}
```

View File

@@ -0,0 +1,33 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Structured extraction: a JSON schema, an LLM model name and strict mode.
let structuredJSON = """
{
"structured_extraction": {
"schema": {
"type": "object",
"properties": {
"title": { "type": "string" },
"authors": { "type": "array", "items": { "type": "string" } },
"date": { "type": "string" }
},
"required": ["title", "authors", "date"],
"additionalProperties": false
},
"llm": {
"model": "openai/gpt-4o-mini"
},
"strict": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(structuredJSON)
let extraction = try extractFileSync("paper.pdf", nil, extractionConfig)
// The structured output is optional on the result.
if case let output? = extraction.structured_output() {
    print(output.toString())
}
```

View File

@@ -0,0 +1,39 @@
<!-- snippet:syntax-only -->
```swift title="Swift"
import Foundation
// Launch `kreuzberg mcp` as a child process and talk to it over stdio.
let process = Process()
process.executableURL = URL(fileURLWithPath: "/usr/bin/env")
process.arguments = ["kreuzberg", "mcp"]
let stdin = Pipe()
let stdout = Pipe()
process.standardInput = stdin
process.standardOutput = stdout
try process.run()

// JSON-RPC 2.0 requires the `jsonrpc` member, and a request without an `id`
// is a notification that never receives a response — so both are included
// to make the read below meaningful.
// NOTE(review): MCP servers normally expect an `initialize` handshake before
// `tools/call`; confirm whether this server needs it.
let request: [String: Any] = [
    "jsonrpc": "2.0",
    "id": 1,
    "method": "tools/call",
    "params": [
        "name": "extract_file",
        "arguments": [
            "path": "document.pdf",
            "async": true,
        ],
    ],
]
let payload = try JSONSerialization.data(withJSONObject: request)
stdin.fileHandleForWriting.write(payload)
// Messages are newline-delimited; `Data(_.utf8)` avoids a force-unwrap.
stdin.fileHandleForWriting.write(Data("\n".utf8))
try stdin.fileHandleForWriting.close()

// Print whatever the server has written to stdout.
let data = stdout.fileHandleForReading.availableData
if let line = String(data: data, encoding: .utf8) {
    print(line)
}
process.waitUntilExit()
```

View File

@@ -0,0 +1,15 @@
<!-- snippet:syntax-only -->
```swift title="Swift"
import Foundation
// Run the kreuzberg MCP server as a child process.
// There is no in-process MCP server in the Swift bindings, so the kreuzberg
// CLI binary is used; it provides the MCP transport over stdio.
let server = Process()
server.arguments = ["kreuzberg", "mcp"]
server.executableURL = URL(fileURLWithPath: "/usr/bin/env")
try server.run()
// Block until the server process exits.
server.waitUntilExit()
```

View File

@@ -0,0 +1,25 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Language detection settings (`detect_multiple` is off).
let detectionJSON = """
{
"language_detection": {
"enabled": true,
"min_confidence": 0.8,
"detect_multiple": false
}
}
"""
let extractionConfig = try extractionConfigFromJson(detectionJSON)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
// A missing language list is reported explicitly.
switch extraction.detected_languages() {
case let detected?:
    print("Detected languages: \(detected.map { $0.toString() })")
case nil:
    print("No languages detected")
}
```

View File

@@ -0,0 +1,23 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Language detection settings (`detect_multiple` is on).
let detectionJSON = """
{
"language_detection": {
"enabled": true,
"min_confidence": 0.8,
"detect_multiple": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(detectionJSON)
let extraction = try extractFileSync("multilingual_document.pdf", nil, extractionConfig)
if let detected = extraction.detected_languages() {
    // Convert the bridged strings to native Swift strings before printing.
    var codes: [String] = []
    for language in detected {
        codes.append(language.toString())
    }
    print("Detected languages: \(codes)")
}
```

View File

@@ -0,0 +1,43 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Extract with an empty ("{}") configuration and print whichever metadata
// fields are present. Every accessor below returns an optional, so absent
// fields are silently skipped.
let extractionConfig = try extractionConfigFromJson("{}")
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let docMetadata = extraction.metadata()

// Scalar string fields.
if let title = docMetadata.title()?.toString() {
    print("Title: \(title)")
}
if let subject = docMetadata.subject()?.toString() {
    print("Subject: \(subject)")
}
if let language = docMetadata.language()?.toString() {
    print("Language: \(language)")
}
if let createdAt = docMetadata.created_at()?.toString() {
    print("Created at: \(createdAt)")
}
if let modifiedAt = docMetadata.modified_at()?.toString() {
    print("Modified at: \(modifiedAt)")
}
if let createdBy = docMetadata.created_by()?.toString() {
    print("Created by: \(createdBy)")
}

// List-valued fields are converted to arrays of Swift strings before printing.
if let authorNames = docMetadata.authors()?.map({ $0.toString() }) {
    print("Authors: \(authorNames)")
}
if let keywordList = docMetadata.keywords()?.map({ $0.toString() }) {
    print("Keywords: \(keywordList)")
}

if let elapsedMs = docMetadata.extraction_duration_ms() {
    print("Extraction duration (ms): \(elapsedMs)")
}
if let pageStructure = docMetadata.pages() {
    print("Page count: \(pageStructure.total_count())")
}
```

View File

@@ -0,0 +1,35 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Print a short preview of the first three pages by slicing the extracted
// content with the byte offsets reported in the page boundaries.
let config = try extractionConfigFromJson("{}")
let result = try extractFileSync("document.pdf", nil, config)
let content = result.content().toString()
// The boundaries are byte offsets, so slice the UTF-8 view rather than Characters.
let utf8 = Array(content.utf8)

guard let pageStructure = result.metadata().pages() else {
    print("No page structure available")
    exit(0)
}
guard let boundaries = pageStructure.boundaries() else {
    print("No page boundaries available")
    exit(0)
}

for boundary in boundaries.prefix(3) {
    // Normalise to Int so the offsets can index the byte array whatever
    // integer type the binding returns.
    let byteStart = Int(boundary.byte_start())
    let byteEnd = Int(boundary.byte_end())

    // Clamp to the buffer so an out-of-range or inverted boundary yields an
    // empty preview instead of trapping on the slice.
    let lower = min(max(byteStart, 0), utf8.count)
    let upper = min(max(byteEnd, lower), utf8.count)

    // A slice that cuts through a multi-byte scalar is not valid UTF-8; fall
    // back to an empty preview in that case.
    let pageText = String(bytes: utf8[lower..<upper], encoding: .utf8) ?? ""
    // prefix(_:) already stops at the end of shorter strings.
    let preview = String(pageText.prefix(100))

    print("Page \(boundary.page_number()):")
    print(" Byte range: \(byteStart)-\(byteEnd)")
    print(" Preview: \(preview)...")
}
```

View File

@@ -0,0 +1,28 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Ask for per-page extraction, then summarise each page.
let pageSettings = """
{
"pages": {
"extract_pages": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(pageSettings)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)

switch extraction.pages() {
case .some(let extractedPages):
    for page in extractedPages {
        let text = page.content().toString()
        print("Page \(page.page_number()):")
        print(" Content: \(text.count) chars")
        print(" Tables: \(page.tables().count)")
        print(" Images: \(page.images().count)")
    }
case .none:
    print("No per-page content available")
}
```

View File

@@ -0,0 +1,20 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Print every extracted table as markdown, with its bounding box when present.
let extractionConfig = try extractionConfigFromJson("{}")
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let extractedTables = extraction.tables()
print("Tables: \(extractedTables.count)")

// Zero-based running position of the table within the result.
var position = 0
for table in extractedTables {
    print("Table \(position) on page \(table.page_number())")
    let markdown = table.markdown().toString()
    print(markdown)
    if let box = table.bounding_box() {
        print(" Bounding box: \(box.toString())")
    }
    position += 1
}
```

View File

@@ -0,0 +1,55 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
/// One chunk of a document, ready to be inserted into a vector store.
struct VectorRecord {
    let id: String
    let content: String
    let embedding: [Float]
    let metadata: [String: String]
}

// Chunking with embeddings enabled.
let embeddingSettings = """
{
"chunking": {
"max_characters": 512,
"overlap": 50,
"embedding": {
"model": {"preset": {"name": "balanced"}},
"batch_size": 32,
"normalize": true
}
}
}
"""
let documentId = "doc_001"
let extractionConfig = try extractionConfigFromJson(embeddingSettings)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)

var records: [VectorRecord] = []
if let chunks = extraction.chunks() {
    for (position, chunk) in chunks.enumerated() {
        // Chunks without an embedding are skipped.
        if let vector = chunk.embedding() {
            let text = chunk.content().toString()
            let record = VectorRecord(
                id: "\(documentId)_chunk_\(position)",
                content: text,
                embedding: Array(vector),
                metadata: [
                    "document_id": documentId,
                    "chunk_index": String(position),
                    "content_length": String(text.count),
                ]
            )
            records.append(record)
        }
    }
}
print("Generated \(records.count) vector records")
```

View File

@@ -0,0 +1,22 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Custom and cloud OCR backends are registered through the Rust plugin system.
// Swift code selects an already-registered backend by name in the JSON
// configuration:
let ocrSettings = """
{
"ocr": {
"backend": "custom",
"language": "eng"
}
}
"""
let extractionConfig = try extractionConfigFromJson(ocrSettings)
let extraction = try extractFileSync("scanned.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print(text)
```

View File

@@ -0,0 +1,18 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Turn on image extraction and print the extracted text content.
let imageSettings = """
{
"images": {
"extract_images": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(imageSettings)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print(text)
```

View File

@@ -0,0 +1,20 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Image extraction with an explicit target DPI and a cap on image dimension.
let imageSettings = """
{
"images": {
"extract_images": true,
"target_dpi": 300,
"max_image_dimension": 2000
}
}
"""
let extractionConfig = try extractionConfigFromJson(imageSettings)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print(text)
```

View File

@@ -0,0 +1,19 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Select the "easyocr" backend by name in the OCR section of the configuration.
let ocrSettings = """
{
"ocr": {
"backend": "easyocr",
"language": "en"
}
}
"""
let extractionConfig = try extractionConfigFromJson(ocrSettings)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print(text)
```

View File

@@ -0,0 +1,26 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// PaddleOCR with element output switched on via "include_elements".
let ocrSettings = """
{
"ocr": {
"backend": "paddleocr",
"language": "en",
"element_config": {
"include_elements": true
}
}
}
"""
let extractionConfig = try extractionConfigFromJson(ocrSettings)
let extraction = try extractFileSync("scanned.pdf", nil, extractionConfig)

// ocr_elements() is optional; nothing is printed when it is nil.
extraction.ocr_elements()?.forEach { element in
    let text = element.text().toString()
    print("Text: \(text)")
}
```

View File

@@ -0,0 +1,19 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Select the "tesseract" backend with language "eng".
let ocrSettings = """
{
"ocr": {
"backend": "tesseract",
"language": "eng"
}
}
"""
let extractionConfig = try extractionConfigFromJson(ocrSettings)
let extraction = try extractFileSync("scanned.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print(text)
```

View File

@@ -0,0 +1,20 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Same Tesseract setup, with the top-level "force_ocr" flag set.
let ocrSettings = """
{
"force_ocr": true,
"ocr": {
"backend": "tesseract",
"language": "eng"
}
}
"""
let extractionConfig = try extractionConfigFromJson(ocrSettings)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print(text)
```

View File

@@ -0,0 +1,19 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Several Tesseract languages are given as one "+"-joined string.
let ocrSettings = """
{
"ocr": {
"backend": "tesseract",
"language": "eng+deu+fra"
}
}
"""
let extractionConfig = try extractionConfigFromJson(ocrSettings)
let extraction = try extractFileSync("multilingual.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print(text)
```

View File

@@ -0,0 +1,19 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Select the "paddleocr" backend with language "en".
let ocrSettings = """
{
"ocr": {
"backend": "paddleocr",
"language": "en"
}
}
"""
let extractionConfig = try extractionConfigFromJson(ocrSettings)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let text = extraction.content().toString()
print(text)
```

View File

@@ -0,0 +1,13 @@
```swift title="Swift"
import Kreuzberg
// Clear all registered plugins in each registry.
// Every call is marked `try`, so the first failure propagates immediately:
// the registries after it are not cleared and the confirmation below is not
// printed.
try Kreuzberg.clearDocumentExtractors()
try Kreuzberg.clearRenderers()
try Kreuzberg.clearOcrBackends()
try Kreuzberg.clearPostProcessors()
try Kreuzberg.clearValidators()
try Kreuzberg.clearEmbeddingBackends()
// Reached only when all six registries were cleared without an error.
print("All plugins cleared")
```

View File

@@ -0,0 +1,63 @@
```swift title="Swift"
import Kreuzberg
// Wrap a custom embedder (e.g., CoreML, ONNX, API-based).
// The Swift class must implement the EmbeddingBackend protocol.
final class MyEmbedder: EmbeddingBackend {
    /// Length of every vector produced by this backend. Kept in one place so
    /// `dimensions()` and `embed(texts:)` cannot drift apart.
    private static let embeddingDimensions = 768

    private let modelUrl: URL

    init(modelUrl: URL) {
        self.modelUrl = modelUrl
    }

    // MARK: - Plugin trait hooks

    func name() -> String {
        "my-embedder"
    }

    func version() -> String {
        "1.0.0"
    }

    /// Returns a JSON-encoded Result.
    func initialize() -> String {
        // Warm-up logic goes here. If it can throw, wrap it in do/catch and
        // return `Self.errorResult(...)` from the catch branch; with nothing
        // throwing, a catch block would be unreachable.
        "{\"ok\": null}"
    }

    /// Returns a JSON-encoded Result.
    func shutdown() -> String {
        "{\"ok\": null}"
    }

    // MARK: - EmbeddingBackend hooks

    /// Fixed dimensionality for this backend.
    func dimensions() -> UInt {
        UInt(Self.embeddingDimensions)
    }

    /// Embeds `texts` and returns a JSON-encoded Result wrapping Vec<Vec<f32>>.
    func embed(texts: [String]) -> String {
        do {
            // Embed texts using your backend (e.g., CoreML inference).
            let embeddings: [[Float]] = texts.map { _ in
                Array(repeating: 0.5, count: Self.embeddingDimensions) // Placeholder
            }
            let data = try JSONEncoder().encode(embeddings)
            let json = String(decoding: data, as: UTF8.self)
            return "{\"ok\": \(json)}"
        } catch {
            return Self.errorResult("Embedding failed: \(error)")
        }
    }

    /// Builds the `{"err": ...}` payload with the message properly JSON-escaped.
    /// Interpolating an error description straight into a JSON literal breaks
    /// as soon as the text contains a quote, backslash or newline.
    private static func errorResult(_ message: String) -> String {
        guard let data = try? JSONEncoder().encode(["err": message]) else {
            return "{\"err\": \"unknown error\"}"
        }
        return String(decoding: data, as: UTF8.self)
    }
}

// Register once at startup
let embedder = MyEmbedder(modelUrl: URL(fileURLWithPath: "/path/to/model"))
try Kreuzberg.registerEmbeddingBackend(embedder)
print("Embedding backend 'my-embedder' registered")
// The registered backend can now be referenced by name in EmbeddingConfig
// via the plugin selection mechanism once alef supports it
```

View File

@@ -0,0 +1,14 @@
<!-- snippet:skip reason="swift-bridge does not generate Swift-side protocol constructors for plugin registration. The Rust-side FFI defines SwiftDocumentExtractorBox as an opaque extern \"Swift\" type, but swift-bridge does not surface the protocol definition or constructor in the generated Swift package. Custom implementations must be written in Rust." -->
```swift title="Swift"
import Kreuzberg
// Custom DocumentExtractor registration is not available from Swift.
//
// The Rust FFI defines SwiftDocumentExtractorBox as an opaque extern "Swift" type
// (packages/swift/rust/src/lib.rs, lines 2710-2722), but the swift-bridge code
// generator does not emit a Swift-side protocol definition or factory to construct
// and register instances.
//
// Workaround: Implement DocumentExtractor in Rust and register via a Rust FFI shim,
// or use the built-in extractors (PDF, DOCX, HTML, etc.) which are pre-registered.
```

View File

@@ -0,0 +1,17 @@
```swift title="Swift"
import Kreuzberg
// Query every registry first; printing starts only after all six lookups
// have succeeded.
let extractorNames = try Kreuzberg.listDocumentExtractors()
let rendererNames = try Kreuzberg.listRenderers()
let postProcessorNames = try Kreuzberg.listPostProcessors()
let ocrBackendNames = try Kreuzberg.listOcrBackends()
let validatorNames = try Kreuzberg.listValidators()
let embeddingBackendNames = try Kreuzberg.listEmbeddingBackends()

// One line per registry.
print("Extractors: \(extractorNames)")
print("Renderers: \(rendererNames)")
print("Processors: \(postProcessorNames)")
print("OCR backends: \(ocrBackendNames)")
print("Validators: \(validatorNames)")
print("Embedding backends: \(embeddingBackendNames)")
```

View File

@@ -0,0 +1,48 @@
```swift title="Swift"
import Kreuzberg
/// Validator that reports an error when the extracted content is shorter
/// than `minLength`.
final class MinLengthValidator: Validator {
    let minLength: Int

    init(minLength: Int = 100) {
        self.minLength = minLength
    }

    func name() -> String { return "min_length_validator" }

    func version() -> String { return "1.0.0" }

    func priority() -> Int32 { return 100 }

    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        // Returns JSON-encoded Result<(), String>
        let length = result.content().count
        guard length < minLength else {
            return "{\"ok\": null}"
        }
        return "{\"err\": \"Content too short: \(length) < \(minLength)\"}"
    }

    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        return true
    }

    func initialize() -> String { return "{\"ok\": null}" }

    func shutdown() -> String { return "{\"ok\": null}" }
}

// Register an instance with an explicit minimum length.
let lengthValidator = MinLengthValidator(minLength: 100)
try Kreuzberg.registerValidator(lengthValidator)
```

View File

@@ -0,0 +1,13 @@
<!-- snippet:skip reason="swift-bridge 0.1.59 does not expose SwiftDocumentExtractorBox constructor or protocol definition in generated Swift code. Custom extractors must be implemented in Rust and registered via FFI shim." -->
```swift title="Swift"
import Kreuzberg
// Custom DocumentExtractor registration is not available from Swift.
//
// The FFI defines SwiftDocumentExtractorBox as an opaque type (packages/swift/rust/src/lib.rs),
// but swift-bridge's Swift code generator does not emit the protocol definition or
// factory required to construct and register instances from Swift.
//
// Workaround: Augment PDF extraction results by implementing a PostProcessor in Rust,
// or post-process ExtractionResult.metadata in Swift after extraction.
```

View File

@@ -0,0 +1,46 @@
```swift title="Swift"
import Kreuzberg
/// No-op post-processor that opts in only for results whose MIME type is
/// "application/pdf".
final class PdfOnlyProcessor: PostProcessor {
    func name() -> String { return "pdf-only-processor" }

    func version() -> String { return "1.0.0" }

    // ProcessingStage enum name
    func processingStage() -> String { return "middle" }

    // Default priority
    func priority() -> Int32 { return 50 }

    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        // Returns JSON-encoded Result<(), String>.
        // Nothing is modified: this processor is a no-op.
        return "{\"ok\": null}"
    }

    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        let mimeType = result.mimeType()
        return mimeType == "application/pdf"
    }

    // No processing overhead
    func estimatedDurationMs(result: ExtractionResult) -> UInt64 { return 0 }

    func initialize() -> String { return "{\"ok\": null}" }

    func shutdown() -> String { return "{\"ok\": null}" }
}

let pdfProcessor = PdfOnlyProcessor()
try Kreuzberg.registerPostProcessor(pdfProcessor)
```

View File

@@ -0,0 +1,13 @@
<!-- snippet:skip reason="swift-bridge 0.1.59 does not expose SwiftDocumentExtractorBox constructor or protocol definition in generated Swift code. Custom extractors must be implemented in Rust and registered via FFI shim." -->
```swift title="Swift"
import Kreuzberg
// Custom DocumentExtractor registration is not available from Swift.
//
// The Rust FFI (packages/swift/rust/src/lib.rs) accepts SwiftDocumentExtractorBox,
// but swift-bridge does not generate the Swift-side protocol definition or
// constructor required to implement and register instances.
//
// Solution: Implement DocumentExtractor in Rust and wrap it in a Rust FFI shim
// that links both `kreuzberg` and the `kreuzberg-swift` package.
```

View File

@@ -0,0 +1,56 @@
```swift title="Swift"
import Kreuzberg
import os.log
let logger = Logger(subsystem: "com.example.plugins", category: "MyPlugin")

/// Post-processor that only logs: it reports lifecycle events and the MIME
/// type and size of each result it sees, and never modifies the result.
final class MyPlugin: PostProcessor {
    func name() -> String {
        "my-plugin"
    }

    func version() -> String {
        "1.0.0"
    }

    func initialize() -> String {
        logger.info("Initializing plugin: my-plugin")
        return "{\"ok\": null}"
    }

    func shutdown() -> String {
        logger.info("Shutting down plugin: my-plugin")
        return "{\"ok\": null}"
    }

    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        // The message reports bytes, so measure the UTF-8 encoding rather than
        // the number of Characters.
        let byteCount = result.content().utf8.count
        // Dynamic strings are redacted by default in unified logging; a MIME
        // type is not sensitive, so mark it public to keep it readable.
        logger.info("Processing \(result.mimeType(), privacy: .public) (\(byteCount) bytes)")
        if byteCount == 0 {
            logger.warning("Processing resulted in empty content")
        }
        return "{\"ok\": null}"
    }

    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        true
    }

    func processingStage() -> String {
        "early"
    }

    func priority() -> Int32 {
        50
    }

    func estimatedDurationMs(result: ExtractionResult) -> UInt64 {
        10
    }
}

let plugin = MyPlugin()
try Kreuzberg.registerPostProcessor(plugin)
```

View File

@@ -0,0 +1,46 @@
```swift title="Swift"
import Kreuzberg
import Testing
// Exercise a Swift Validator implementation through the extraction pipeline.
final class MinLengthValidator: Validator {
    let minLength: Int

    init(minLength: Int = 100) {
        self.minLength = minLength
    }

    func name() -> String { "test-validator" }
    func version() -> String { "1.0.0" }
    func priority() -> Int32 { 50 }
    func initialize() -> String { "{\"ok\": null}" }
    func shutdown() -> String { "{\"ok\": null}" }

    /// Returns `{"err": ...}` when the content is shorter than `minLength`,
    /// `{"ok": null}` otherwise.
    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        let contentLength = result.content().count
        if contentLength < minLength {
            return "{\"err\": \"Content too short: \(contentLength) < \(minLength)\"}"
        }
        return "{\"ok\": null}"
    }

    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        true
    }
}

// Integration tests exercise validators in-pipeline during extraction.
let validator = MinLengthValidator(minLength: 100)
// The validator must be registered first; an instance that is only
// constructed is never consulted by the pipeline.
try Kreuzberg.registerValidator(validator)

// Create extraction config and result via the binding
let configJson = "{\"use_cache\": false}"
let config = try extractionConfigFromJson(configJson)

// Extract a document; the registered validator runs during extraction.
// NOTE(review): if `extractFile` is the async variant, this call needs
// `await` (or use `extractFileSync`) — confirm against the binding.
let result = try extractFile(path: "test.txt", mimeType: "text/plain", config: config)

// The validator's validate() method is invoked in-pipeline.
// If it rejects, the extraction throws an error.
```

View File

@@ -0,0 +1,55 @@
```swift title="Swift"
import Kreuzberg
/// Validator that reports an error for content shorter than 50 characters.
final class MinLengthValidator: Validator {
    func name() -> String { return "min_length" }

    func version() -> String { return "1.0.0" }

    func priority() -> Int32 { return 50 }

    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        let length = result.content().count
        guard length < 50 else {
            return "{\"ok\": null}"
        }
        return "{\"err\": \"Content too short: \(length)\"}"
    }

    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        return true
    }

    func initialize() -> String { return "{\"ok\": null}" }

    func shutdown() -> String { return "{\"ok\": null}" }
}

let lengthValidator = MinLengthValidator()
try Kreuzberg.registerValidator(lengthValidator)

// Extract a file; the validator runs in-pipeline during extraction
let extractionConfig = ExtractionConfig(
    useCache: false,
    enableQualityProcessing: false,
    resultFormat: .unified,
    outputFormat: .markdown
)
let extraction = try extractFileSync(path: "document.pdf", mimeType: nil, config: extractionConfig)
let contentLength = extraction.content().count
print("Content length: \(contentLength)")
```

View File

@@ -0,0 +1,54 @@
```swift title="Swift"
import Kreuzberg
/// Validator that reports an error when the "quality_score" metadata entry is
/// below `threshold`. A missing or non-numeric entry is treated as 0.0.
final class QualityValidator: Validator {
    let threshold: Double = 0.5

    func name() -> String { return "quality-validator" }

    func version() -> String { return "1.0.0" }

    func priority() -> Int32 { return 75 }

    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        // Read the quality score out of the metadata, defaulting to 0.0.
        // NOTE(review): assumes metadata supports string-keyed subscripting — confirm.
        let metadata = result.metadata()
        let rawScore = metadata["quality_score"] as? String
        let qualityScore = rawScore.flatMap { Double($0) } ?? 0.0

        guard qualityScore < threshold else {
            return "{\"ok\": null}"
        }
        let formatted = String(format: "%.2f", qualityScore)
        return "{\"err\": \"Quality score too low: \(formatted)\"}"
    }

    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        // Only validate if quality processing was enabled
        return config.enableQualityProcessing()
    }

    func initialize() -> String { return "{\"ok\": null}" }

    func shutdown() -> String { return "{\"ok\": null}" }
}

let qualityValidator = QualityValidator()
try Kreuzberg.registerValidator(qualityValidator)
```

View File

@@ -0,0 +1,66 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import os.lock
/// Post-processor that keeps mutable state (a call counter and a small
/// string cache) and guards every access to it with a lock.
final class StatefulPlugin: PostProcessor {
    // The lock reference never changes, so it is a constant; only the state
    // it protects is mutable.
    private let lock = NSLock()
    private var callCount: Int = 0
    private var cache: [String: String] = [:]

    func name() -> String {
        "stateful-plugin"
    }

    func version() -> String {
        "1.0.0"
    }

    func processingStage() -> String {
        "middle"
    }

    func priority() -> Int32 {
        50
    }

    /// Records the call and the MIME type of the latest result.
    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        lock.lock()
        defer { lock.unlock() }
        callCount += 1
        cache["last_mime"] = result.mimeType()
        cache["call_count"] = String(callCount)
        return "{\"ok\": null}"
    }

    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        true
    }

    func estimatedDurationMs(result: ExtractionResult) -> UInt64 {
        1 // Minimal overhead
    }

    /// Resets the counter and cache.
    func initialize() -> String {
        lock.lock()
        defer { lock.unlock() }
        callCount = 0
        cache.removeAll()
        return "{\"ok\": null}"
    }

    /// Clears the cache and prints how many results were processed.
    func shutdown() -> String {
        lock.lock()
        defer { lock.unlock() }
        let finalCount = callCount
        cache.removeAll()
        let message = "Processed \(finalCount) extractions"
        print(message)
        return "{\"ok\": null}"
    }
}

let plugin = StatefulPlugin()
try Kreuzberg.registerPostProcessor(plugin)
```

View File

@@ -0,0 +1,17 @@
```swift title="Swift"
import Kreuzberg
// One named constant per plugin, each passed to the unregister call for its
// registry.
let extractorName = "custom-json-extractor"
let postProcessorName = "word_count"
let ocrBackendName = "cloud-ocr"
let validatorName = "min_length_validator"

try Kreuzberg.unregisterDocumentExtractor(extractorName)
try Kreuzberg.unregisterPostProcessor(postProcessorName)
try Kreuzberg.unregisterOcrBackend(ocrBackendName)
try Kreuzberg.unregisterValidator(validatorName)
print("Plugins unregistered")
```

View File

@@ -0,0 +1,49 @@
```swift title="Swift"
import Kreuzberg
/// Post-processor that counts the words in the extracted content.
final class WordCountProcessor: PostProcessor {
    func name() -> String {
        "word_count"
    }

    func version() -> String {
        "1.0.0"
    }

    func processingStage() -> String {
        "early"
    }

    func priority() -> Int32 {
        50
    }

    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        // Split on any whitespace: splitting on " " alone would count words
        // separated only by newlines or tabs as a single word.
        let wordCount = result.content().split(whereSeparator: \.isWhitespace).count
        // Metadata is not directly mutable via the FFI, so store the count in
        // logs or use a side-channel approach. It is deliberately discarded
        // here; replace this line with your own reporting.
        _ = wordCount
        return "{\"ok\": null}"
    }

    /// Skips results with empty content.
    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        !result.content().isEmpty
    }

    func estimatedDurationMs(result: ExtractionResult) -> UInt64 {
        5
    }

    func initialize() -> String {
        "{\"ok\": null}"
    }

    func shutdown() -> String {
        "{\"ok\": null}"
    }
}

let processor = WordCountProcessor()
try Kreuzberg.registerPostProcessor(processor)
```

View File

@@ -0,0 +1,25 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Chunk the document and print each chunk's position as "index/total".
let chunkingSettings = """
{
"chunking": {
"max_characters": 1000,
"overlap": 200
}
}
"""
let extractionConfig = try extractionConfigFromJson(chunkingSettings)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)

if let chunks = extraction.chunks() {
    print("Chunks: \(chunks.count)")
    chunks.forEach { chunk in
        let info = chunk.metadata()
        // chunk_index() is incremented by one before it is displayed.
        let ordinal = info.chunk_index() + 1
        print("Chunk \(ordinal)/\(info.total_chunks())")
    }
}
```

View File

@@ -0,0 +1,35 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Chunk with embeddings enabled and print a summary of every chunk.
let chunkingSettings = """
{
"chunking": {
"max_characters": 500,
"overlap": 50,
"embedding": {
"model": {"preset": {"name": "balanced"}},
"normalize": true
}
}
}
"""
let extractionConfig = try extractionConfigFromJson(chunkingSettings)
let extraction = try extractFileSync("research_paper.pdf", nil, extractionConfig)

if let chunks = extraction.chunks() {
    chunks.forEach { chunk in
        let info = chunk.metadata()
        let text = chunk.content().toString()
        // Show at most the first 100 characters of the chunk.
        let snippet = String(text.prefix(100))
        let ordinal = info.chunk_index() + 1

        print("Chunk \(ordinal)/\(info.total_chunks())")
        print("Position: \(info.byte_start())-\(info.byte_end())")
        print("Content: \(snippet)...")
        if let vector = chunk.embedding() {
            print("Embedding: \(vector.count) dimensions")
        }
    }
}
```

View File

@@ -0,0 +1,32 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Chunking with embeddings; print the vector size of each embedded chunk.
let embeddingSettings = """
{
"chunking": {
"max_characters": 1024,
"overlap": 100,
"embedding": {
"model": {"preset": {"name": "balanced"}},
"normalize": true,
"batch_size": 32,
"show_download_progress": false
}
}
}
"""
let extractionConfig = try extractionConfigFromJson(embeddingSettings)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)

if let chunks = extraction.chunks() {
    print("Generated \(chunks.count) chunks")
    chunks.forEach { chunk in
        // Chunks without an embedding produce no output line.
        if let vector = chunk.embedding() {
            let chunkIndex = chunk.metadata().chunk_index()
            print("Chunk \(chunkIndex) -> \(vector.count)-dim embedding")
        }
    }
}
```

View File

@@ -0,0 +1,26 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Keyword extraction using the "yake" algorithm.
let keywordSettings = """
{
"keywords": {
"algorithm": "yake",
"max_keywords": 10,
"min_score": 0.3
}
}
"""
let extractionConfig = try extractionConfigFromJson(keywordSettings)
let extraction = try extractFileSync("research_paper.pdf", nil, extractionConfig)

// extracted_keywords() is optional; nothing is printed when it is nil.
extraction.extracted_keywords()?.forEach { keyword in
    print("\(keyword.text().toString()) (score: \(keyword.score()))")
}
```

View File

@@ -0,0 +1,22 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Enable quality processing and report the resulting score.
let qualitySettings = """
{
"enable_quality_processing": true
}
"""
let extractionConfig = try extractionConfigFromJson(qualitySettings)
let extraction = try extractFileSync("scanned_document.pdf", nil, extractionConfig)

if let score = extraction.quality_score() {
    // Scores below 0.5 get the warning wording; the number is formatted the
    // same way in both cases.
    let template = score < 0.5
        ? "Warning: Low quality extraction (%.2f)"
        : "Quality score: %.2f"
    print(String(format: template, score))
}
```

View File

@@ -0,0 +1,21 @@
<!-- snippet:syntax-only -->
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Standalone embedding needs an EmbeddingConfig that is constructed directly.
// The Swift bindings expose `embedTexts` / `embedTextsAsync`, but
// EmbeddingConfig is an opaque proxy class with no JSON-config decoding.
// Build it via the generated initializer, or embed at chunking time through
// `extractionConfigFromJson` (see embedding_with_chunking).
let inputs = RustVec<RustString>()
for sentence in ["Hello, world!", "Kreuzberg is fast"] {
    inputs.push(value: sentence.intoRustString())
}
// `config` is expected to be a fully-constructed EmbeddingConfig built via
// the generated initializer in RustBridge; it is not defined in this snippet.
let vectors = try embedTexts(inputs, config)
print(vectors.toString())
```

View File

@@ -0,0 +1,21 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Token reduction in "moderate" mode, preserving markdown and code.
let reductionSettings = """
{
"token_reduction": {
"mode": "moderate",
"preserve_markdown": true,
"preserve_code": true,
"language_hint": "eng"
}
}
"""
let extractionConfig = try extractionConfigFromJson(reductionSettings)
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)
let reducedText = extraction.content().toString()
print("Reduced content length: \(reducedText.count)")
```

View File

@@ -0,0 +1,23 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
// Token reduction, followed by any processing warnings on the result.
let reductionSettings = """
{
"token_reduction": {
"mode": "moderate",
"preserve_markdown": true
}
}
"""
let extractionConfig = try extractionConfigFromJson(reductionSettings)
let extraction = try extractFileSync("verbose_document.pdf", nil, extractionConfig)
let reducedText = extraction.content().toString()
print("Reduced content length: \(reducedText.count)")

// Each warning is printed with its source in brackets.
extraction.processing_warnings().forEach { warning in
    let origin = warning.source().toString()
    let detail = warning.message().toString()
    print("Warning [\(origin)]: \(detail)")
}
```

View File

@@ -0,0 +1,51 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import RustBridge
/// One chunk of a document, ready to be inserted into a vector store.
struct VectorRecord {
    let id: String
    let content: String
    let embedding: [Float]
    let metadata: [String: String]
}

/// Extracts `documentPath` with chunking and embeddings enabled and returns
/// one `VectorRecord` per chunk that carries an embedding.
///
/// - Parameters:
///   - documentPath: Path of the document handed to `extractFileSync`.
///   - documentId: Identifier used as the record-id prefix and stored in each
///     record's metadata.
/// - Returns: The records in chunk order; empty when the result has no chunks.
/// - Throws: Whatever `extractionConfigFromJson` or `extractFileSync` throws.
func extractAndVectorize(documentPath: String, documentId: String) throws -> [VectorRecord] {
    let embeddingSettings = """
    {
    "chunking": {
    "max_characters": 512,
    "overlap": 50,
    "embedding": {
    "model": {"preset": {"name": "balanced"}},
    "normalize": true,
    "batch_size": 32
    }
    }
    }
    """
    let extractionConfig = try extractionConfigFromJson(embeddingSettings)
    let extraction = try extractFileSync(documentPath, nil, extractionConfig)

    guard let chunks = extraction.chunks() else {
        return []
    }

    var records: [VectorRecord] = []
    for (position, chunk) in chunks.enumerated() {
        // Chunks without an embedding are skipped.
        if let vector = chunk.embedding() {
            let text = chunk.content().toString()

            var info: [String: String] = [:]
            info["document_id"] = documentId
            info["chunk_index"] = String(position)
            info["content_length"] = String(text.count)

            let record = VectorRecord(
                id: "\(documentId)_chunk_\(position)",
                content: text,
                embedding: Array(vector),
                metadata: info
            )
            records.append(record)
        }
    }
    return records
}
```