Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
```swift title="Swift"
import Kreuzberg
// Clear all registered plugins in each registry.
// Every call is marked `try`: the first one that throws stops the script,
// and the registries listed after it are not cleared.
try Kreuzberg.clearDocumentExtractors()
try Kreuzberg.clearRenderers()
try Kreuzberg.clearOcrBackends()
try Kreuzberg.clearPostProcessors()
try Kreuzberg.clearValidators()
try Kreuzberg.clearEmbeddingBackends()
// Reached only when none of the calls above threw.
print("All plugins cleared")
```

View File

@@ -0,0 +1,63 @@
```swift title="Swift"
import Kreuzberg
// Wrap a custom embedder (e.g., CoreML, ONNX, API-based).
// The Swift class must implement the EmbeddingBackend protocol.
//
// Every hook that reports success or failure returns a JSON string:
// `{"ok": <payload>}` on success, `{"err": "<message>"}` on failure.
final class MyEmbedder: EmbeddingBackend {
    /// Length of every vector this backend produces. Shared by `dimensions()`
    /// and `embed(texts:)` so the advertised and actual sizes cannot drift apart.
    private static let vectorLength = 768

    /// Location of the model; not read by the placeholder code below.
    private let modelUrl: URL

    init(modelUrl: URL) {
        self.modelUrl = modelUrl
    }

    // MARK: - Plugin trait hooks

    func name() -> String {
        "my-embedder"
    }

    func version() -> String {
        "1.0.0"
    }

    /// Runs the warm-up step.
    /// - Returns: `{"ok": null}` on success, otherwise an `err` object describing the failure.
    func initialize() -> String {
        do {
            try warmUp()
            return "{\"ok\": null}"
        } catch {
            return Self.errorJSON("Failed to initialize: \(error)")
        }
    }

    /// - Returns: always `{"ok": null}`; this backend holds nothing that needs releasing.
    func shutdown() -> String {
        "{\"ok\": null}"
    }

    // MARK: - EmbeddingBackend hooks

    /// Fixed dimensionality for this backend.
    func dimensions() -> UInt {
        UInt(Self.vectorLength)
    }

    /// Produces one vector per input text.
    /// - Parameter texts: Texts to embed; an empty array yields `{"ok": []}`.
    /// - Returns: `{"ok": [[Float]]}` with one inner array per text, or an `err` object.
    func embed(texts: [String]) -> String {
        do {
            // Embed texts using your backend (e.g., CoreML inference)
            let embeddings: [[Float]] = texts.map { _ in
                Array(repeating: 0.5, count: Self.vectorLength) // Placeholder
            }
            let data = try JSONEncoder().encode(embeddings)
            // JSONEncoder emits UTF-8, so decoding cannot fail and needs no fallback.
            return "{\"ok\": \(String(decoding: data, as: UTF8.self))}"
        } catch {
            return Self.errorJSON("Embedding failed: \(error)")
        }
    }

    // MARK: - Helpers

    /// Placeholder for model loading. Declared `throws` so the `catch` in
    /// `initialize()` is reachable once real warm-up code is added.
    private func warmUp() throws {
        // Warm-up logic here
    }

    /// Builds an `{"err": …}` payload with the message escaped by the encoder.
    /// Interpolating the message directly would produce invalid JSON whenever it
    /// contains quotes, backslashes or newlines, which error descriptions often do.
    private static func errorJSON(_ message: String) -> String {
        guard let data = try? JSONEncoder().encode(["err": message]) else {
            return "{\"err\": \"error message could not be encoded\"}"
        }
        return String(decoding: data, as: UTF8.self)
    }
}
// Register once at startup.
// The instance stays bound to `embedder` for the rest of the script. `try` means a
// registration failure propagates and the confirmation below is never printed.
let embedder = MyEmbedder(modelUrl: URL(fileURLWithPath: "/path/to/model"))
try Kreuzberg.registerEmbeddingBackend(embedder)
print("Embedding backend 'my-embedder' registered")
// The registered backend can now be referenced by name in EmbeddingConfig
// via the plugin selection mechanism once alef supports it
```

View File

@@ -0,0 +1,14 @@
<!-- snippet:skip reason="swift-bridge does not generate Swift-side protocol constructors for plugin registration. The Rust-side FFI defines SwiftDocumentExtractorBox as an opaque extern \"Swift\" type, but swift-bridge does not surface the protocol definition or constructor in the generated Swift package. Custom implementations must be written in Rust." -->
```swift title="Swift"
import Kreuzberg
// Custom DocumentExtractor registration is not available from Swift.
//
// The Rust FFI defines SwiftDocumentExtractorBox as an opaque extern "Swift" type
// (packages/swift/rust/src/lib.rs, lines 2710-2722), but the swift-bridge code
// generator does not emit a Swift-side protocol definition or factory to construct
// and register instances.
//
// Workaround: Implement DocumentExtractor in Rust and register via a Rust FFI shim,
// or use the built-in extractors (PDF, DOCX, HTML, etc.) which are pre-registered.
```

View File

@@ -0,0 +1,17 @@
```swift title="Swift"
import Kreuzberg
// Query every plugin registry first, then print the results.
// All six list calls run before the first print, so if any of them throws
// the script stops without printing a partial report.
let extractors = try Kreuzberg.listDocumentExtractors()
let renderers = try Kreuzberg.listRenderers()
let processors = try Kreuzberg.listPostProcessors()
let ocrBackends = try Kreuzberg.listOcrBackends()
let validators = try Kreuzberg.listValidators()
let embeddingBackends = try Kreuzberg.listEmbeddingBackends()
// One line per registry, using each value's default string interpolation.
print("Extractors: \(extractors)")
print("Renderers: \(renderers)")
print("Processors: \(processors)")
print("OCR backends: \(ocrBackends)")
print("Validators: \(validators)")
print("Embedding backends: \(embeddingBackends)")
```

View File

@@ -0,0 +1,48 @@
```swift title="Swift"
import Kreuzberg
/// Validator that rejects an extraction result when `result.content().count`
/// is below `minLength`.
final class MinLengthValidator: Validator {
    /// Smallest content count that passes validation.
    let minLength: Int

    init(minLength: Int = 100) {
        self.minLength = minLength
    }

    // MARK: - Identity and ordering

    func name() -> String { return "min_length_validator" }

    func version() -> String { return "1.0.0" }

    func priority() -> Int32 { return 100 }

    // MARK: - Lifecycle

    func initialize() -> String { return "{\"ok\": null}" }

    func shutdown() -> String { return "{\"ok\": null}" }

    // MARK: - Validation

    /// This validator applies to every result.
    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        return true
    }

    /// Returns JSON-encoded Result<(), String>: `{"ok": null}` when the content is
    /// long enough, otherwise an `err` object stating the measured and required counts.
    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        let measured = result.content().count
        guard measured >= minLength else {
            return "{\"err\": \"Content too short: \(measured) < \(minLength)\"}"
        }
        return "{\"ok\": null}"
    }
}
// Build the validator with a minimum length of 100 and register it.
// `try`: a registration failure propagates out of the script.
let validator = MinLengthValidator(minLength: 100)
try Kreuzberg.registerValidator(validator)
```

View File

@@ -0,0 +1,13 @@
<!-- snippet:skip reason="swift-bridge 0.1.59 does not expose SwiftDocumentExtractorBox constructor or protocol definition in generated Swift code. Custom extractors must be implemented in Rust and registered via FFI shim." -->
```swift title="Swift"
import Kreuzberg
// Custom DocumentExtractor registration is not available from Swift.
//
// The FFI defines SwiftDocumentExtractorBox opaque type (packages/swift/rust/src/lib.rs),
// but swift-bridge's Swift code generator does not emit the protocol definition or
// factory required to construct and register instances from Swift.
//
// Workaround: Augment PDF extraction results by implementing a PostProcessor in Rust,
// or post-process ExtractionResult.metadata in Swift after extraction.
```

View File

@@ -0,0 +1,46 @@
```swift title="Swift"
import Kreuzberg
/// Post-processor that opts in only for results whose MIME type equals
/// "application/pdf" and then performs no work on them.
final class PdfOnlyProcessor: PostProcessor {
    // MARK: - Identity

    func name() -> String { return "pdf-only-processor" }

    func version() -> String { return "1.0.0" }

    // MARK: - Scheduling

    func processingStage() -> String {
        return "middle" // ProcessingStage enum name
    }

    func priority() -> Int32 {
        return 50 // Default priority
    }

    func estimatedDurationMs(result: ExtractionResult) -> UInt64 {
        return 0 // No processing overhead
    }

    // MARK: - Lifecycle

    func initialize() -> String { return "{\"ok\": null}" }

    func shutdown() -> String { return "{\"ok\": null}" }

    // MARK: - Processing

    /// Selects PDF results only; everything else is skipped.
    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        return result.mimeType() == "application/pdf"
    }

    /// Returns JSON-encoded Result<(), String>.
    /// No-op: the result is left as it is and success is reported.
    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        return "{\"ok\": null}"
    }
}
// Create the processor and register it as a post-processor.
// `try`: a registration failure propagates out of the script.
let processor = PdfOnlyProcessor()
try Kreuzberg.registerPostProcessor(processor)
```

View File

@@ -0,0 +1,13 @@
<!-- snippet:skip reason="swift-bridge 0.1.59 does not expose SwiftDocumentExtractorBox constructor or protocol definition in generated Swift code. Custom extractors must be implemented in Rust and registered via FFI shim." -->
```swift title="Swift"
import Kreuzberg
// Custom DocumentExtractor registration is not available from Swift.
//
// The Rust FFI (packages/swift/rust/src/lib.rs) accepts SwiftDocumentExtractorBox,
// but swift-bridge does not generate the Swift-side protocol definition or
// constructor required to implement and register instances.
//
// Solution: Implement DocumentExtractor in Rust and wrap it in a Rust FFI shim
// that links both `kreuzberg` and the `kreuzberg-swift` package.
```

View File

@@ -0,0 +1,56 @@
```swift title="Swift"
import Kreuzberg
import os.log
/// Shared logger for the plugin's lifecycle and per-document messages.
let logger = Logger(subsystem: "com.example.plugins", category: "MyPlugin")

/// Post-processor that logs its lifecycle events and a short summary of every
/// result it sees. It never modifies the result and always reports success.
final class MyPlugin: PostProcessor {
    // MARK: - Identity

    func name() -> String {
        "my-plugin"
    }

    func version() -> String {
        "1.0.0"
    }

    // MARK: - Lifecycle

    func initialize() -> String {
        logger.info("Initializing plugin: my-plugin")
        return "{\"ok\": null}"
    }

    func shutdown() -> String {
        logger.info("Shutting down plugin: my-plugin")
        return "{\"ok\": null}"
    }

    // MARK: - Processing

    /// Logs the MIME type and content length, and warns when the content is empty.
    /// - Returns: always `{"ok": null}`.
    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        let contentLen = result.content().count
        // Logger redacts interpolated dynamic strings by default, so the MIME type
        // would show as <private>. It is not sensitive, so mark it public.
        // `.count` is an element count, not a byte count, hence "characters".
        logger.info("Processing \(result.mimeType(), privacy: .public) (\(contentLen) characters)")
        if contentLen == 0 {
            logger.warning("Processing resulted in empty content")
        }
        return "{\"ok\": null}"
    }

    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        true
    }

    // MARK: - Scheduling

    func processingStage() -> String {
        "early"
    }

    func priority() -> Int32 {
        50
    }

    func estimatedDurationMs(result: ExtractionResult) -> UInt64 {
        10
    }
}
// Create the logging plugin and register it as a post-processor.
// `try`: a registration failure propagates out of the script.
let plugin = MyPlugin()
try Kreuzberg.registerPostProcessor(plugin)
```

View File

@@ -0,0 +1,46 @@
```swift title="Swift"
import Kreuzberg
import Testing
// Validator under test: rejects results whose content count is below `minLength`.
final class MinLengthValidator: Validator {
    /// Smallest content count that passes validation.
    let minLength: Int

    init(minLength: Int = 100) {
        self.minLength = minLength
    }

    // MARK: - Identity and ordering

    func name() -> String {
        return "test-validator"
    }

    func version() -> String {
        return "1.0.0"
    }

    func priority() -> Int32 {
        return 50
    }

    // MARK: - Lifecycle

    func initialize() -> String {
        return "{\"ok\": null}"
    }

    func shutdown() -> String {
        return "{\"ok\": null}"
    }

    // MARK: - Validation

    /// This validator applies to every result.
    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        return true
    }

    /// Returns `{"ok": null}` when the content is long enough, otherwise an
    /// `err` object stating the measured and required counts.
    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        let measured = result.content().count
        guard measured >= minLength else {
            return "{\"err\": \"Content too short: \(measured) < \(minLength)\"}"
        }
        return "{\"ok\": null}"
    }
}
// Exercise the validator through the extraction pipeline.
// A validator takes part in extraction only after it has been registered, so
// register it before extracting; otherwise validate() is never invoked.
let validator = MinLengthValidator(minLength: 100)
try Kreuzberg.registerValidator(validator)
// Create the extraction config via the binding
let configJson = "{\"use_cache\": false}"
let config = try extractionConfigFromJson(configJson)
// Extract a document; the registered validator runs during extraction
let result = try extractFile(path: "test.txt", mimeType: "text/plain", config: config)
// The validator's validate() method is invoked in-pipeline.
// If it rejects, the extraction throws an error.
```

View File

@@ -0,0 +1,55 @@
```swift title="Swift"
import Kreuzberg
/// Validator that rejects results whose content count is below 50.
final class MinLengthValidator: Validator {
    /// Smallest content count that passes validation.
    private let requiredLength = 50

    // MARK: - Identity and ordering

    func name() -> String { return "min_length" }

    func version() -> String { return "1.0.0" }

    func priority() -> Int32 { return 50 }

    // MARK: - Lifecycle

    func initialize() -> String { return "{\"ok\": null}" }

    func shutdown() -> String { return "{\"ok\": null}" }

    // MARK: - Validation

    /// This validator applies to every result.
    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        return true
    }

    /// Returns `{"ok": null}` when the content is long enough, otherwise an
    /// `err` object stating the measured count.
    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        let measured = result.content().count
        guard measured >= requiredLength else {
            return "{\"err\": \"Content too short: \(measured)\"}"
        }
        return "{\"ok\": null}"
    }
}
// Register the validator before extracting so it is part of the pipeline.
let validator = MinLengthValidator()
try Kreuzberg.registerValidator(validator)
// Extract a file; the validator runs in-pipeline during extraction
// Caching and quality processing are switched off for this run.
let config = ExtractionConfig(
useCache: false,
enableQualityProcessing: false,
resultFormat: .unified,
outputFormat: .markdown
)
// No MIME type is supplied (`nil`). `try` propagates any extraction error.
let result = try extractFileSync(
path: "document.pdf",
mimeType: nil,
config: config
)
print("Content length: \(result.content().count)")
```

View File

@@ -0,0 +1,54 @@
```swift title="Swift"
import Kreuzberg
/// Validator that rejects results whose `quality_score` metadata entry is
/// below `threshold`. A missing or non-numeric entry counts as 0.0.
final class QualityValidator: Validator {
    /// Minimum acceptable quality score.
    let threshold: Double = 0.5

    // MARK: - Identity and ordering

    func name() -> String { return "quality-validator" }

    func version() -> String { return "1.0.0" }

    func priority() -> Int32 { return 75 }

    // MARK: - Lifecycle

    func initialize() -> String { return "{\"ok\": null}" }

    func shutdown() -> String { return "{\"ok\": null}" }

    // MARK: - Validation

    /// Only validate if quality processing was enabled.
    func shouldValidate(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        return config.enableQualityProcessing()
    }

    /// Returns `{"ok": null}` when the score reaches `threshold`, otherwise an
    /// `err` object carrying the score formatted to two decimals.
    func validate(result: ExtractionResult, config: ExtractionConfig) -> String {
        // Read the score from metadata; fall back to 0.0 when it is absent or unparsable.
        let rawScore = result.metadata()["quality_score"] as? String
        let qualityScore: Double = rawScore.flatMap { Double($0) } ?? 0.0

        guard qualityScore >= threshold else {
            let formatted = String(format: "%.2f", qualityScore)
            return "{\"err\": \"Quality score too low: \(formatted)\"}"
        }
        return "{\"ok\": null}"
    }
}
// Create the quality validator and register it.
// `try`: a registration failure propagates out of the script.
let validator = QualityValidator()
try Kreuzberg.registerValidator(validator)
```

View File

@@ -0,0 +1,66 @@
```swift title="Swift"
import Foundation
import Kreuzberg
import os.lock
/// Post-processor that keeps mutable state across invocations: a call counter
/// and a small string cache. Every read or write of that state happens while
/// `lock` is held.
final class StatefulPlugin: PostProcessor {
    // The lock itself is never reassigned, so it is a constant; only the state
    // it guards is mutable.
    private let lock = NSLock()
    private var callCount: Int = 0
    private var cache: [String: String] = [:]

    // MARK: - Identity

    func name() -> String {
        "stateful-plugin"
    }

    func version() -> String {
        "1.0.0"
    }

    // MARK: - Scheduling

    func processingStage() -> String {
        "middle"
    }

    func priority() -> Int32 {
        50
    }

    func estimatedDurationMs(result: ExtractionResult) -> UInt64 {
        1 // Minimal overhead
    }

    // MARK: - Processing

    /// Increments the counter and records the latest MIME type and count in the cache.
    /// - Returns: always `{"ok": null}`.
    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        lock.lock()
        defer { lock.unlock() }
        callCount += 1
        cache["last_mime"] = result.mimeType()
        cache["call_count"] = String(callCount)
        return "{\"ok\": null}"
    }

    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        true
    }

    // MARK: - Lifecycle

    /// Resets the counter and empties the cache.
    func initialize() -> String {
        lock.lock()
        defer { lock.unlock() }
        callCount = 0
        cache.removeAll()
        return "{\"ok\": null}"
    }

    /// Empties the cache and prints how many results were processed.
    func shutdown() -> String {
        lock.lock()
        defer { lock.unlock() }
        let finalCount = callCount
        cache.removeAll()
        let message = "Processed \(finalCount) extractions"
        print(message)
        return "{\"ok\": null}"
    }
}
// Create the stateful plugin and register it as a post-processor.
// `try`: a registration failure propagates out of the script.
let plugin = StatefulPlugin()
try Kreuzberg.registerPostProcessor(plugin)
```

View File

@@ -0,0 +1,17 @@
```swift title="Swift"
import Kreuzberg
// One previously registered plugin name per registry.
let extractorName = "custom-json-extractor"
let postProcessorName = "word_count"
let ocrBackendName = "cloud-ocr"
let validatorName = "min_length_validator"

// Remove each plugin from its own registry. The first call that throws stops
// the script, so the confirmation is printed only when all four succeed.
try Kreuzberg.unregisterDocumentExtractor(extractorName)
try Kreuzberg.unregisterPostProcessor(postProcessorName)
try Kreuzberg.unregisterOcrBackend(ocrBackendName)
try Kreuzberg.unregisterValidator(validatorName)
print("Plugins unregistered")
```

View File

@@ -0,0 +1,49 @@
```swift title="Swift"
import Kreuzberg
/// Post-processor that counts the whitespace-separated words in a result's
/// content and reports the count on standard output.
final class WordCountProcessor: PostProcessor {
    // MARK: - Identity

    func name() -> String {
        "word_count"
    }

    func version() -> String {
        "1.0.0"
    }

    // MARK: - Scheduling

    func processingStage() -> String {
        "early"
    }

    func priority() -> Int32 {
        50
    }

    func estimatedDurationMs(result: ExtractionResult) -> UInt64 {
        5
    }

    // MARK: - Processing

    /// Counts the words in the content and prints the total.
    /// - Returns: always `{"ok": null}`.
    func process(result: ExtractionResult, config: ExtractionConfig) -> String {
        // Split on any whitespace, not just " ", so words separated by newlines
        // or tabs are counted individually. Empty pieces are dropped by default.
        let wordCount = result.content().split(whereSeparator: \.isWhitespace).count
        // Metadata is not directly mutable via the FFI, so the count is surfaced
        // through standard output as a side channel.
        print("word_count: \(wordCount)")
        return "{\"ok\": null}"
    }

    /// Skips results with empty content.
    func shouldProcess(result: ExtractionResult, config: ExtractionConfig) -> Bool {
        !result.content().isEmpty
    }

    // MARK: - Lifecycle

    func initialize() -> String {
        "{\"ok\": null}"
    }

    func shutdown() -> String {
        "{\"ok\": null}"
    }
}
// Create the word-count processor and register it as a post-processor.
// `try`: a registration failure propagates out of the script.
let processor = WordCountProcessor()
try Kreuzberg.registerPostProcessor(processor)
```