This commit is contained in:
34
docs/snippets/swift/advanced/chunk_page_mapping.md
Normal file
34
docs/snippets/swift/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: chunking (max_characters 500, overlap 50) with page extraction switched on.
let pageMappingJson = """
{
"chunking": {
"max_characters": 500,
"overlap": 50
},
"pages": {
"extract_pages": true
}
}
"""

// The second argument is left nil (presumably an optional override — confirm against the Kreuzberg Swift API).
let pageMappingConfig = try extractionConfigFromJson(pageMappingJson)
let extraction = try extractFileSync("document.pdf", nil, pageMappingConfig)

// chunks() is optional; nothing is printed when the result carries no chunks.
if let pieces = extraction.chunks() {
    pieces.forEach { piece in
        let info = piece.metadata()
        let body = piece.content().toString()
        // Only the first 50 characters are shown.
        let excerpt = String(body.prefix(50))

        // A page range is reported only when both page bounds are present.
        if let firstPage = info.first_page(), let lastPage = info.last_page() {
            let pages: String
            if firstPage == lastPage {
                pages = "Page \(firstPage)"
            } else {
                pages = "Pages \(firstPage)-\(lastPage)"
            }
            print("Chunk: \(excerpt)... (\(pages))")
        } else {
            print("Chunk: \(excerpt)... (no page info)")
        }
    }
}
|
||||
```
|
||||
25
docs/snippets/swift/advanced/chunking_config.md
Normal file
25
docs/snippets/swift/advanced/chunking_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: chunking with max_characters 1000 and overlap 200.
let chunkingJson = """
{
"chunking": {
"max_characters": 1000,
"overlap": 200
}
}
"""

let chunkingConfig = try extractionConfigFromJson(chunkingJson)
let extraction = try extractFileSync("document.pdf", nil, chunkingConfig)

// chunks() is optional; nothing is printed when the result carries no chunks.
if let pieces = extraction.chunks() {
    print("Chunks: \(pieces.count)")
    pieces.forEach { piece in
        let info = piece.metadata()
        // chunk_index() is shifted by one for display.
        let position = info.chunk_index() + 1
        print("Chunk \(position)/\(info.total_chunks())")
    }
}
|
||||
```
|
||||
35
docs/snippets/swift/advanced/chunking_rag.md
Normal file
35
docs/snippets/swift/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: chunking (max_characters 500, overlap 50) with an embedding section
// that selects the "balanced" preset and sets normalize to true.
let ragJson = """
{
"chunking": {
"max_characters": 500,
"overlap": 50,
"embedding": {
"model": {"preset": {"name": "balanced"}},
"normalize": true
}
}
}
"""

let ragConfig = try extractionConfigFromJson(ragJson)
let extraction = try extractFileSync("research_paper.pdf", nil, ragConfig)

// chunks() is optional; nothing is printed when the result carries no chunks.
if let pieces = extraction.chunks() {
    for piece in pieces {
        let info = piece.metadata()
        let body = piece.content().toString()
        // Only the first 100 characters are shown.
        let excerpt = String(body.prefix(100))

        print("Chunk \(info.chunk_index() + 1)/\(info.total_chunks())")
        print("Position: \(info.byte_start())-\(info.byte_end())")
        print("Content: \(excerpt)...")

        // The embedding line is printed only for chunks that carry an embedding.
        guard let vector = piece.embedding() else { continue }
        print("Embedding: \(vector.count) dimensions")
    }
}
|
||||
```
|
||||
32
docs/snippets/swift/advanced/embedding_with_chunking.md
Normal file
32
docs/snippets/swift/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: chunking (max_characters 1024, overlap 100) with an embedding section:
// "balanced" preset, normalize true, batch_size 32, show_download_progress false.
let embeddingJson = """
{
"chunking": {
"max_characters": 1024,
"overlap": 100,
"embedding": {
"model": {"preset": {"name": "balanced"}},
"normalize": true,
"batch_size": 32,
"show_download_progress": false
}
}
}
"""

let embeddingConfig = try extractionConfigFromJson(embeddingJson)
let extraction = try extractFileSync("document.pdf", nil, embeddingConfig)

// chunks() is optional; nothing is printed when the result carries no chunks.
if let pieces = extraction.chunks() {
    print("Generated \(pieces.count) chunks")
    for piece in pieces {
        // Chunks without an embedding are skipped silently.
        guard let vector = piece.embedding() else { continue }
        let index = piece.metadata().chunk_index()
        print("Chunk \(index) -> \(vector.count)-dim embedding")
    }
}
|
||||
```
|
||||
24
docs/snippets/swift/advanced/keyword_extraction_config.md
Normal file
24
docs/snippets/swift/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: keyword extraction with the "yake" algorithm, max_keywords 10,
// min_score 0.3, ngram_range [1, 3] and language "en".
let keywordJson = """
{
"keywords": {
"algorithm": "yake",
"max_keywords": 10,
"min_score": 0.3,
"ngram_range": [1, 3],
"language": "en"
}
}
"""

let keywordConfig = try extractionConfigFromJson(keywordJson)
let extraction = try extractFileSync("document.pdf", nil, keywordConfig)

// extracted_keywords() is optional; nothing is printed when it is nil.
if let found = extraction.extracted_keywords() {
    let total = found.count
    print("Extracted \(total) keywords")
}
|
||||
```
|
||||
26
docs/snippets/swift/advanced/keyword_extraction_example.md
Normal file
26
docs/snippets/swift/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: keyword extraction with the "yake" algorithm, max_keywords 10, min_score 0.3.
let keywordJson = """
{
"keywords": {
"algorithm": "yake",
"max_keywords": 10,
"min_score": 0.3
}
}
"""

let keywordConfig = try extractionConfigFromJson(keywordJson)
let extraction = try extractFileSync("research_paper.pdf", nil, keywordConfig)

// extracted_keywords() is optional; nothing is printed when it is nil.
if let found = extraction.extracted_keywords() {
    // One line per keyword: its text followed by its score.
    found.forEach { entry in
        print("\(entry.text().toString()) (score: \(entry.score()))")
    }
}
|
||||
```
|
||||
23
docs/snippets/swift/advanced/language_detection_config.md
Normal file
23
docs/snippets/swift/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: language detection enabled, min_confidence 0.8, detect_multiple false.
let detectionJson = """
{
"language_detection": {
"enabled": true,
"min_confidence": 0.8,
"detect_multiple": false
}
}
"""

let detectionConfig = try extractionConfigFromJson(detectionJson)
let extraction = try extractFileSync("document.pdf", nil, detectionConfig)

// detected_languages() is optional; nothing is printed when it is nil.
if let detected = extraction.detected_languages() {
    // Convert each entry with toString() before printing the collection.
    let codes = detected.map { entry in entry.toString() }
    print("Detected: \(codes)")
}
|
||||
```
|
||||
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: language detection enabled, min_confidence 0.8, detect_multiple true.
let detectionJson = """
{
"language_detection": {
"enabled": true,
"min_confidence": 0.8,
"detect_multiple": true
}
}
"""

let detectionConfig = try extractionConfigFromJson(detectionJson)
let extraction = try extractFileSync("multilingual_document.pdf", nil, detectionConfig)

// detected_languages() is optional; nothing is printed when it is nil.
if let detected = extraction.detected_languages() {
    // Convert each entry with toString() before printing the collection.
    let codes = detected.map { entry in entry.toString() }
    print("Detected languages: \(codes)")
}
|
||||
```
|
||||
18
docs/snippets/swift/advanced/quality_processing_config.md
Normal file
18
docs/snippets/swift/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: enable_quality_processing set to true.
let qualityJson = """
{
"enable_quality_processing": true
}
"""

let qualityConfig = try extractionConfigFromJson(qualityJson)
let extraction = try extractFileSync("document.pdf", nil, qualityConfig)

// quality_score() is optional; nothing is printed when it is nil.
if let quality = extraction.quality_score() {
    // Two decimal places.
    let line = String(format: "Quality score: %.2f", quality)
    print(line)
}
|
||||
```
|
||||
22
docs/snippets/swift/advanced/quality_processing_example.md
Normal file
22
docs/snippets/swift/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: enable_quality_processing set to true.
let qualityJson = """
{
"enable_quality_processing": true
}
"""

let qualityConfig = try extractionConfigFromJson(qualityJson)
let extraction = try extractFileSync("scanned_document.pdf", nil, qualityConfig)

// quality_score() is optional; nothing is printed when it is nil.
if let quality = extraction.quality_score() {
    // Scores strictly below 0.5 get the warning wording; everything else the plain report.
    let template = quality < 0.5
        ? "Warning: Low quality extraction (%.2f)"
        : "Quality score: %.2f"
    print(String(format: template, quality))
}
|
||||
```
|
||||
21
docs/snippets/swift/advanced/token_reduction_config.md
Normal file
21
docs/snippets/swift/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: token reduction in "moderate" mode with preserve_markdown and
// preserve_code set to true and language_hint "eng".
let reductionJson = """
{
"token_reduction": {
"mode": "moderate",
"preserve_markdown": true,
"preserve_code": true,
"language_hint": "eng"
}
}
"""

let reductionConfig = try extractionConfigFromJson(reductionJson)
let extraction = try extractFileSync("document.pdf", nil, reductionConfig)

// Report the character count of the extracted content.
let reducedText = extraction.content().toString()
print("Reduced content length: \(reducedText.count)")
|
||||
```
|
||||
23
docs/snippets/swift/advanced/token_reduction_example.md
Normal file
23
docs/snippets/swift/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Configuration: token reduction in "moderate" mode with preserve_markdown set to true.
let reductionJson = """
{
"token_reduction": {
"mode": "moderate",
"preserve_markdown": true
}
}
"""

let reductionConfig = try extractionConfigFromJson(reductionJson)
let extraction = try extractFileSync("verbose_document.pdf", nil, reductionConfig)

// Report the character count of the extracted content.
let reducedLength = extraction.content().toString().count
print("Reduced content length: \(reducedLength)")

// List every processing warning with its source and message.
extraction.processing_warnings().forEach { warning in
    let origin = warning.source().toString()
    let detail = warning.message().toString()
    print("Warning [\(origin)]: \(detail)")
}
|
||||
```
|
||||
51
docs/snippets/swift/advanced/vector_database_integration.md
Normal file
51
docs/snippets/swift/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,51 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
/// One chunk of a document together with its embedding, as assembled by
/// `extractAndVectorize(documentPath:documentId:)`.
struct VectorRecord {
    /// Record identifier.
    let id: String

    /// Text of the chunk.
    let content: String

    /// Embedding vector for the chunk.
    let embedding: [Float]

    /// String key/value pairs stored alongside the record.
    let metadata: [String: String]
}
|
||||
|
||||
/// Extracts a document with chunking and embeddings enabled and returns one
/// `VectorRecord` per chunk that carries an embedding.
///
/// - Parameters:
///   - documentPath: Path handed to `extractFileSync`.
///   - documentId: Prefix for each record id (`"<documentId>_chunk_<index>"`); also
///     stored under the `"document_id"` metadata key.
/// - Returns: The collected records; empty when the result has no chunks.
/// - Throws: Whatever `extractionConfigFromJson` or `extractFileSync` throws.
func extractAndVectorize(documentPath: String, documentId: String) throws -> [VectorRecord] {
    // Chunking (max_characters 512, overlap 50) with embeddings: "balanced" preset,
    // normalize true, batch_size 32.
    let vectorizeJson = """
    {
    "chunking": {
    "max_characters": 512,
    "overlap": 50,
    "embedding": {
    "model": {"preset": {"name": "balanced"}},
    "normalize": true,
    "batch_size": 32
    }
    }
    }
    """

    let vectorizeConfig = try extractionConfigFromJson(vectorizeJson)
    let extraction = try extractFileSync(documentPath, nil, vectorizeConfig)

    guard let pieces = extraction.chunks() else { return [] }

    // The index is the position in the full chunk sequence, so chunks skipped for
    // lacking an embedding leave gaps in the numbering.
    return pieces.enumerated().compactMap { (position, piece) -> VectorRecord? in
        guard let vector = piece.embedding() else { return nil }
        let text = piece.content().toString()
        let tags: [String: String] = [
            "document_id": documentId,
            "chunk_index": String(position),
            "content_length": String(text.count),
        ]
        return VectorRecord(
            id: "\(documentId)_chunk_\(position)",
            content: text,
            embedding: vector.map { $0 },
            metadata: tags
        )
    }
}
|
||||
```
|
||||
Reference in New Issue
Block a user