This commit is contained in:
25
docs/snippets/swift/metadata/language_detection.md
Normal file
25
docs/snippets/swift/metadata/language_detection.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"min_confidence": 0.8,
|
||||
"detect_multiple": false
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
// Build the extraction configuration from the JSON above and run a synchronous extraction.
let config = try extractionConfigFromJson(configJson)
let result = try extractFileSync("document.pdf", nil, config)

// `detected_languages()` is optional; convert each entry to a Swift `String` only when it is present.
let detectedCodes = result.detected_languages()?.map { $0.toString() }

if let detectedCodes = detectedCodes {
    print("Detected languages: \(detectedCodes)")
} else {
    print("No languages detected")
}
|
||||
```
|
||||
@@ -0,0 +1,23 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"language_detection": {
|
||||
"enabled": true,
|
||||
"min_confidence": 0.8,
|
||||
"detect_multiple": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
// Build the extraction configuration from the JSON above and run a synchronous extraction.
let config = try extractionConfigFromJson(configJson)
let result = try extractFileSync("multilingual_document.pdf", nil, config)

// Nothing is printed when the result carries no detected languages.
if let detectedCodes = result.detected_languages()?.map({ $0.toString() }) {
    print("Detected languages: \(detectedCodes)")
}
|
||||
```
|
||||
43
docs/snippets/swift/metadata/metadata.md
Normal file
43
docs/snippets/swift/metadata/metadata.md
Normal file
@@ -0,0 +1,43 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Extract using a configuration parsed from an empty JSON object
// (presumably this leaves every option at its default — confirm against the binding docs).
let config = try extractionConfigFromJson("{}")
let result = try extractFileSync("document.pdf", nil, config)
let metadata = result.metadata()

// Every accessor below returns an optional, so a field is printed only when it is present.

// MARK: - Single-valued text fields

if let title = metadata.title()?.toString() {
    print("Title: \(title)")
}
if let subject = metadata.subject()?.toString() {
    print("Subject: \(subject)")
}
if let language = metadata.language()?.toString() {
    print("Language: \(language)")
}
if let createdAt = metadata.created_at()?.toString() {
    print("Created at: \(createdAt)")
}
if let modifiedAt = metadata.modified_at()?.toString() {
    print("Modified at: \(modifiedAt)")
}
if let createdBy = metadata.created_by()?.toString() {
    print("Created by: \(createdBy)")
}

// MARK: - List-valued fields

if let authorNames = metadata.authors()?.map({ $0.toString() }) {
    print("Authors: \(authorNames)")
}
if let keywordList = metadata.keywords()?.map({ $0.toString() }) {
    print("Keywords: \(keywordList)")
}

// MARK: - Extraction statistics

if let elapsedMs = metadata.extraction_duration_ms() {
    print("Extraction duration (ms): \(elapsedMs)")
}
if let pageInfo = metadata.pages() {
    print("Page count: \(pageInfo.total_count())")
}
|
||||
```
|
||||
35
docs/snippets/swift/metadata/page_boundaries.md
Normal file
35
docs/snippets/swift/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,35 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Extract the document, then use the page boundaries to slice a short preview
// of the first few pages out of the extracted content.
let config = try extractionConfigFromJson("{}")
let result = try extractFileSync("document.pdf", nil, config)

// Boundaries are applied to the UTF-8 bytes of the content (not to Characters),
// so materialize the bytes once for integer-indexed slicing.
let content = result.content().toString()
let utf8 = Array(content.utf8)

guard let pageStructure = result.metadata().pages() else {
    print("No page structure available")
    exit(0)
}
guard let boundaries = pageStructure.boundaries() else {
    print("No page boundaries available")
    exit(0)
}

// Limits for the demo output.
let maxPreviewPages = 3
let maxPreviewCharacters = 100

for boundary in boundaries.prefix(maxPreviewPages) {
    // NOTE(review): the bridge may expose these offsets as an unsigned integer
    // type; `Int(...)` normalizes either way so they can index `utf8` — confirm.
    let byteStart = Int(boundary.byte_start())
    let byteEnd = Int(boundary.byte_end())

    // Slicing with an out-of-range or inverted range traps at runtime, so
    // validate first and skip the page instead of crashing.
    guard byteStart >= 0, byteStart <= byteEnd, byteEnd <= utf8.count else {
        print("Page \(boundary.page_number()): invalid byte range \(byteStart)-\(byteEnd)")
        continue
    }

    // Lossy decoding keeps a usable preview even if a range splits a multi-byte
    // scalar (invalid bytes become U+FFFD) rather than yielding an empty string.
    let pageText = String(decoding: utf8[byteStart..<byteEnd], as: UTF8.self)

    // `prefix` already clamps to the string's length; no explicit `min` needed.
    let preview = String(pageText.prefix(maxPreviewCharacters))

    print("Page \(boundary.page_number()):")
    print(" Byte range: \(byteStart)-\(byteEnd)")
    print(" Preview: \(preview)...")
}
|
||||
```
|
||||
28
docs/snippets/swift/metadata/page_tracking_basic.md
Normal file
28
docs/snippets/swift/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"pages": {
|
||||
"extract_pages": true
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
// Build the extraction configuration from the JSON above and run a synchronous extraction.
let config = try extractionConfigFromJson(configJson)
let result = try extractFileSync("document.pdf", nil, config)

// `pages()` is optional; report explicitly when no per-page results came back.
if let pageList = result.pages() {
    for page in pageList {
        // Gather the per-page figures first so the report lines stay simple.
        let characterCount = page.content().toString().count
        let tableCount = page.tables().count
        let imageCount = page.images().count

        print("Page \(page.page_number()):")
        print(" Content: \(characterCount) chars")
        print(" Tables: \(tableCount)")
        print(" Images: \(imageCount)")
    }
} else {
    print("No per-page content available")
}
|
||||
```
|
||||
20
docs/snippets/swift/metadata/tables.md
Normal file
20
docs/snippets/swift/metadata/tables.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
// Extract using a configuration parsed from an empty JSON object.
let config = try extractionConfigFromJson("{}")
let result = try extractFileSync("document.pdf", nil, config)

let extractedTables = result.tables()
print("Tables: \(extractedTables.count)")

for (position, table) in extractedTables.enumerated() {
    let markdown = table.markdown().toString()

    print("Table \(position) on page \(table.page_number())")
    print(markdown)

    // The bounding box is optional and is only reported when present.
    if let box = table.bounding_box() {
        print(" Bounding box: \(box.toString())")
    }
}
|
||||
```
|
||||
55
docs/snippets/swift/metadata/vector_database_integration.md
Normal file
55
docs/snippets/swift/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,55 @@
|
||||
```swift title="Swift"
|
||||
import Foundation
|
||||
import Kreuzberg
|
||||
import RustBridge
|
||||
|
||||
/// A single entry prepared for insertion into a vector database.
struct VectorRecord {
    /// Identifier of the record.
    let id: String

    /// Text the embedding belongs to.
    let content: String

    /// Embedding vector for `content`.
    let embedding: [Float]

    /// String key/value pairs stored alongside the vector.
    let metadata: [String: String]
}
|
||||
|
||||
let configJson = """
|
||||
{
|
||||
"chunking": {
|
||||
"max_characters": 512,
|
||||
"overlap": 50,
|
||||
"embedding": {
|
||||
"model": {"preset": {"name": "balanced"}},
|
||||
"batch_size": 32,
|
||||
"normalize": true
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
// Identifier used as the prefix of every record id and stored in each record's metadata.
let documentId = "doc_001"

let config = try extractionConfigFromJson(configJson)
let result = try extractFileSync("document.pdf", nil, config)

var records: [VectorRecord] = []

// `chunks()` is optional; `records` stays empty when it returns nil.
if let chunkList = result.chunks() {
    for (position, chunk) in chunkList.enumerated() {
        // A chunk without an embedding is skipped, but it still consumes its
        // position, so record ids keep matching the chunk's index in the list.
        guard let rawEmbedding = chunk.embedding() else { continue }

        let text = chunk.content().toString()

        let record = VectorRecord(
            id: "\(documentId)_chunk_\(position)",
            content: text,
            // Copy the bridged vector into a plain Swift array.
            embedding: rawEmbedding.map { $0 },
            metadata: [
                "document_id": documentId,
                "chunk_index": String(position),
                "content_length": String(text.count),
            ]
        )
        records.append(record)
    }
}

print("Generated \(records.count) vector records")
|
||||
```
|
||||
Reference in New Issue
Block a user