This commit is contained in:
63
docs/snippets/zig/advanced/chunk_page_mapping.md
Normal file
63
docs/snippets/zig/advanced/chunk_page_mapping.md
Normal file
@@ -0,0 +1,63 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with chunking and page extraction configured, then
/// prints a short preview of every chunk together with the page (or page
/// range) recorded in that chunk's `metadata`.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 500,
        \\ "overlap": 50
        \\ },
        \\ "pages": {
        \\ "extract_pages": true,
        \\ "insert_page_markers": false,
        \\ "marker_format": ""
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const root = parsed.value;
    if (root != .object) return;

    const stdout = std.io.getStdOut().writer();

    const chunks_val = root.object.get("chunks") orelse return;
    if (chunks_val != .array) return;

    for (chunks_val.array.items) |chunk| {
        if (chunk != .object) continue;

        // Chunks without a well-formed page range are skipped silently.
        const metadata_val = chunk.object.get("metadata") orelse continue;
        if (metadata_val != .object) continue;

        const first_page_val = metadata_val.object.get("first_page") orelse continue;
        const last_page_val = metadata_val.object.get("last_page") orelse continue;
        if (first_page_val != .integer or last_page_val != .integer) continue;

        const first = first_page_val.integer;
        const last = last_page_val.integer;

        const content_val = chunk.object.get("content") orelse continue;
        if (content_val != .string) continue;

        // Preview is capped at 50 bytes of the chunk text.
        const preview_len = @min(50, content_val.string.len);
        const preview = content_val.string[0..preview_len];

        if (first == last) {
            try stdout.print("Chunk: {s}... (Page {d})\n", .{ preview, first });
        } else {
            try stdout.print("Chunk: {s}... (Pages {d}-{d})\n", .{ preview, first, last });
        }
    }
}
|
||||
```
|
||||
54
docs/snippets/zig/advanced/chunking_config.md
Normal file
54
docs/snippets/zig/advanced/chunking_config.md
Normal file
@@ -0,0 +1,54 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with the `markdown` chunker type configured and
/// parses the JSON result into a `std.json.Value` tree.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1000,
        \\ "overlap": 200,
        \\ "chunker_type": "markdown",
        \\ "prepend_heading_context": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();
}
|
||||
```
|
||||
|
||||
```zig title="Zig - Semantic"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with the `semantic` chunker type configured and
/// parses the JSON result into a `std.json.Value` tree.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1000,
        \\ "overlap": 200,
        \\ "chunker_type": "semantic"
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();
}
|
||||
```
|
||||
54
docs/snippets/zig/advanced/chunking_rag.md
Normal file
54
docs/snippets/zig/advanced/chunking_rag.md
Normal file
@@ -0,0 +1,54 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `research_paper.pdf` with chunking and embeddings configured, then
/// prints a preview of each chunk and, when present, the length of its
/// `embedding` array.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 500,
        \\ "overlap": 50,
        \\ "embedding": {
        \\ "model": {"type": "preset", "name": "balanced"},
        \\ "normalize": true
        \\ }
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("research_paper.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const root = parsed.value;
    if (root != .object) return;

    const stdout = std.io.getStdOut().writer();

    const chunks_val = root.object.get("chunks") orelse return;
    if (chunks_val != .array) return;

    for (chunks_val.array.items, 0..) |chunk, index| {
        if (chunk != .object) continue;

        // Content and embedding are reported independently: a chunk may have
        // either, both, or neither.
        if (chunk.object.get("content")) |content_val| {
            if (content_val == .string) {
                // Preview is capped at 100 bytes of the chunk text.
                const preview_len = @min(100, content_val.string.len);
                try stdout.print("Chunk {d}: {s}...\n", .{ index, content_val.string[0..preview_len] });
            }
        }

        if (chunk.object.get("embedding")) |embedding_val| {
            if (embedding_val == .array) {
                try stdout.print(" Embedding: {d} dimensions\n", .{embedding_val.array.items.len});
            }
        }
    }
}
|
||||
```
|
||||
31
docs/snippets/zig/advanced/embedding_with_chunking.md
Normal file
31
docs/snippets/zig/advanced/embedding_with_chunking.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with chunking plus an embedding configuration and
/// parses the JSON result into a `std.json.Value` tree.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1024,
        \\ "overlap": 100,
        \\ "embedding": {
        \\ "model": {"type": "preset", "name": "balanced"},
        \\ "normalize": true,
        \\ "batch_size": 32,
        \\ "show_download_progress": false
        \\ }
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();
}
|
||||
```
|
||||
28
docs/snippets/zig/advanced/keyword_extraction_config.md
Normal file
28
docs/snippets/zig/advanced/keyword_extraction_config.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with a `keywords` configuration (YAKE algorithm)
/// and parses the JSON result into a `std.json.Value` tree.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "keywords": {
        \\ "algorithm": "yake",
        \\ "max_keywords": 10,
        \\ "min_score": 0.3,
        \\ "ngram_range": [1, 3],
        \\ "language": "en"
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();
}
|
||||
```
|
||||
51
docs/snippets/zig/advanced/keyword_extraction_example.md
Normal file
51
docs/snippets/zig/advanced/keyword_extraction_example.md
Normal file
@@ -0,0 +1,51 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `research_paper.pdf` with keyword extraction configured and prints
/// each entry of `extracted_keywords` as `text: score`.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "keywords": {
        \\ "algorithm": "yake",
        \\ "max_keywords": 10,
        \\ "min_score": 0.3,
        \\ "ngram_range": [1, 3]
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("research_paper.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const root = parsed.value;
    if (root != .object) return;

    const stdout = std.io.getStdOut().writer();

    const keywords_val = root.object.get("extracted_keywords") orelse return;
    if (keywords_val != .array) return;

    for (keywords_val.array.items) |keyword| {
        if (keyword != .object) continue;

        const text_val = keyword.object.get("text") orelse continue;
        const score_val = keyword.object.get("score") orelse continue;
        if (text_val != .string) continue;

        // The JSON parser may yield either a float or an integer for a
        // numeric score; anything else causes the entry to be skipped.
        const score: f64 = switch (score_val) {
            .float => |f| f,
            .integer => |i| @floatFromInt(i),
            else => continue,
        };

        try stdout.print("{s}: {d:.4}\n", .{ text_val.string, score });
    }
}
|
||||
```
|
||||
26
docs/snippets/zig/advanced/language_detection_config.md
Normal file
26
docs/snippets/zig/advanced/language_detection_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with single-language detection configured and
/// parses the JSON result into a `std.json.Value` tree.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "language_detection": {
        \\ "enabled": true,
        \\ "min_confidence": 0.8,
        \\ "detect_multiple": false
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();
}
|
||||
```
|
||||
@@ -0,0 +1,43 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `multilingual_document.pdf` with multi-language detection
/// configured and prints every string entry of `detected_languages` on a
/// single line.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "language_detection": {
        \\ "enabled": true,
        \\ "min_confidence": 0.8,
        \\ "detect_multiple": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("multilingual_document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const root = parsed.value;
    if (root != .object) return;

    const stdout = std.io.getStdOut().writer();

    // The heading and the trailing newline are always printed, even when the
    // key is missing or is not an array.
    try stdout.print("Detected languages:", .{});
    if (root.object.get("detected_languages")) |languages_val| {
        if (languages_val == .array) {
            for (languages_val.array.items) |lang_val| {
                if (lang_val == .string) {
                    try stdout.print(" {s}", .{lang_val.string});
                }
            }
        }
    }
    try stdout.print("\n", .{});
}
|
||||
```
|
||||
22
docs/snippets/zig/advanced/quality_processing_config.md
Normal file
22
docs/snippets/zig/advanced/quality_processing_config.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with `enable_quality_processing` switched on and
/// parses the JSON result into a `std.json.Value` tree.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "enable_quality_processing": true
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();
}
|
||||
```
|
||||
41
docs/snippets/zig/advanced/quality_processing_example.md
Normal file
41
docs/snippets/zig/advanced/quality_processing_example.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `scanned_document.pdf` with quality processing enabled and prints
/// the `quality_score` field, flagging scores below 0.5 with a warning.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "enable_quality_processing": true
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("scanned_document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const root = parsed.value;
    if (root != .object) return;

    const stdout = std.io.getStdOut().writer();

    // Nothing is printed when the score is absent or not numeric.
    const score_val = root.object.get("quality_score") orelse return;
    const score: f64 = switch (score_val) {
        .float => |f| f,
        .integer => |i| @floatFromInt(i),
        else => return,
    };

    if (score < 0.5) {
        try stdout.print("Warning: Low quality extraction ({d:.2})\n", .{score});
    } else {
        try stdout.print("Quality score: {d:.2}\n", .{score});
    }
}
|
||||
```
|
||||
25
docs/snippets/zig/advanced/token_reduction_config.md
Normal file
25
docs/snippets/zig/advanced/token_reduction_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with a `token_reduction` configuration and parses
/// the JSON result into a `std.json.Value` tree.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "token_reduction": {
        \\ "mode": "moderate",
        \\ "preserve_important_words": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();
}
|
||||
```
|
||||
42
docs/snippets/zig/advanced/token_reduction_example.md
Normal file
42
docs/snippets/zig/advanced/token_reduction_example.md
Normal file
@@ -0,0 +1,42 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `verbose_document.pdf` with token reduction configured and prints
/// the `original_token_count` and `reduced_token_count` fields when they are
/// present as integers.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "token_reduction": {
        \\ "mode": "moderate",
        \\ "preserve_important_words": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("verbose_document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const root = parsed.value;
    if (root != .object) return;

    const stdout = std.io.getStdOut().writer();

    // Each counter is reported independently; a missing or non-integer field
    // is simply not printed.
    if (root.object.get("original_token_count")) |val| {
        if (val == .integer) {
            try stdout.print("Original tokens: {d}\n", .{val.integer});
        }
    }

    if (root.object.get("reduced_token_count")) |val| {
        if (val == .integer) {
            try stdout.print("Reduced tokens: {d}\n", .{val.integer});
        }
    }
}
|
||||
```
|
||||
60
docs/snippets/zig/advanced/vector_database_integration.md
Normal file
60
docs/snippets/zig/advanced/vector_database_integration.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with chunking and embeddings configured, then for
/// every chunk that carries both an `embedding` array and string `content`
/// builds a record id of the form `<document_id>_chunk_<index>` and prints a
/// one-line summary of the record.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const document_id = "doc_001";

    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 512,
        \\ "overlap": 50,
        \\ "embedding": {
        \\ "model": {"type": "preset", "name": "balanced"},
        \\ "normalize": true,
        \\ "batch_size": 32
        \\ }
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(result_json);

    // Declared `const`: nothing mutates the handle (`deinit` receives it by
    // value), and Zig 0.12+ rejects a `var` that is never mutated.
    const parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const root = parsed.value;
    if (root != .object) return;

    const stdout = std.io.getStdOut().writer();

    const chunks_val = root.object.get("chunks") orelse return;
    if (chunks_val != .array) return;

    for (chunks_val.array.items, 0..) |chunk, index| {
        if (chunk != .object) continue;

        const embedding_val = chunk.object.get("embedding") orelse continue;
        if (embedding_val != .array) continue;

        const content_val = chunk.object.get("content") orelse continue;
        if (content_val != .string) continue;

        // `index` is the chunk's position in the full array, so ids keep
        // their gaps when some chunks are skipped above.
        const record_id = try std.fmt.allocPrint(allocator, "{s}_chunk_{d}", .{ document_id, index });
        // Runs at the end of each loop iteration, so ids do not accumulate.
        defer allocator.free(record_id);

        try stdout.print("id={s} dims={d} content_length={d}\n", .{
            record_id,
            embedding_val.array.items.len,
            content_val.string.len,
        });
        // Persist record_id, content_val.string, and embedding_val.array.items in a vector database.
    }
}
|
||||
```
|
||||
22
docs/snippets/zig/api/batch_extract_bytes_sync.md
Normal file
22
docs/snippets/zig/api/batch_extract_bytes_sync.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Submits two in-memory documents to `kreuzberg.batch_extract_bytes_sync`
/// and writes the returned JSON to stdout.
pub fn main() !void {
    const out = std.io.getStdOut().writer();

    // The batch is a JSON array; per the binding's FFI schema for
    // BatchBytesItem, each `content` value is the document's bytes in base64.
    const items_json =
        \\[
        \\ {"content": "SGVsbG8sIHdvcmxkIQ==", "mime_type": "text/plain", "config": null},
        \\ {"content": "IyBIZWFkaW5nCgpQYXJhZ3JhcGggdGV4dC4=", "mime_type": "text/markdown", "config": null}
        \\]
    ;

    const batch_output = try kreuzberg.batch_extract_bytes_sync(items_json, "{}");
    // The result buffer is released through the C allocator.
    defer std.heap.c_allocator.free(batch_output);

    try out.print("{s}\n", .{batch_output});
}
|
||||
```
|
||||
22
docs/snippets/zig/api/batch_extract_files_sync.md
Normal file
22
docs/snippets/zig/api/batch_extract_files_sync.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Submits three file paths to `kreuzberg.batch_extract_files_sync` and writes
/// the returned JSON to stdout.
pub fn main() !void {
    const out = std.io.getStdOut().writer();

    // The batch crosses the FFI boundary as a JSON-encoded array of items.
    const items_json =
        \\[
        \\ {"path": "doc1.pdf", "config": null},
        \\ {"path": "doc2.docx", "config": null},
        \\ {"path": "report.pdf", "config": null}
        \\]
    ;

    const batch_output = try kreuzberg.batch_extract_files_sync(items_json, "{}");
    // The result buffer is released through the C allocator.
    defer std.heap.c_allocator.free(batch_output);

    try out.print("{s}\n", .{batch_output});
}
|
||||
```
|
||||
53
docs/snippets/zig/api/client_chunk_text.md
Normal file
53
docs/snippets/zig/api/client_chunk_text.md
Normal file
@@ -0,0 +1,53 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
|
||||
/// Uploads `document.pdf` to `http://localhost:8000/extract` as a
/// `multipart/form-data` POST with an extra `chunking` form field, then
/// prints the response body.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const max_upload_bytes = 64 * 1024 * 1024;
    const max_response_bytes = 16 * 1024 * 1024;

    const file_bytes = try std.fs.cwd().readFileAlloc(allocator, "document.pdf", max_upload_bytes);
    defer allocator.free(file_bytes);

    const boundary = "----kreuzberg-zig-boundary";
    const content_type = "multipart/form-data; boundary=" ++ boundary;

    // Assemble the whole multipart payload in memory so its exact length can
    // be declared up front.
    var payload = std.ArrayList(u8).init(allocator);
    defer payload.deinit();
    const part_writer = payload.writer();

    // Part 1: the file itself.
    try part_writer.print(
        "--{s}\r\nContent-Disposition: form-data; name=\"file\"; filename=\"document.pdf\"\r\n" ++
            "Content-Type: application/pdf\r\n\r\n",
        .{boundary},
    );
    try part_writer.writeAll(file_bytes);

    // Part 2: the chunking options as a JSON form field, followed by the
    // closing boundary. `{{`/`}}` are escaped braces in the format string.
    try part_writer.print(
        "\r\n--{s}\r\nContent-Disposition: form-data; name=\"chunking\"\r\n\r\n" ++
            "{{\"max_characters\":800,\"overlap\":100}}\r\n--{s}--\r\n",
        .{ boundary, boundary },
    );

    var client = std.http.Client{ .allocator = allocator };
    defer client.deinit();

    const uri = try std.Uri.parse("http://localhost:8000/extract");
    var server_headers: [4096]u8 = undefined;
    var request = try client.open(.POST, uri, .{
        .server_header_buffer = &server_headers,
        .extra_headers = &.{
            .{ .name = "content-type", .value = content_type },
        },
    });
    defer request.deinit();

    request.transfer_encoding = .{ .content_length = payload.items.len };
    try request.send();
    try request.writeAll(payload.items);
    try request.finish();
    try request.wait();

    const response_body = try request.reader().readAllAlloc(allocator, max_response_bytes);
    defer allocator.free(response_body);

    try std.io.getStdOut().writer().print("{s}\n", .{response_body});
}
|
||||
```
|
||||
49
docs/snippets/zig/api/client_extract_single_file.md
Normal file
49
docs/snippets/zig/api/client_extract_single_file.md
Normal file
@@ -0,0 +1,49 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
|
||||
/// Uploads `document.pdf` to `http://localhost:8000/extract` as a single-part
/// `multipart/form-data` POST and prints the response body.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const max_upload_bytes = 64 * 1024 * 1024;
    const max_response_bytes = 16 * 1024 * 1024;

    const file_bytes = try std.fs.cwd().readFileAlloc(allocator, "document.pdf", max_upload_bytes);
    defer allocator.free(file_bytes);

    const boundary = "----kreuzberg-zig-boundary";
    const content_type = "multipart/form-data; boundary=" ++ boundary;

    // Assemble the whole multipart payload in memory so its exact length can
    // be declared up front.
    var payload = std.ArrayList(u8).init(allocator);
    defer payload.deinit();
    const part_writer = payload.writer();

    try part_writer.print(
        "--{s}\r\nContent-Disposition: form-data; name=\"file\"; filename=\"document.pdf\"\r\n" ++
            "Content-Type: application/pdf\r\n\r\n",
        .{boundary},
    );
    try part_writer.writeAll(file_bytes);
    // Closing boundary terminates the multipart body.
    try part_writer.print("\r\n--{s}--\r\n", .{boundary});

    var client = std.http.Client{ .allocator = allocator };
    defer client.deinit();

    const uri = try std.Uri.parse("http://localhost:8000/extract");
    var server_headers: [4096]u8 = undefined;
    var request = try client.open(.POST, uri, .{
        .server_header_buffer = &server_headers,
        .extra_headers = &.{
            .{ .name = "content-type", .value = content_type },
        },
    });
    defer request.deinit();

    request.transfer_encoding = .{ .content_length = payload.items.len };
    try request.send();
    try request.writeAll(payload.items);
    try request.finish();
    try request.wait();

    const response_body = try request.reader().readAllAlloc(allocator, max_response_bytes);
    defer allocator.free(response_body);

    try std.io.getStdOut().writer().print("{s}\n", .{response_body});
}
|
||||
```
|
||||
37
docs/snippets/zig/api/combining_all_features.md
Normal file
37
docs/snippets/zig/api/combining_all_features.md
Normal file
@@ -0,0 +1,37 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `report.pdf` with one configuration document that sets caching,
/// quality processing, OCR, chunking, image extraction and output format, and
/// prints the size and content of the JSON result.
pub fn main() !void {
    const out = std.io.getStdOut().writer();

    // The binding receives its configuration as a single JSON document.
    const config_json =
        \\{
        \\ "use_cache": true,
        \\ "enable_quality_processing": true,
        \\ "force_ocr": false,
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng"
        \\ },
        \\ "chunking": {
        \\ "max_characters": 800,
        \\ "overlap": 100,
        \\ "chunker_type": "markdown",
        \\ "prepend_heading_context": true
        \\ },
        \\ "images": {
        \\ "extract_images": true
        \\ },
        \\ "output_format": "markdown",
        \\ "include_document_structure": true
        \\}
    ;

    const extraction = try kreuzberg.extract_file_sync("report.pdf", null, config_json);
    // The result buffer is released through the C allocator.
    defer std.heap.c_allocator.free(extraction);

    try out.print("Result ({d} bytes of JSON):\n{s}\n", .{ extraction.len, extraction });
}
|
||||
```
|
||||
28
docs/snippets/zig/api/error_handling.md
Normal file
28
docs/snippets/zig/api/error_handling.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` and prints the JSON result; on failure, writes a
/// message for the error to stderr (plus the binding's last-error context
/// when one is available) and exits normally.
pub fn main() !void {
    const extraction = kreuzberg.extract_file_sync("document.pdf", null, "{}") catch |err| {
        const err_out = std.io.getStdErr().writer();

        // Known error names get a fixed message; anything else falls back to
        // the error's name.
        switch (err) {
            error.Io => try err_out.writeAll("File error\n"),
            error.UnsupportedFormat => try err_out.writeAll("Unsupported format\n"),
            error.Parsing => try err_out.writeAll("Corrupt or invalid document\n"),
            error.MissingDependency => try err_out.writeAll("Missing dependency — install required backend\n"),
            error.Ocr => try err_out.writeAll("OCR processing failed\n"),
            error.OutOfMemory => try err_out.writeAll("Out of memory\n"),
            else => try err_out.print("Extraction failed: {s}\n", .{@errorName(err)}),
        }

        if (kreuzberg._last_error()) |context| {
            try err_out.print(" context: {s}\n", .{context});
        }
        return;
    };
    // The result buffer is released through the C allocator.
    defer std.heap.c_allocator.free(extraction);

    try std.io.getStdOut().writer().print("{s}\n", .{extraction});
}
|
||||
```
|
||||
33
docs/snippets/zig/api/error_handling_extract.md
Normal file
33
docs/snippets/zig/api/error_handling_extract.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Runs a synchronous in-memory extraction of `bytes` (declared as
/// `mime_type`) with an empty `{}` configuration and forwards the result or
/// error of `kreuzberg.extract_bytes_sync` unchanged.
fn extract_text(bytes: []const u8, mime_type: []const u8) ![]u8 {
    return kreuzberg.extract_bytes_sync(bytes, mime_type, "{}");
}
|
||||
|
||||
/// Reads `document.pdf`, passes its bytes to `extract_text`, and prints the
/// size of the JSON result; extraction errors are reported on stderr and the
/// program exits normally.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // A failed read falls back to an empty slice, so extraction is still
    // attempted and any resulting error is reported below.
    const input = std.fs.cwd().readFileAlloc(allocator, "document.pdf", 64 * 1024 * 1024) catch &[_]u8{};
    // The empty fallback is a literal, not an allocation, so only free
    // non-empty input.
    defer if (input.len > 0) allocator.free(input);

    const extraction = extract_text(input, "application/pdf") catch |err| {
        const err_out = std.io.getStdErr().writer();
        switch (err) {
            error.UnsupportedFormat => try err_out.writeAll("Format not supported\n"),
            error.Ocr => try err_out.writeAll("OCR failed\n"),
            error.Validation => try err_out.writeAll("Invalid input or configuration\n"),
            else => try err_out.print("Error: {s}\n", .{@errorName(err)}),
        }
        return;
    };
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(extraction);

    try std.io.getStdOut().writer().print("Extracted {d} bytes of JSON\n", .{extraction.len});
}
|
||||
```
|
||||
25
docs/snippets/zig/api/extract_bytes_async.md
Normal file
25
docs/snippets/zig/api/extract_bytes_async.md
Normal file
@@ -0,0 +1,25 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// Note: the Zig binding is sync-only. There is no `extract_bytes` async variant —
|
||||
// the FFI surface exposes blocking entry points that internally drive the global
|
||||
// Tokio runtime. Use `extract_bytes_sync` from any thread.
|
||||
/// Reads `document.pdf` into memory, extracts it through the blocking
/// `kreuzberg.extract_bytes_sync` entry point, and prints the JSON result.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const max_input_bytes = 64 * 1024 * 1024;
    const pdf_bytes = try std.fs.cwd().readFileAlloc(allocator, "document.pdf", max_input_bytes);
    defer allocator.free(pdf_bytes);

    const extraction = try kreuzberg.extract_bytes_sync(pdf_bytes, "application/pdf", "{}");
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(extraction);

    try std.io.getStdOut().writer().print("{s}\n", .{extraction});
}
|
||||
```
|
||||
20
docs/snippets/zig/api/extract_bytes_sync.md
Normal file
20
docs/snippets/zig/api/extract_bytes_sync.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Reads `document.pdf` into memory, extracts it with
/// `kreuzberg.extract_bytes_sync` using an empty configuration, and prints
/// the JSON result.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const max_input_bytes = 64 * 1024 * 1024;
    const pdf_bytes = try std.fs.cwd().readFileAlloc(allocator, "document.pdf", max_input_bytes);
    defer allocator.free(pdf_bytes);

    const extraction = try kreuzberg.extract_bytes_sync(pdf_bytes, "application/pdf", "{}");
    // The result buffer is released through the C allocator, not `allocator`.
    defer std.heap.c_allocator.free(extraction);

    try std.io.getStdOut().writer().print("{s}\n", .{extraction});
}
|
||||
```
|
||||
18
docs/snippets/zig/api/extract_file_async.md
Normal file
18
docs/snippets/zig/api/extract_file_async.md
Normal file
@@ -0,0 +1,18 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// Note: the Zig binding is sync-only. There is no `extract_file` async variant —
|
||||
// the FFI surface exposes blocking entry points that internally drive the global
|
||||
// Tokio runtime. Use `extract_file_sync` from any thread.
|
||||
/// Extracts `document.pdf` through the blocking `kreuzberg.extract_file_sync`
/// entry point with an empty configuration and prints the JSON result.
pub fn main() !void {
    const extraction = try kreuzberg.extract_file_sync("document.pdf", null, "{}");
    // The result buffer is released through the C allocator.
    defer std.heap.c_allocator.free(extraction);

    try std.io.getStdOut().writer().print("{s}\n", .{extraction});
}
|
||||
```
|
||||
20
docs/snippets/zig/api/extract_file_sync.md
Normal file
20
docs/snippets/zig/api/extract_file_sync.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with an empty config and prints the JSON result
/// to stdout.
pub fn main() !void {
    const config_json = "{}";

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
58
docs/snippets/zig/config/advanced_config.md
Normal file
58
docs/snippets/zig/config/advanced_config.md
Normal file
@@ -0,0 +1,58 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with a config that sets caching, quality
/// processing, OCR, chunking with embeddings, language detection, keywords,
/// token reduction and post-processing, then prints the JSON result.
pub fn main() !void {
    const config_json =
        \\{
        \\ "use_cache": true,
        \\ "enable_quality_processing": true,
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng"
        \\ },
        \\ "chunking": {
        \\ "max_characters": 1000,
        \\ "overlap": 200,
        \\ "embedding": {
        \\ "model": {"type": "preset", "name": "balanced"},
        \\ "batch_size": 32,
        \\ "normalize": true
        \\ }
        \\ },
        \\ "language_detection": {
        \\ "enabled": true,
        \\ "min_confidence": 0.8,
        \\ "detect_multiple": false
        \\ },
        \\ "keywords": {
        \\ "algorithm": "yake",
        \\ "max_keywords": 10,
        \\ "min_score": 0.1,
        \\ "ngram_range": [1, 3],
        \\ "language": "en"
        \\ },
        \\ "token_reduction": {
        \\ "mode": "moderate",
        \\ "preserve_important_words": true
        \\ },
        \\ "postprocessor": {
        \\ "enabled": true
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
30
docs/snippets/zig/config/chunking_config.md
Normal file
30
docs/snippets/zig/config/chunking_config.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with a markdown chunker config (size, overlap,
/// heading context) and prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1000,
        \\ "overlap": 200,
        \\ "chunker_type": "markdown",
        \\ "prepend_heading_context": true
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
26
docs/snippets/zig/config/config_basic.md
Normal file
26
docs/snippets/zig/config/config_basic.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with `use_cache` and `enable_quality_processing`
/// switched on and prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "use_cache": true,
        \\ "enable_quality_processing": true
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
25
docs/snippets/zig/config/config_discover.md
Normal file
25
docs/snippets/zig/config/config_discover.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Loads `kreuzberg.json` from the current directory (falling back to an
/// empty `{}` config when the file does not exist), extracts `document.pdf`
/// with it, and prints the JSON result to stdout.
pub fn main() !void {
    var debug_allocator = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = debug_allocator.deinit();
    const gpa = debug_allocator.allocator();

    // The Zig binding takes its config as a JSON string, so a config file is
    // simply read from disk and forwarded unchanged.
    const config_json = std.fs.cwd().readFileAlloc(gpa, "kreuzberg.json", 1 << 20) catch |err| fallback: {
        // Only a missing file is tolerated; every other failure propagates.
        if (err != error.FileNotFound) return err;
        // Duplicate the default so the deferred free below is valid on both paths.
        break :fallback try gpa.dupe(u8, "{}");
    };
    defer gpa.free(config_json);

    // The result slice is released through the C allocator, not `gpa`.
    const json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(json);

    try std.io.getStdOut().writer().print("{s}\n", .{json});
}
|
||||
```
|
||||
28
docs/snippets/zig/config/config_ocr.md
Normal file
28
docs/snippets/zig/config/config_ocr.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `scanned.pdf` with the `tesseract` OCR backend configured for
/// `eng` and prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng"
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("scanned.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
39
docs/snippets/zig/config/config_programmatic.md
Normal file
39
docs/snippets/zig/config/config_programmatic.md
Normal file
@@ -0,0 +1,39 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Builds the extraction config as a `std.json.Value` tree, serializes it to
/// a JSON string, extracts `document.pdf` with it, and prints the result.
pub fn main() !void {
    var debug_allocator = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = debug_allocator.deinit();

    // Every node of the config tree and the serialized buffer live in one
    // arena, so a single deinit releases all of them.
    var arena_state = std.heap.ArenaAllocator.init(debug_allocator.allocator());
    defer arena_state.deinit();
    const arena = arena_state.allocator();

    // Nested sections are assembled first ...
    var ocr_section = std.json.ObjectMap.init(arena);
    try ocr_section.put("backend", .{ .string = "tesseract" });
    try ocr_section.put("language", .{ .string = "eng+deu" });

    var chunking_section = std.json.ObjectMap.init(arena);
    try chunking_section.put("max_characters", .{ .integer = 1000 });
    try chunking_section.put("overlap", .{ .integer = 200 });

    // ... then attached to the top-level object in the same key order as before.
    var config = std.json.ObjectMap.init(arena);
    try config.put("use_cache", .{ .bool = true });
    try config.put("enable_quality_processing", .{ .bool = true });
    try config.put("ocr", .{ .object = ocr_section });
    try config.put("chunking", .{ .object = chunking_section });

    var serialized = std.ArrayList(u8).init(arena);
    try std.json.stringify(std.json.Value{ .object = config }, .{}, serialized.writer());

    // The result slice is released through the C allocator, not the arena.
    const json = try kreuzberg.extract_file_sync("document.pdf", null, serialized.items);
    defer std.heap.c_allocator.free(json);

    try std.io.getStdOut().writer().print("{s}\n", .{json});
}
|
||||
```
|
||||
25
docs/snippets/zig/config/document_structure_config.md
Normal file
25
docs/snippets/zig/config/document_structure_config.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with `include_document_structure` enabled and
/// prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "include_document_structure": true
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
25
docs/snippets/zig/config/element_based_output.md
Normal file
25
docs/snippets/zig/config/element_based_output.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with `result_format` set to `element_based` and
/// prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "result_format": "element_based"
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
34
docs/snippets/zig/config/embedding_config.md
Normal file
34
docs/snippets/zig/config/embedding_config.md
Normal file
@@ -0,0 +1,34 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with chunking plus an embedding section (preset
/// model, batch size, normalization, download progress) and prints the JSON
/// result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1000,
        \\ "overlap": 200,
        \\ "embedding": {
        \\ "model": {"type": "preset", "name": "balanced"},
        \\ "batch_size": 16,
        \\ "normalize": true,
        \\ "show_download_progress": true
        \\ }
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
29
docs/snippets/zig/config/html_output.md
Normal file
29
docs/snippets/zig/config/html_output.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with `output_format` set to `html` (theme
/// `github`, embedded CSS) and prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "output_format": "html",
        \\ "html_output": {
        \\ "theme": "github",
        \\ "embed_css": true
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
31
docs/snippets/zig/config/keyword_extraction_config.md
Normal file
31
docs/snippets/zig/config/keyword_extraction_config.md
Normal file
@@ -0,0 +1,31 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with a `keywords` section (algorithm `yake`,
/// limits, n-gram range, language) and prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "keywords": {
        \\ "algorithm": "yake",
        \\ "max_keywords": 10,
        \\ "min_score": 0.1,
        \\ "ngram_range": [1, 3],
        \\ "language": "en"
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
29
docs/snippets/zig/config/language_detection_config.md
Normal file
29
docs/snippets/zig/config/language_detection_config.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with a `language_detection` section (enabled,
/// minimum confidence 0.8, multiple languages) and prints the JSON result.
pub fn main() !void {
    const config_json =
        \\{
        \\ "language_detection": {
        \\ "enabled": true,
        \\ "min_confidence": 0.8,
        \\ "detect_multiple": true
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
32
docs/snippets/zig/config/ocr_dpi_config.md
Normal file
32
docs/snippets/zig/config/ocr_dpi_config.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with an `images` section (image extraction,
/// target/min/max DPI, maximum dimension, automatic DPI adjustment) and
/// prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "images": {
        \\ "extract_images": true,
        \\ "target_dpi": 300,
        \\ "max_image_dimension": 4096,
        \\ "auto_adjust_dpi": true,
        \\ "min_dpi": 150,
        \\ "max_dpi": 600
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
30
docs/snippets/zig/config/pdf_config.md
Normal file
30
docs/snippets/zig/config/pdf_config.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `encrypted.pdf` with a `pdf_options` section (images, password
/// list, metadata, annotations) and prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "pdf_options": {
        \\ "extract_images": true,
        \\ "passwords": ["password123"],
        \\ "extract_metadata": true,
        \\ "extract_annotations": true
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("encrypted.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
32
docs/snippets/zig/config/pdf_hierarchy_config.md
Normal file
32
docs/snippets/zig/config/pdf_hierarchy_config.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with a `pdf_options.hierarchy` section (enabled,
/// cluster count, bounding boxes, OCR coverage threshold) and prints the
/// JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "pdf_options": {
        \\ "hierarchy": {
        \\ "enabled": true,
        \\ "k_clusters": 6,
        \\ "include_bbox": true,
        \\ "ocr_coverage_threshold": 0.5
        \\ }
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
28
docs/snippets/zig/config/postprocessor_config.md
Normal file
28
docs/snippets/zig/config/postprocessor_config.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with a `postprocessor` section that enables the
/// listed processors and prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "postprocessor": {
        \\ "enabled": true,
        \\ "enabled_processors": ["whitespace_normalizer", "unicode_normalizer"]
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
26
docs/snippets/zig/config/quality_processing_config.md
Normal file
26
docs/snippets/zig/config/quality_processing_config.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with `enable_quality_processing` and `use_cache`
/// switched on and prints the JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "enable_quality_processing": true,
        \\ "use_cache": true
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
33
docs/snippets/zig/config/tesseract_config.md
Normal file
33
docs/snippets/zig/config/tesseract_config.md
Normal file
@@ -0,0 +1,33 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `scanned.pdf` with the `tesseract` OCR backend for `eng+deu`
/// plus a nested `tesseract_config` (language, `psm`, `oem`) and prints the
/// JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng+deu",
        \\ "tesseract_config": {
        \\ "language": "eng+deu",
        \\ "psm": 6,
        \\ "oem": 3
        \\ }
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("scanned.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
28
docs/snippets/zig/config/token_reduction_config.md
Normal file
28
docs/snippets/zig/config/token_reduction_config.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with a `token_reduction` section (mode
/// `moderate`, preserving important words) and prints the JSON result.
pub fn main() !void {
    const config_json =
        \\{
        \\ "token_reduction": {
        \\ "mode": "moderate",
        \\ "preserve_important_words": true
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Print the slice directly: it stays valid until the deferred free runs,
    // so no intermediate copy (and no extra allocator) is needed.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
18
docs/snippets/zig/getting-started/basic_usage.md
Normal file
18
docs/snippets/zig/getting-started/basic_usage.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with an empty config and prints the JSON result
/// to stdout.
pub fn main() !void {
    // Nothing here allocates on the Zig side, so no allocator is set up.
    const config_json = "{}";

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
25
docs/snippets/zig/getting-started/extract_file.md
Normal file
25
docs/snippets/zig/getting-started/extract_file.md
Normal file
@@ -0,0 +1,25 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf`, parses the JSON result, and prints its `content`
/// and `mime_type` fields.
///
/// Returns `error.UnexpectedResultShape` when the result is not a JSON
/// object. A missing or non-string field prints as an empty string.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json = "{}";
    // The result slice is released through the C allocator, not `allocator`.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    // Reading `.object` from a value with a different active tag is
    // safety-checked illegal behavior, so check the tag before unwrapping.
    const root = switch (parsed.value) {
        .object => |map| map,
        else => return error.UnexpectedResultShape,
    };

    // A field may be absent or carry a non-string value such as `null`;
    // both cases fall back to the empty string instead of panicking.
    const content: []const u8 = if (root.get("content")) |value| switch (value) {
        .string => |text| text,
        else => "",
    } else "";
    const mime_type: []const u8 = if (root.get("mime_type")) |value| switch (value) {
        .string => |text| text,
        else => "",
    } else "";

    const stdout = std.io.getStdOut().writer();
    try stdout.print("Content: {s}\n", .{content});
    try stdout.print("MIME Type: {s}\n", .{mime_type});
}
|
||||
```
|
||||
27
docs/snippets/zig/getting-started/extract_with_ocr.md
Normal file
27
docs/snippets/zig/getting-started/extract_with_ocr.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `scanned.pdf` with `force_ocr` enabled and the `tesseract`
/// backend configured for `eng`, then prints the JSON result to stdout.
pub fn main() !void {
    // Nothing here allocates on the Zig side, so no allocator is set up.
    const config_json =
        \\{
        \\ "force_ocr": true,
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng"
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("scanned.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
20
docs/snippets/zig/getting-started/hello_world.md
Normal file
20
docs/snippets/zig/getting-started/hello_world.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Prints a greeting, extracts `document.pdf` with an empty config, and
/// prints the JSON result to stdout.
pub fn main() !void {
    // Nothing here allocates on the Zig side, so no allocator is set up.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("Hello from kreuzberg-zig\n", .{});

    const config_json = "{}";
    // The result slice is released through the C allocator once main returns.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
19
docs/snippets/zig/getting-started/install_verify.md
Normal file
19
docs/snippets/zig/getting-started/install_verify.md
Normal file
@@ -0,0 +1,19 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Confirms that the `kreuzberg` module imports and reports whether
/// `_last_error()` currently returns an error context.
pub fn main() !void {
    // Nothing here allocates on the Zig side, so no allocator is set up.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("kreuzberg module imported successfully\n", .{});

    // `_last_error()` yields an optional; a non-null value is printed as text.
    if (kreuzberg._last_error()) |context| {
        try stdout.print(" last error context: {s}\n", .{context});
    } else {
        try stdout.print(" no prior FFI errors recorded\n", .{});
    }
}
|
||||
```
|
||||
38
docs/snippets/zig/getting-started/read_content.md
Normal file
38
docs/snippets/zig/getting-started/read_content.md
Normal file
@@ -0,0 +1,38 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf`, parses the JSON result, and prints the
/// `content` field followed by a summary of the `tables` array (one line per
/// table, with its `page_number` when present).
///
/// Returns `error.UnexpectedResultShape` when the result is not a JSON
/// object. Missing or wrongly-typed fields are treated as empty/absent.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json = "{}";
    // The result slice is released through the C allocator, not `allocator`.
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    // Reading `.object` from a value with a different active tag is
    // safety-checked illegal behavior, so check the tag before unwrapping.
    const root = switch (parsed.value) {
        .object => |map| map,
        else => return error.UnexpectedResultShape,
    };

    // An absent or non-string `content` prints as the empty string.
    const content: []const u8 = if (root.get("content")) |value| switch (value) {
        .string => |text| text,
        else => "",
    } else "";

    const stdout = std.io.getStdOut().writer();
    try stdout.print("Extracted content: {s}\n", .{content});

    // An absent or non-array `tables` (for example `null`) counts as no tables.
    const tables: []const std.json.Value = if (root.get("tables")) |value| switch (value) {
        .array => |list| list.items,
        else => &.{},
    } else &.{};
    try stdout.print("Tables found: {d}\n", .{tables.len});

    for (tables, 0..) |table, index| {
        // Only report a page when the entry is an object with an integer `page_number`.
        const page_number: ?i64 = page: {
            if (table != .object) break :page null;
            const value = table.object.get("page_number") orelse break :page null;
            break :page if (value == .integer) value.integer else null;
        };

        if (page_number) |page| {
            try stdout.print(" table {d}: page {d}\n", .{ index, page });
        } else {
            try stdout.print(" table {d}\n", .{index});
        }
    }
}
|
||||
```
|
||||
30
docs/snippets/zig/llm/structured_extraction.md
Normal file
30
docs/snippets/zig/llm/structured_extraction.md
Normal file
@@ -0,0 +1,30 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// Structured extraction is configured via the JSON `structured_extraction`
|
||||
// field on `ExtractionConfig`. The schema is a JSON Schema string and
|
||||
// `llm.model` selects the provider via liter-llm.
|
||||
/// Extracts `paper.pdf` with a `structured_extraction` config (embedded JSON
/// Schema string, schema name, strict mode, LLM model) and prints the JSON
/// result to stdout.
pub fn main() !void {
    // The `schema` value is itself JSON, escaped so it fits inside a JSON string.
    const extraction_config =
        \\{
        \\ "structured_extraction": {
        \\ "schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"},\"authors\":{\"type\":\"array\",\"items\":{\"type\":\"string\"}},\"date\":{\"type\":\"string\"}},\"required\":[\"title\",\"authors\",\"date\"],\"additionalProperties\":false}",
        \\ "schema_name": "Paper",
        \\ "strict": true,
        \\ "llm": {
        \\ "model": "openai/gpt-4o-mini"
        \\ }
        \\ }
        \\}
    ;

    const json = try kreuzberg.extract_file_sync("paper.pdf", null, extraction_config);
    // The result slice is released through the C allocator.
    defer std.heap.c_allocator.free(json);

    try std.io.getStdOut().writer().print("{s}\n", .{json});
}
|
||||
```
|
||||
38
docs/snippets/zig/mcp/mcp_custom_client.md
Normal file
38
docs/snippets/zig/mcp/mcp_custom_client.md
Normal file
@@ -0,0 +1,38 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
|
||||
// The Zig binding does not expose an MCP client. To talk to the bundled
|
||||
// `kreuzberg mcp` server, spawn the CLI as a subprocess and exchange
|
||||
// JSON-RPC messages over stdin/stdout.
|
||||
/// Spawns `kreuzberg mcp` with piped stdin/stdout, sends one JSON-RPC
/// `tools/call` request for `extract_file`, prints everything the child
/// writes to stdout, and waits for it to exit.
pub fn main() !void {
    var debug_allocator = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = debug_allocator.deinit();
    const gpa = debug_allocator.allocator();

    const argv = [_][]const u8{ "kreuzberg", "mcp" };
    var server = std.process.Child.init(&argv, gpa);
    // stdin/stdout are pipes owned by this process; stderr passes straight through.
    server.stderr_behavior = .Inherit;
    server.stdout_behavior = .Pipe;
    server.stdin_behavior = .Pipe;
    try server.spawn();

    // One request per line: the trailing newline terminates the message.
    const tool_call =
        \\{"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"extract_file","arguments":{"path":"document.pdf"}}}
    ++ "\n";

    if (server.stdin) |pipe_in| {
        try pipe_in.writeAll(tool_call);
        // Close our end and clear the field so the handle is not closed a second time later.
        pipe_in.close();
        server.stdin = null;
    }

    if (server.stdout) |pipe_out| {
        // Read until the child closes its stdout, capped at 16 MiB.
        const reply = try pipe_out.reader().readAllAlloc(gpa, 16 * 1024 * 1024);
        defer gpa.free(reply);

        const console = std.io.getStdOut().writer();
        try console.print("{s}\n", .{reply});
    }

    _ = try server.wait();
}
|
||||
```
|
||||
22
docs/snippets/zig/mcp/mcp_server_start.md
Normal file
22
docs/snippets/zig/mcp/mcp_server_start.md
Normal file
@@ -0,0 +1,22 @@
|
||||
<!-- snippet:syntax-only -->
|
||||
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
|
||||
// The Zig binding does not expose the MCP server programmatically. Launch
|
||||
// the bundled `kreuzberg mcp` CLI as a subprocess to start the server.
|
||||
/// Launches `kreuzberg mcp` as a child process that shares this process's
/// stdin, stdout and stderr, waits for it to finish, and prints how it ended.
pub fn main() !void {
    var debug_allocator = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = debug_allocator.deinit();
    const gpa = debug_allocator.allocator();

    const argv = [_][]const u8{ "kreuzberg", "mcp" };
    var server = std.process.Child.init(&argv, gpa);
    // All three standard streams are inherited from this process.
    server.stderr_behavior = .Inherit;
    server.stdout_behavior = .Inherit;
    server.stdin_behavior = .Inherit;
    try server.spawn();

    const exit_status = try server.wait();

    const console = std.io.getStdOut().writer();
    try console.print("kreuzberg mcp exited: {any}\n", .{exit_status});
}
|
||||
```
|
||||
47
docs/snippets/zig/metadata/language_detection.md
Normal file
47
docs/snippets/zig/metadata/language_detection.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with language detection enabled, then prints the
/// length of the `content` string and each entry of `detected_languages`.
/// Returns without output when the result is not a JSON object.
pub fn main() !void {
    var debug_allocator = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = debug_allocator.deinit();
    const gpa = debug_allocator.allocator();

    const detection_config =
        \\{
        \\ "language_detection": {
        \\ "enabled": true,
        \\ "min_confidence": 0.9,
        \\ "detect_multiple": true
        \\ }
        \\}
    ;

    // The result slice is released through the C allocator, not `gpa`.
    const json = try kreuzberg.extract_file_sync("document.pdf", null, detection_config);
    defer std.heap.c_allocator.free(json);

    var document = try std.json.parseFromSlice(std.json.Value, gpa, json, .{});
    defer document.deinit();

    // Anything other than a JSON object has no fields to report.
    const fields = switch (document.value) {
        .object => |map| map,
        else => return,
    };

    const out = std.io.getStdOut().writer();

    if (fields.get("content")) |content| {
        switch (content) {
            .string => |text| try out.print("content length: {d}\n", .{text.len}),
            else => {},
        }
    }

    // Last step: a missing or non-array field simply ends the program.
    const languages = fields.get("detected_languages") orelse return;
    if (languages != .array) return;

    for (languages.array.items) |entry| {
        // Non-string entries are skipped.
        switch (entry) {
            .string => |code| try out.print("Detected: {s}\n", .{code}),
            else => {},
        }
    }
}
|
||||
```
|
||||
@@ -0,0 +1,43 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `multilingual_document.pdf` with multi-language detection enabled
/// and prints all detected languages on a single line.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "language_detection": {
        \\ "enabled": true,
        \\ "min_confidence": 0.8,
        \\ "detect_multiple": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("multilingual_document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const top = switch (parsed.value) {
        .object => |fields| fields,
        else => return,
    };

    const out = std.io.getStdOut().writer();

    // The label and the trailing newline are printed even when no languages
    // are listed in the result.
    try out.print("Detected languages:", .{});
    print_languages: {
        const languages_val = top.get("detected_languages") orelse break :print_languages;
        const languages = switch (languages_val) {
            .array => |list| list,
            else => break :print_languages,
        };
        for (languages.items) |entry| {
            switch (entry) {
                .string => |code| try out.print(" {s}", .{code}),
                else => {},
            }
        }
    }
    try out.print("\n", .{});
}
|
||||
```
|
||||
65
docs/snippets/zig/metadata/metadata.md
Normal file
65
docs/snippets/zig/metadata/metadata.md
Normal file
@@ -0,0 +1,65 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with the default configuration and prints the
/// title, authors, language, creation date and page count from the result's
/// `metadata` object, skipping any field that is absent or has an unexpected
/// JSON type.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json = "{}";
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const top = switch (parsed.value) {
        .object => |fields| fields,
        else => return,
    };

    const out = std.io.getStdOut().writer();

    const metadata_val = top.get("metadata") orelse return;
    const metadata = switch (metadata_val) {
        .object => |fields| fields,
        else => return,
    };

    if (metadata.get("title")) |title_val| {
        switch (title_val) {
            .string => |title| try out.print("Title: {s}\n", .{title}),
            else => {},
        }
    }

    if (metadata.get("authors")) |authors_val| {
        switch (authors_val) {
            .array => |authors| for (authors.items) |entry| {
                switch (entry) {
                    .string => |author_name| try out.print("Author: {s}\n", .{author_name}),
                    else => {},
                }
            },
            else => {},
        }
    }

    if (metadata.get("language")) |language_val| {
        switch (language_val) {
            .string => |language| try out.print("Language: {s}\n", .{language}),
            else => {},
        }
    }

    if (metadata.get("created_at")) |created_val| {
        switch (created_val) {
            .string => |created| try out.print("Created: {s}\n", .{created}),
            else => {},
        }
    }

    // The page count is the last field printed, so early returns are safe here.
    const pages_val = metadata.get("pages") orelse return;
    const pages = switch (pages_val) {
        .object => |fields| fields,
        else => return,
    };
    const total_val = pages.get("total_count") orelse return;
    switch (total_val) {
        .integer => |total| try out.print("Pages: {d}\n", .{total}),
        else => {},
    }
}
|
||||
```
|
||||
62
docs/snippets/zig/metadata/page_boundaries.md
Normal file
62
docs/snippets/zig/metadata/page_boundaries.md
Normal file
@@ -0,0 +1,62 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` and, for up to the first three valid entries of
/// `metadata.pages.boundaries`, prints the page number, its byte range within
/// `content`, and a preview of at most 100 bytes of that page's text.
///
/// Boundary entries whose offsets are negative, inverted, or extend past the
/// end of `content` are skipped instead of being used to slice the text.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json = "{}";
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const root = parsed.value;
    if (root != .object) return;

    const stdout = std.io.getStdOut().writer();

    const content_val = root.object.get("content") orelse return;
    if (content_val != .string) return;
    const content = content_val.string;

    const metadata_val = root.object.get("metadata") orelse return;
    if (metadata_val != .object) return;

    const pages_val = metadata_val.object.get("pages") orelse return;
    if (pages_val != .object) return;

    const boundaries_val = pages_val.object.get("boundaries") orelse return;
    if (boundaries_val != .array) return;

    var shown: usize = 0;
    for (boundaries_val.array.items) |boundary| {
        if (shown >= 3) break;
        if (boundary != .object) continue;

        const byte_start_val = boundary.object.get("byte_start") orelse continue;
        const byte_end_val = boundary.object.get("byte_end") orelse continue;
        const page_number_val = boundary.object.get("page_number") orelse continue;

        if (byte_start_val != .integer or byte_end_val != .integer or page_number_val != .integer) {
            continue;
        }

        // JSON integers are signed; a checked cast rejects negative offsets
        // instead of invoking undefined behavior through @intCast.
        const byte_start = std.math.cast(usize, byte_start_val.integer) orelse continue;
        const byte_end = std.math.cast(usize, byte_end_val.integer) orelse continue;
        // Never slice outside `content` or with an inverted range.
        if (byte_start > byte_end or byte_end > content.len) continue;
        const page_number = page_number_val.integer;

        const page_text = content[byte_start..byte_end];
        var preview_end = @min(@as(usize, 100), page_text.len);
        // When the preview is truncated, back up so the cut does not land on
        // a UTF-8 continuation byte (0b10xxxxxx) in the middle of a character.
        while (preview_end > 0 and preview_end < page_text.len and
            (page_text[preview_end] & 0xC0) == 0x80)
        {
            preview_end -= 1;
        }

        try stdout.print("Page {d}:\n", .{page_number});
        try stdout.print(" Byte range: {d}-{d}\n", .{ byte_start, byte_end });
        try stdout.print(" Preview: {s}...\n", .{page_text[0..preview_end]});

        shown += 1;
    }
}
|
||||
```
|
||||
59
docs/snippets/zig/metadata/page_tracking_basic.md
Normal file
59
docs/snippets/zig/metadata/page_tracking_basic.md
Normal file
@@ -0,0 +1,59 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with per-page extraction enabled and prints, for
/// every page object carrying an integer `page_number`, the page number plus
/// the content length and the table and image counts when those fields exist.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "pages": {
        \\ "extract_pages": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const top = switch (parsed.value) {
        .object => |fields| fields,
        else => return,
    };

    const out = std.io.getStdOut().writer();

    const pages_val = top.get("pages") orelse return;
    const pages = switch (pages_val) {
        .array => |list| list,
        else => return,
    };

    for (pages.items) |page_val| {
        const page = switch (page_val) {
            .object => |fields| fields,
            else => continue,
        };

        const number_val = page.get("page_number") orelse continue;
        const number = switch (number_val) {
            .integer => |value| value,
            else => continue,
        };
        try out.print("Page {d}:\n", .{number});

        if (page.get("content")) |content_val| {
            switch (content_val) {
                .string => |text| try out.print(" Content: {d} chars\n", .{text.len}),
                else => {},
            }
        }

        if (page.get("tables")) |tables_val| {
            switch (tables_val) {
                .array => |tables| try out.print(" Tables: {d}\n", .{tables.items.len}),
                else => {},
            }
        }

        if (page.get("images")) |images_val| {
            switch (images_val) {
                .array => |images| try out.print(" Images: {d}\n", .{images.items.len}),
                else => {},
            }
        }
    }
}
|
||||
```
|
||||
58
docs/snippets/zig/metadata/tables.md
Normal file
58
docs/snippets/zig/metadata/tables.md
Normal file
@@ -0,0 +1,58 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` and prints every table found in the result: its
/// cell grid row by row, its Markdown rendering, and its page number, each
/// only when present with the expected JSON type.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json = "{}";
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const top = switch (parsed.value) {
        .object => |fields| fields,
        else => return,
    };

    const out = std.io.getStdOut().writer();

    const tables_val = top.get("tables") orelse return;
    const tables = switch (tables_val) {
        .array => |list| list,
        else => return,
    };

    for (tables.items) |table_val| {
        const table = switch (table_val) {
            .object => |fields| fields,
            else => continue,
        };

        if (table.get("cells")) |cells_val| {
            switch (cells_val) {
                .array => |rows| {
                    try out.print("Table with {d} rows\n", .{rows.items.len});

                    for (rows.items) |row_val| {
                        const row = switch (row_val) {
                            .array => |cells| cells,
                            else => continue,
                        };
                        try out.print(" Row:", .{});
                        for (row.items) |cell_val| {
                            switch (cell_val) {
                                .string => |cell_text| try out.print(" [{s}]", .{cell_text}),
                                else => {},
                            }
                        }
                        try out.print("\n", .{});
                    }
                },
                else => {},
            }
        }

        if (table.get("markdown")) |markdown_val| {
            switch (markdown_val) {
                .string => |markdown| try out.print("{s}\n", .{markdown}),
                else => {},
            }
        }

        if (table.get("page_number")) |page_val| {
            switch (page_val) {
                .integer => |page_number| try out.print("Page: {d}\n", .{page_number}),
                else => {},
            }
        }
    }
}
|
||||
```
|
||||
47
docs/snippets/zig/metadata/vector_database_integration.md
Normal file
47
docs/snippets/zig/metadata/vector_database_integration.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with chunking and embeddings configured, then
/// prints the embedding dimensionality of every chunk that carries an
/// `embedding` array. The printed index is the chunk's position in `chunks`.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 512,
        \\ "overlap": 50,
        \\ "embedding": {
        \\ "model": "balanced",
        \\ "normalize": true
        \\ }
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const top = switch (parsed.value) {
        .object => |fields| fields,
        else => return,
    };

    const out = std.io.getStdOut().writer();

    const chunks_val = top.get("chunks") orelse return;
    const chunks = switch (chunks_val) {
        .array => |list| list,
        else => return,
    };

    for (chunks.items, 0..) |chunk_val, position| {
        const chunk = switch (chunk_val) {
            .object => |fields| fields,
            else => continue,
        };

        const embedding_val = chunk.get("embedding") orelse continue;
        const embedding = switch (embedding_val) {
            .array => |vector| vector,
            else => continue,
        };

        try out.print("Chunk {d}: {d} dimensions\n", .{ position, embedding.items.len });
        // Store embedding.items in vector database
    }
}
|
||||
```
|
||||
30
docs/snippets/zig/ocr/cloud_ocr_backend.md
Normal file
30
docs/snippets/zig/ocr/cloud_ocr_backend.md
Normal file
@@ -0,0 +1,30 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `scanned.pdf` using the OCR backend named `cloud-ocr` and prints
/// the raw JSON result to stdout.
pub fn main() !void {
    // Cloud OCR backends are registered as custom plugins via the Rust core.
    // From Zig, select a registered cloud backend by name through OcrConfig.
    const config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "cloud-ocr",
        \\ "language": "eng"
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("scanned.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // The result slice is printed directly; duplicating it first only added a
    // second allocation and an extra out-of-memory failure path.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
27
docs/snippets/zig/ocr/image_extraction.md
Normal file
27
docs/snippets/zig/ocr/image_extraction.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with image extraction enabled and prints the raw
/// JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "images": {
        \\ "extract_images": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // The result slice is printed directly; duplicating it first only added a
    // second allocation and an extra out-of-memory failure path.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
32
docs/snippets/zig/ocr/image_preprocessing.md
Normal file
32
docs/snippets/zig/ocr/image_preprocessing.md
Normal file
@@ -0,0 +1,32 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with image extraction and DPI settings configured
/// and prints the raw JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "images": {
        \\ "extract_images": true,
        \\ "target_dpi": 300,
        \\ "max_image_dimension": 4096,
        \\ "auto_adjust_dpi": true,
        \\ "min_dpi": 150,
        \\ "max_dpi": 600
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // The result slice is printed directly; duplicating it first only added a
    // second allocation and an extra out-of-memory failure path.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
28
docs/snippets/zig/ocr/ocr_easyocr.md
Normal file
28
docs/snippets/zig/ocr/ocr_easyocr.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` using the `easyocr` OCR backend and prints the raw
/// JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "easyocr",
        \\ "language": "en"
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // The result slice is printed directly; duplicating it first only added a
    // second allocation and an extra out-of-memory failure path.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
49
docs/snippets/zig/ocr/ocr_elements.md
Normal file
49
docs/snippets/zig/ocr/ocr_elements.md
Normal file
@@ -0,0 +1,49 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `scanned.pdf` using the `paddleocr` backend with OCR elements
/// enabled and prints the `text` of every entry in the result's
/// `ocr_elements` array.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "paddleocr",
        \\ "language": "en",
        \\ "element_config": {
        \\ "include_elements": true
        \\ }
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("scanned.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // Parse the result slice directly; it stays alive until the deferred free
    // above runs, which is after `parsed.deinit()`, so no copy is required.
    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    const stdout = std.io.getStdOut().writer();

    const root = parsed.value;
    if (root != .object) return;

    const elements_val = root.object.get("ocr_elements") orelse return;
    if (elements_val != .array) return;

    for (elements_val.array.items) |element| {
        if (element != .object) continue;
        const text_val = element.object.get("text") orelse continue;
        if (text_val == .string) {
            try stdout.print("Text: {s}\n", .{text_val.string});
        }
    }
}
|
||||
```
|
||||
28
docs/snippets/zig/ocr/ocr_extraction.md
Normal file
28
docs/snippets/zig/ocr/ocr_extraction.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `scanned.pdf` using the `tesseract` OCR backend (language `eng`)
/// and prints the raw JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng"
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("scanned.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // The result slice is printed directly; duplicating it first only added a
    // second allocation and an extra out-of-memory failure path.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
29
docs/snippets/zig/ocr/ocr_force_all_pages.md
Normal file
29
docs/snippets/zig/ocr/ocr_force_all_pages.md
Normal file
@@ -0,0 +1,29 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` with `force_ocr` set and the `tesseract` backend
/// configured, then prints the raw JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "force_ocr": true,
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng"
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // The result slice is printed directly; duplicating it first only added a
    // second allocation and an extra out-of-memory failure path.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
28
docs/snippets/zig/ocr/ocr_multi_language.md
Normal file
28
docs/snippets/zig/ocr/ocr_multi_language.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `multilingual.pdf` using the `tesseract` backend with the
/// language string `eng+deu+fra` and prints the raw JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng+deu+fra"
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("multilingual.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // The result slice is printed directly; duplicating it first only added a
    // second allocation and an extra out-of-memory failure path.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
28
docs/snippets/zig/ocr/ocr_paddleocr.md
Normal file
28
docs/snippets/zig/ocr/ocr_paddleocr.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extracts `document.pdf` using the `paddleocr` OCR backend and prints the
/// raw JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "paddleocr",
        \\ "language": "en"
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    defer std.heap.c_allocator.free(result_json);

    // The result slice is printed directly; duplicating it first only added a
    // second allocation and an extra out-of-memory failure path.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
|
||||
```
|
||||
13
docs/snippets/zig/plugins/clear_plugins.md
Normal file
13
docs/snippets/zig/plugins/clear_plugins.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Clears the registered OCR backends, post-processors and validators, in
/// that order, then prints a confirmation line. Any failure aborts with the
/// error before the confirmation is printed.
pub fn main() !void {
    try kreuzberg.clear_ocr_backends();
    try kreuzberg.clear_post_processors();
    try kreuzberg.clear_validators();

    const out = std.io.getStdOut().writer();
    try out.writeAll("All plugins cleared\n");
}
|
||||
```
|
||||
41
docs/snippets/zig/plugins/embedding_backend.md
Normal file
41
docs/snippets/zig/plugins/embedding_backend.md
Normal file
@@ -0,0 +1,41 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Skeleton embedding backend: reports a fixed dimensionality and leaves the
/// actual embedding computation unimplemented.
const MyEmbedder = struct {
    /// Number of dimensions this embedder reports (always 768).
    pub fn dimensions(_: *MyEmbedder) usize {
        return 768;
    }

    /// Placeholder implementation that always fails with `error.Plugin`.
    ///
    /// `texts` is a JSON-encoded array of strings. Return a JSON-encoded
    /// 2D float array of shape [n_texts, dimensions]; the dispatcher
    /// validates the shape on the Rust side.
    pub fn embed(_: *MyEmbedder, _: [*c]const u8) ![]u8 {
        return error.Plugin;
    }
};
|
||||
|
||||
/// Builds a vtable for `MyEmbedder`, installs name and version callbacks, and
/// registers the backend under the name `my-embedder`.
pub fn main() !void {
    // C-ABI callbacks that hand out static, NUL-terminated string literals.
    const Callbacks = struct {
        fn name(_: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
            const slot = out_name orelse return;
            slot.* = @constCast("my-embedder");
        }

        fn version(_: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
            const slot = out_version orelse return;
            slot.* = @constCast("1.0.0");
        }
    };

    var instance = MyEmbedder{};
    var vtable = kreuzberg.make_embedding_backend_vtable(MyEmbedder, &instance);
    vtable.name_fn = Callbacks.name;
    vtable.version_fn = Callbacks.version;

    // NOTE(review): the registration result and `out_error` are discarded, so
    // a failed registration goes unnoticed — confirm whether that is intended.
    var out_error: ?[*c]u8 = null;
    _ = kreuzberg.register_embedding_backend("my-embedder", vtable, &instance, &out_error);
}
|
||||
```
|
||||
175
docs/snippets/zig/plugins/extractor_registration.md
Normal file
175
docs/snippets/zig/plugins/extractor_registration.md
Normal file
@@ -0,0 +1,175 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// VTable struct for DocumentExtractor; mirrors KreuzbergDocumentExtractorVTable.
|
||||
/// C-ABI table of optional callbacks describing a document extractor. Field
/// order and types define the binary layout and must not be rearranged.
const DocumentExtractorVTable = extern struct {
    /// Callback that writes a string pointer through its out-parameter.
    const StringOutFn = fn (user_data: ?*anyopaque, out: ?*?[*c]u8) callconv(.C) void;
    /// Callback that returns a status code and may write an error string.
    const LifecycleFn = fn (user_data: ?*anyopaque, out_error: ?*?[*c]u8) callconv(.C) i32;
    /// Callback that takes only the user data and returns an integer.
    const QueryFn = fn (user_data: ?*anyopaque) callconv(.C) i32;
    /// Extraction from an in-memory buffer of `content_len` bytes.
    const ExtractBytesFn = fn (
        user_data: ?*anyopaque,
        content: [*c]const u8,
        content_len: usize,
        mime_type: [*c]const u8,
        config: [*c]const u8,
        out_result: ?*?[*c]u8,
        out_error: ?*?[*c]u8,
    ) callconv(.C) i32;
    /// Extraction from a file identified by `path`.
    const ExtractFileFn = fn (
        user_data: ?*anyopaque,
        path: [*c]const u8,
        mime_type: [*c]const u8,
        config: [*c]const u8,
        out_result: ?*?[*c]u8,
        out_error: ?*?[*c]u8,
    ) callconv(.C) i32;
    /// Callback that writes a result string and returns a status code.
    const ResultOutFn = fn (user_data: ?*anyopaque, out_result: ?*?[*c]u8) callconv(.C) i32;
    /// Predicate over a path and MIME type.
    const CanHandleFn = fn (
        user_data: ?*anyopaque,
        path: [*c]const u8,
        mime_type: [*c]const u8,
    ) callconv(.C) i32;
    /// Callback invoked with the user data and returning nothing.
    const ReleaseFn = fn (user_data: ?*anyopaque) callconv(.C) void;

    name_fn: ?*const StringOutFn,
    version_fn: ?*const StringOutFn,
    initialize_fn: ?*const LifecycleFn,
    shutdown_fn: ?*const LifecycleFn,
    extract_bytes: ?*const ExtractBytesFn,
    extract_file: ?*const ExtractFileFn,
    supported_mime_types: ?*const ResultOutFn,
    priority: ?*const QueryFn,
    can_handle: ?*const CanHandleFn,
    as_sync_extractor: ?*const QueryFn,
    free_user_data: ?*const ReleaseFn,
};
|
||||
|
||||
extern "kreuzberg_ffi" fn kreuzberg_register_document_extractor(
|
||||
name: [*c]const u8,
|
||||
vtable: DocumentExtractorVTable,
|
||||
user_data: ?*anyopaque,
|
||||
out_error: ?*?[*c]u8,
|
||||
) i32;
|
||||
|
||||
extern "kreuzberg_ffi" fn kreuzberg_unregister_document_extractor(
|
||||
name: [*c]const u8,
|
||||
out_error: ?*?[*c]u8,
|
||||
) i32;
|
||||
|
||||
extern "kreuzberg_ffi" fn kreuzberg_free_string(ptr: [*c]u8) void;
|
||||
|
||||
// Implement callback functions for the extractor.
|
||||
/// Extraction callback: accepts only the MIME type `application/json`.
///
/// On a match it writes a freshly allocated, NUL-terminated JSON string to
/// `out_result` (when non-null) and returns 0. Otherwise it writes an
/// allocated error message to `out_error` (when non-null) and returns 1.
/// Returns 1 as well when an allocation fails. A null `mime_type` is treated
/// as an unsupported MIME type rather than being dereferenced.
///
/// Strings are allocated with `std.heap.c_allocator`.
fn extract_bytes_fn(
    user_data: ?*anyopaque,
    content: [*c]const u8,
    content_len: usize,
    mime_type: [*c]const u8,
    config: [*c]const u8,
    out_result: ?*?[*c]u8,
    out_error: ?*?[*c]u8,
) callconv(.C) i32 {
    _ = user_data;
    _ = content;
    _ = content_len;
    _ = config;

    // Guard against a null pointer before scanning for the terminator.
    const is_json = mime_type != null and
        std.mem.eql(u8, std.mem.sliceTo(mime_type, 0), "application/json");

    if (is_json) {
        // Allocate only when there is somewhere to store the pointer, so the
        // buffer cannot leak when the caller passes no out-parameter.
        if (out_result) |dest| {
            const payload = std.heap.c_allocator.dupeZ(u8, "{\"content\": \"Extracted from JSON\"}") catch return 1;
            dest.* = payload.ptr;
        }
        return 0;
    }

    if (out_error) |dest| {
        const message = std.heap.c_allocator.dupeZ(u8, "Unsupported MIME type") catch return 1;
        dest.* = message.ptr;
    }
    return 1;
}
|
||||
|
||||
/// Writes a freshly allocated, NUL-terminated JSON array of the supported
/// MIME types to `out_result` (when non-null). Returns 0 on success and 1
/// when the allocation fails. Allocated with `std.heap.c_allocator`.
fn supported_mime_types_fn(user_data: ?*anyopaque, out_result: ?*?[*c]u8) callconv(.C) i32 {
    _ = user_data;
    // Allocate only when the pointer can be handed out, so nothing leaks when
    // the caller passes no out-parameter.
    if (out_result) |dest| {
        const mime_types = std.heap.c_allocator.dupeZ(u8, "[\"application/json\"]") catch return 1;
        dest.* = mime_types.ptr;
    }
    return 0;
}
|
||||
|
||||
/// Writes a freshly allocated, NUL-terminated copy of the extractor name to
/// `out_name`. Does nothing when `out_name` is null; leaves `out_name`
/// untouched when the allocation fails (the callback cannot report errors).
/// Allocated with `std.heap.c_allocator`.
fn name_fn(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
    _ = user_data;
    const dest = out_name orelse return;
    // `if (error_union) |value|` requires an `else |err|` branch; use `catch`
    // to handle the allocation failure explicitly.
    const name = std.heap.c_allocator.dupeZ(u8, "zig-json-extractor") catch return;
    dest.* = name.ptr;
}
|
||||
|
||||
/// Writes a freshly allocated, NUL-terminated copy of the extractor version
/// to `out_version`. Does nothing when `out_version` is null; leaves
/// `out_version` untouched when the allocation fails (the callback cannot
/// report errors). Allocated with `std.heap.c_allocator`.
fn version_fn(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
    _ = user_data;
    const dest = out_version orelse return;
    // `if (error_union) |value|` requires an `else |err|` branch; use `catch`
    // to handle the allocation failure explicitly.
    const version = std.heap.c_allocator.dupeZ(u8, "0.1.0") catch return;
    dest.* = version.ptr;
}
|
||||
|
||||
/// Initialization callback: performs no work and always returns 0 (success).
fn initialize_fn(_: ?*anyopaque, _: ?*?[*c]u8) callconv(.C) i32 {
    return 0; // Success
}

/// Shutdown callback: performs no work and always returns 0 (success).
fn shutdown_fn(_: ?*anyopaque, _: ?*?[*c]u8) callconv(.C) i32 {
    return 0; // Success
}

/// Priority callback: always returns 50.
fn priority_fn(_: ?*anyopaque) callconv(.C) i32 {
    return 50; // Default priority
}
|
||||
|
||||
/// Registers the `zig-json-extractor` document extractor through the FFI,
/// reports the outcome, then unregisters it and reports that outcome too.
///
/// Any error string returned through `out_error` is released with
/// `kreuzberg_free_string`, both between the two calls and on exit.
pub fn main() !void {
    const stdout = std.io.getStdOut().writer();

    var out_error: ?[*c]u8 = null;
    defer if (out_error) |ptr| kreuzberg_free_string(ptr);

    // Build the vtable.
    const vtable = DocumentExtractorVTable{
        .name_fn = name_fn,
        .version_fn = version_fn,
        .initialize_fn = initialize_fn,
        .shutdown_fn = shutdown_fn,
        .extract_bytes = extract_bytes_fn,
        .extract_file = null,
        .supported_mime_types = supported_mime_types_fn,
        .priority = priority_fn,
        .can_handle = null,
        .as_sync_extractor = null,
        .free_user_data = null,
    };

    // Register the extractor with null user_data (no state).
    const register_rc = kreuzberg_register_document_extractor(
        "zig-json-extractor",
        vtable,
        null,
        &out_error,
    );

    if (register_rc != 0) {
        if (out_error) |err_ptr| {
            try stdout.print("Registration failed: {s}\n", .{std.mem.sliceTo(err_ptr, 0)});
        } else {
            // Still report the failure when no message was provided.
            try stdout.print("Registration failed with code {d}\n", .{register_rc});
        }
        return;
    }

    try stdout.print("Successfully registered zig-json-extractor\n", .{});

    // Release any string left over from registration before the pointer is
    // reused, instead of overwriting it and losing the allocation.
    if (out_error) |ptr| {
        kreuzberg_free_string(ptr);
        out_error = null;
    }

    // Unregister the extractor when done.
    const unregister_rc = kreuzberg_unregister_document_extractor("zig-json-extractor", &out_error);
    if (unregister_rc == 0) {
        try stdout.print("Successfully unregistered zig-json-extractor\n", .{});
    } else if (out_error) |err_ptr| {
        try stdout.print("Unregistration failed: {s}\n", .{std.mem.sliceTo(err_ptr, 0)});
    } else {
        try stdout.print("Unregistration failed with code {d}\n", .{unregister_rc});
    }
}
|
||||
```
|
||||
20
docs/snippets/zig/plugins/list_plugins.md
Normal file
20
docs/snippets/zig/plugins/list_plugins.md
Normal file
@@ -0,0 +1,20 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Queries the registered OCR backends, post-processors and validators and
/// prints each listing on its own line. Each listing is released with
/// `std.heap.c_allocator` when `main` returns.
pub fn main() !void {
    const ocr_listing = try kreuzberg.list_ocr_backends();
    defer std.heap.c_allocator.free(ocr_listing);

    const post_processor_listing = try kreuzberg.list_post_processors();
    defer std.heap.c_allocator.free(post_processor_listing);

    const validator_listing = try kreuzberg.list_validators();
    defer std.heap.c_allocator.free(validator_listing);

    // Printing starts only after all three queries have succeeded.
    const out = std.io.getStdOut().writer();
    try out.print("OCR backends: {s}\n", .{ocr_listing});
    try out.print("Post-processors: {s}\n", .{post_processor_listing});
    try out.print("Validators: {s}\n", .{validator_listing});
}
|
||||
```
|
||||
47
docs/snippets/zig/plugins/min_length_validator.md
Normal file
47
docs/snippets/zig/plugins/min_length_validator.md
Normal file
@@ -0,0 +1,47 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Validator that rejects a serialized result whose NUL-terminated byte
/// length is below `min_length`.
const MinLengthValidator = struct {
    /// Smallest accepted length of `result`, in bytes.
    min_length: usize,

    /// Returns `error.Validation` when `result` is shorter than `min_length`.
    /// `config` is not consulted.
    pub fn validate(self: *MinLengthValidator, result: [*c]const u8, config: [*c]const u8) !void {
        _ = config;
        // `result` is read as a NUL-terminated C string.
        const payload_len = std.mem.len(result);
        if (payload_len >= self.min_length) return;
        return error.Validation;
    }

    /// Unconditionally returns 1.
    pub fn should_validate(_: *MinLengthValidator, _: [*c]const u8, _: [*c]const u8) i32 {
        return 1;
    }

    /// Fixed priority value of 100.
    pub fn priority(_: *MinLengthValidator) i32 {
        return 100;
    }
};
|
||||
|
||||
/// Register a `MinLengthValidator` (minimum 50 bytes) under the name
/// "min-length-validator".
///
/// Returns `error.RegistrationFailed` when the registration call reports a
/// non-zero status, after logging the message written to `out_error`.
pub fn main() !void {
    // NOTE(review): `instance` lives on main's stack and its address is handed
    // to the registry — presumably it must outlive the registration; confirm.
    var instance = MinLengthValidator{ .min_length = 50 };
    var vtable = kreuzberg.make_validator_vtable(MinLengthValidator, &instance);

    vtable.name_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_name) |ptr| ptr.* = @constCast("min-length-validator");
        }
    }.thunk;
    vtable.version_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_version) |ptr| ptr.* = @constCast("1.0.0");
        }
    }.thunk;

    var out_error: ?[*c]u8 = null;
    const rc = kreuzberg.register_validator("min-length-validator", vtable, &instance, &out_error);
    if (rc != 0) {
        // NOTE(review): the message buffer is presumably owned by the library
        // and should be released with its string-free function — confirm.
        if (out_error) |msg| {
            std.log.err("registering min-length-validator failed: {s}", .{std.mem.sliceTo(msg, 0)});
        } else {
            std.log.err("registering min-length-validator failed (no error message)", .{});
        }
        return error.RegistrationFailed;
    }
}
|
||||
```
|
||||
60
docs/snippets/zig/plugins/pdf_metadata_extractor.md
Normal file
60
docs/snippets/zig/plugins/pdf_metadata_extractor.md
Normal file
@@ -0,0 +1,60 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Post-processor skeleton that only opts in for PDF results and counts how
/// many results it has been asked to process.
const PdfMetadataExtractor = struct {
    /// Number of times `process` has run on this instance.
    processed_count: u64 = 0,

    /// Increments `processed_count`; the payload itself is not inspected yet.
    pub fn process(self: *PdfMetadataExtractor, result: [*c]const u8, config: [*c]const u8) !void {
        _ = result;
        _ = config;
        self.processed_count += 1;
        // Parse the incoming JSON result, append PDF-specific metadata fields,
        // and forward the enriched payload onward.
    }

    /// Returns the stage label "Early".
    pub fn processing_stage(self: *PdfMetadataExtractor) [*c]const u8 {
        _ = self;
        return "Early";
    }

    /// Returns 1 when the NUL-terminated JSON in `result` contains a
    /// `"mime_type"` key whose value is `"application/pdf"`, otherwise 0.
    /// Whitespace around the colon is tolerated so that both compact and
    /// pretty-printed JSON are recognised.
    pub fn should_process(self: *PdfMetadataExtractor, result: [*c]const u8, config: [*c]const u8) i32 {
        _ = self;
        _ = config;
        return if (has_pdf_mime_type(std.mem.sliceTo(result, 0))) 1 else 0;
    }

    /// Returns the constant estimate 2.
    pub fn estimated_duration_ms(self: *PdfMetadataExtractor, result: [*c]const u8) u64 {
        _ = self;
        _ = result;
        return 2;
    }

    /// Returns the constant priority 80.
    pub fn priority(self: *PdfMetadataExtractor) i32 {
        _ = self;
        return 80;
    }

    /// Textual scan for `"mime_type" : "application/pdf"` at any position in
    /// `json`. Like the plain substring check it replaces, this is not a JSON
    /// parse: any occurrence of the key counts, at any nesting depth.
    fn has_pdf_mime_type(json: []const u8) bool {
        const key = "\"mime_type\"";
        const wanted = "\"application/pdf\"";
        const blanks = " \t\r\n";

        var search_from: usize = 0;
        while (std.mem.indexOfPos(u8, json, search_from, key)) |key_at| {
            search_from = key_at + key.len;
            var rest = std.mem.trimLeft(u8, json[search_from..], blanks);
            if (rest.len == 0 or rest[0] != ':') continue;
            rest = std.mem.trimLeft(u8, rest[1..], blanks);
            if (std.mem.startsWith(u8, rest, wanted)) return true;
        }
        return false;
    }
};
|
||||
|
||||
/// Register a `PdfMetadataExtractor` post-processor under the name
/// "pdf-metadata-extractor".
///
/// Returns `error.RegistrationFailed` when the registration call reports a
/// non-zero status, after logging the message written to `out_error`.
pub fn main() !void {
    // NOTE(review): `instance` lives on main's stack and its address is handed
    // to the registry — presumably it must outlive the registration; confirm.
    var instance = PdfMetadataExtractor{};
    var vtable = kreuzberg.make_post_processor_vtable(PdfMetadataExtractor, &instance);

    vtable.name_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_name) |ptr| ptr.* = @constCast("pdf-metadata-extractor");
        }
    }.thunk;
    vtable.version_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_version) |ptr| ptr.* = @constCast("1.0.0");
        }
    }.thunk;

    var out_error: ?[*c]u8 = null;
    // NOTE(review): assumes the same "0 means success" convention that the
    // validator registration call uses elsewhere in these snippets — confirm.
    const rc = kreuzberg.register_post_processor("pdf-metadata-extractor", vtable, &instance, &out_error);
    if (rc != 0) {
        if (out_error) |msg| {
            std.log.err("registering pdf-metadata-extractor failed: {s}", .{std.mem.sliceTo(msg, 0)});
        } else {
            std.log.err("registering pdf-metadata-extractor failed (no error message)", .{});
        }
        return error.RegistrationFailed;
    }
}
|
||||
```
|
||||
58
docs/snippets/zig/plugins/pdf_only_processor.md
Normal file
58
docs/snippets/zig/plugins/pdf_only_processor.md
Normal file
@@ -0,0 +1,58 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Post-processor skeleton that only opts in for results whose MIME type is
/// `application/pdf`.
const PdfOnlyProcessor = struct {
    /// Placeholder: no transformation is performed yet.
    pub fn process(self: *PdfOnlyProcessor, result: [*c]const u8, config: [*c]const u8) !void {
        _ = self;
        _ = result;
        _ = config;
        // PDF-specific transforms go here. Parse the JSON result, mutate
        // metadata/content, and forward through the pipeline.
    }

    /// Returns the stage label "Middle".
    pub fn processing_stage(self: *PdfOnlyProcessor) [*c]const u8 {
        _ = self;
        return "Middle";
    }

    /// Returns 1 when the NUL-terminated JSON in `result` contains a
    /// `"mime_type"` key whose value is `"application/pdf"`, otherwise 0.
    /// Whitespace around the colon is tolerated so that both compact and
    /// pretty-printed JSON are recognised.
    pub fn should_process(self: *PdfOnlyProcessor, result: [*c]const u8, config: [*c]const u8) i32 {
        _ = self;
        _ = config;
        return if (has_pdf_mime_type(std.mem.sliceTo(result, 0))) 1 else 0;
    }

    /// Returns the constant estimate 5.
    pub fn estimated_duration_ms(self: *PdfOnlyProcessor, result: [*c]const u8) u64 {
        _ = self;
        _ = result;
        return 5;
    }

    /// Returns the constant priority 70.
    pub fn priority(self: *PdfOnlyProcessor) i32 {
        _ = self;
        return 70;
    }

    /// Textual scan for `"mime_type" : "application/pdf"` at any position in
    /// `json`. Like the plain substring check it replaces, this is not a JSON
    /// parse: any occurrence of the key counts, at any nesting depth.
    fn has_pdf_mime_type(json: []const u8) bool {
        const key = "\"mime_type\"";
        const wanted = "\"application/pdf\"";
        const blanks = " \t\r\n";

        var search_from: usize = 0;
        while (std.mem.indexOfPos(u8, json, search_from, key)) |key_at| {
            search_from = key_at + key.len;
            var rest = std.mem.trimLeft(u8, json[search_from..], blanks);
            if (rest.len == 0 or rest[0] != ':') continue;
            rest = std.mem.trimLeft(u8, rest[1..], blanks);
            if (std.mem.startsWith(u8, rest, wanted)) return true;
        }
        return false;
    }
};
|
||||
|
||||
/// Register a `PdfOnlyProcessor` post-processor under the name
/// "pdf-only-processor".
///
/// Returns `error.RegistrationFailed` when the registration call reports a
/// non-zero status, after logging the message written to `out_error`.
pub fn main() !void {
    // NOTE(review): `instance` lives on main's stack and its address is handed
    // to the registry — presumably it must outlive the registration; confirm.
    var instance = PdfOnlyProcessor{};
    var vtable = kreuzberg.make_post_processor_vtable(PdfOnlyProcessor, &instance);

    vtable.name_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_name) |ptr| ptr.* = @constCast("pdf-only-processor");
        }
    }.thunk;
    vtable.version_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_version) |ptr| ptr.* = @constCast("1.0.0");
        }
    }.thunk;

    var out_error: ?[*c]u8 = null;
    // NOTE(review): assumes the same "0 means success" convention that the
    // validator registration call uses elsewhere in these snippets — confirm.
    const rc = kreuzberg.register_post_processor("pdf-only-processor", vtable, &instance, &out_error);
    if (rc != 0) {
        if (out_error) |msg| {
            std.log.err("registering pdf-only-processor failed: {s}", .{std.mem.sliceTo(msg, 0)});
        } else {
            std.log.err("registering pdf-only-processor failed (no error message)", .{});
        }
        return error.RegistrationFailed;
    }
}
|
||||
```
|
||||
197
docs/snippets/zig/plugins/plugin_extractor.md
Normal file
197
docs/snippets/zig/plugins/plugin_extractor.md
Normal file
@@ -0,0 +1,197 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// Mirrors KreuzbergDocumentExtractorVTable from the C FFI.
|
||||
/// C-layout table of eleven optional callbacks describing a document
/// extractor plugin. Field order is significant: it is the struct's layout.
const DocumentExtractorVTable = extern struct {
    /// Writes the plugin name through the out-pointer.
    name_fn: StringOut,
    /// Writes the plugin version through the out-pointer.
    version_fn: StringOut,
    /// Lifecycle hooks; each returns a status code and may set an error string.
    initialize_fn: StatusWithError,
    shutdown_fn: StatusWithError,
    /// Extracts from an in-memory buffer of `content_len` bytes.
    extract_bytes: ?*const fn (
        user_data: ?*anyopaque,
        content: [*c]const u8,
        content_len: usize,
        mime_type: [*c]const u8,
        config: [*c]const u8,
        out_result: ?*?[*c]u8,
        out_error: ?*?[*c]u8,
    ) callconv(.C) i32,
    /// Extracts from a file identified by `path`.
    extract_file: ?*const fn (
        user_data: ?*anyopaque,
        path: [*c]const u8,
        mime_type: [*c]const u8,
        config: [*c]const u8,
        out_result: ?*?[*c]u8,
        out_error: ?*?[*c]u8,
    ) callconv(.C) i32,
    /// Writes the supported MIME types through `out_result`.
    supported_mime_types: ?*const fn (user_data: ?*anyopaque, out_result: ?*?[*c]u8) callconv(.C) i32,
    priority: IntQuery,
    can_handle: ?*const fn (
        user_data: ?*anyopaque,
        path: [*c]const u8,
        mime_type: [*c]const u8,
    ) callconv(.C) i32,
    as_sync_extractor: IntQuery,
    /// Releases whatever `user_data` points at.
    free_user_data: ?*const fn (user_data: ?*anyopaque) callconv(.C) void,

    // Shorthand for callback shapes that occur more than once above.
    const StringOut = ?*const fn (user_data: ?*anyopaque, out_value: ?*?[*c]u8) callconv(.C) void;
    const StatusWithError = ?*const fn (user_data: ?*anyopaque, out_error: ?*?[*c]u8) callconv(.C) i32;
    const IntQuery = ?*const fn (user_data: ?*anyopaque) callconv(.C) i32;
};
|
||||
|
||||
// Simple state struct for the extractor instance.
|
||||
/// Per-extractor state passed through the FFI as the opaque `user_data`
/// pointer.
const SimpleExtractorState = struct {
    /// NUL-terminated label for the source format.
    source_format: [:0]const u8,

    /// NUL-terminated text handed out by the supported-MIME-types callback.
    supported_mimes: [:0]const u8,
};
|
||||
|
||||
extern "kreuzberg_ffi" fn kreuzberg_register_document_extractor(
|
||||
name: [*c]const u8,
|
||||
vtable: DocumentExtractorVTable,
|
||||
user_data: ?*anyopaque,
|
||||
out_error: ?*?[*c]u8,
|
||||
) i32;
|
||||
|
||||
extern "kreuzberg_ffi" fn kreuzberg_free_string(ptr: [*c]u8) void;
|
||||
|
||||
// Callbacks for the custom extractor.
|
||||
// Copy `message` into a NUL-terminated buffer from `std.heap.c_allocator` and
// store it in `slot`. Does nothing when `slot` is null or the copy cannot be
// allocated (there is no further channel to report that).
fn set_error_message(slot: ?*?[*c]u8, message: []const u8) void {
    const target = slot orelse return;
    const copy = std.heap.c_allocator.dupeZ(u8, message) catch return;
    target.* = copy.ptr;
}

// `extract_bytes` callback: wraps the incoming bytes in a small JSON object
// `{"content": ..., "mime_type": "application/octet-stream"}`.
//
// Returns 0 on success with the NUL-terminated JSON (allocated from
// `std.heap.c_allocator`) stored in `out_result`; returns 1 on allocation
// failure with a message stored in `out_error` when possible.
fn extract_bytes_impl(
    user_data: ?*anyopaque,
    content: [*c]const u8,
    content_len: usize,
    _: [*c]const u8,
    _: [*c]const u8,
    out_result: ?*?[*c]u8,
    out_error: ?*?[*c]u8,
) callconv(.C) i32 {
    const state: *SimpleExtractorState = @ptrCast(@alignCast(user_data));
    _ = state;

    // An empty payload may arrive with a null pointer; never slice through it.
    const content_slice: []const u8 = if (content_len == 0) "" else content[0..content_len];

    // Scratch allocations are released together when the callback returns.
    var arena = std.heap.ArenaAllocator.init(std.heap.c_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    // Encode the content as a JSON value so quotes, backslashes and control
    // characters cannot break the surrounding object.
    // NOTE(review): std.json emits byte slices that are not valid UTF-8 as an
    // array of numbers rather than a string — confirm the host accepts that.
    const content_json = std.json.stringifyAlloc(allocator, content_slice, .{}) catch {
        set_error_message(out_error, "OOM during extraction");
        return 1;
    };

    const result = std.fmt.allocPrint(
        allocator,
        "{{\"content\": {s}, \"mime_type\": \"application/octet-stream\"}}",
        .{content_json},
    ) catch {
        set_error_message(out_error, "OOM during extraction");
        return 1;
    };

    // The returned string must outlive the arena, so copy it out.
    const result_cstr = std.heap.c_allocator.dupeZ(u8, result) catch {
        set_error_message(out_error, "OOM allocating result");
        return 1;
    };

    if (out_result) |ptr| {
        ptr.* = result_cstr.ptr;
    } else {
        // Nobody can take ownership of the copy; release it instead of leaking.
        std.heap.c_allocator.free(result_cstr);
    }
    return 0;
}
|
||||
|
||||
// `supported_mime_types` callback: stores a NUL-terminated copy of
// `state.supported_mimes` (allocated from `std.heap.c_allocator`) in
// `out_result`. Returns 0 on success and 1 when the copy cannot be allocated.
fn supported_mimes_impl(user_data: ?*anyopaque, out_result: ?*?[*c]u8) callconv(.C) i32 {
    const state: *SimpleExtractorState = @ptrCast(@alignCast(user_data));

    // With no destination there is nobody to own a copy, so do not allocate
    // one; the status stays 0 as before.
    const slot = out_result orelse return 0;

    const copy = std.heap.c_allocator.dupeZ(u8, state.supported_mimes) catch return 1;
    slot.* = copy.ptr;
    return 0;
}
|
||||
|
||||
// `name_fn` callback: stores a freshly allocated, NUL-terminated copy of the
// extractor name in `out_name`.
//
// The callback returns void, so an allocation failure cannot be reported; in
// that case `out_name` is left untouched.
fn name_impl(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
    _ = user_data;
    // Check the destination first so no copy is allocated without an owner.
    const slot = out_name orelse return;
    const copy = std.heap.c_allocator.dupeZ(u8, "zig-simple-extractor") catch return;
    slot.* = copy.ptr;
}
|
||||
|
||||
// `version_fn` callback: stores a freshly allocated, NUL-terminated copy of
// the version string in `out_version`.
//
// The callback returns void, so an allocation failure cannot be reported; in
// that case `out_version` is left untouched.
fn version_impl(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
    _ = user_data;
    // Check the destination first so no copy is allocated without an owner.
    const slot = out_version orelse return;
    const copy = std.heap.c_allocator.dupeZ(u8, "0.1.0") catch return;
    slot.* = copy.ptr;
}
|
||||
|
||||
// `initialize_fn` callback: nothing to set up, always reports status 0.
fn init_impl(_: ?*anyopaque, _: ?*?[*c]u8) callconv(.C) i32 {
    return 0;
}
|
||||
|
||||
// `shutdown_fn` callback: nothing to tear down, always reports status 0.
fn shutdown_impl(_: ?*anyopaque, _: ?*?[*c]u8) callconv(.C) i32 {
    return 0;
}
|
||||
|
||||
// `priority` callback: constant priority for this extractor.
fn priority_impl(_: ?*anyopaque) callconv(.C) i32 {
    // Higher than default
    return 60;
}
|
||||
|
||||
// `free_user_data` callback: releases a `SimpleExtractorState` whose struct
// and both strings were allocated from `std.heap.c_allocator`.
fn cleanup_state(user_data: ?*anyopaque) callconv(.C) void {
    // Tolerate a null pointer instead of trapping inside the cast.
    const raw = user_data orelse return;
    const state: *SimpleExtractorState = @ptrCast(@alignCast(raw));

    // Both strings are heap copies owned by the state; free each of them
    // before the struct itself.
    std.heap.c_allocator.free(state.source_format);
    std.heap.c_allocator.free(state.supported_mimes);
    std.heap.c_allocator.destroy(state);
}
|
||||
|
||||
/// Build a heap-allocated `SimpleExtractorState`, register it together with
/// the callback table as "zig-simple-extractor", and print the outcome.
pub fn main() !void {
    const c_alloc = std.heap.c_allocator;

    // Create extractor state on the heap. The errdefers are confined to this
    // block: they undo partial construction when an allocation fails, and can
    // never fire once the state has been handed to the registry.
    const state = blk: {
        const fresh = try c_alloc.create(SimpleExtractorState);
        errdefer c_alloc.destroy(fresh);

        const source_format = try c_alloc.dupeZ(u8, "custom");
        errdefer c_alloc.free(source_format);

        const supported_mimes = try c_alloc.dupeZ(u8, "[\"application/octet-stream\"]");

        fresh.* = .{ .source_format = source_format, .supported_mimes = supported_mimes };
        break :blk fresh;
    };

    var out_error: ?[*c]u8 = null;
    defer if (out_error) |ptr| kreuzberg_free_string(ptr);

    // Build and register the vtable.
    const vtable = DocumentExtractorVTable{
        .name_fn = name_impl,
        .version_fn = version_impl,
        .initialize_fn = init_impl,
        .shutdown_fn = shutdown_impl,
        .extract_bytes = extract_bytes_impl,
        .extract_file = null,
        .supported_mime_types = supported_mimes_impl,
        .priority = priority_impl,
        .can_handle = null,
        .as_sync_extractor = null,
        .free_user_data = cleanup_state,
    };

    // NOTE(review): ownership of `state` presumably passes to the registry,
    // which releases it through `free_user_data`; whether that also happens
    // when registration fails is not visible here — confirm before freeing.
    const rc = kreuzberg_register_document_extractor(
        "zig-simple-extractor",
        vtable,
        state,
        &out_error,
    );

    const stdout = std.io.getStdOut().writer();
    if (rc == 0) {
        try stdout.print("Registered zig-simple-extractor with custom state\n", .{});
    } else {
        if (out_error) |err_ptr| {
            const err_msg = std.mem.sliceTo(err_ptr, 0);
            try stdout.print("Registration failed: {s}\n", .{err_msg});
        } else {
            try stdout.print("Registration failed (no error message)\n", .{});
        }
    }
}
|
||||
```
|
||||
73
docs/snippets/zig/plugins/plugin_logging.md
Normal file
73
docs/snippets/zig/plugins/plugin_logging.md
Normal file
@@ -0,0 +1,73 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Post-processor that only logs the size of every payload it is handed.
const LoggingProcessor = struct {
    /// Logs the byte length of the NUL-terminated `result` at info level.
    pub fn process(_: *LoggingProcessor, result: [*c]const u8, _: [*c]const u8) !void {
        const payload_bytes = std.mem.len(result);
        std.log.info("post-processor invoked, payload bytes={d}", .{payload_bytes});
    }

    /// Returns the stage label "Late".
    pub fn processing_stage(_: *LoggingProcessor) [*c]const u8 {
        return "Late";
    }

    /// Unconditionally returns 1.
    pub fn should_process(_: *LoggingProcessor, _: [*c]const u8, _: [*c]const u8) i32 {
        return 1;
    }

    /// Returns the constant estimate 0.
    pub fn estimated_duration_ms(_: *LoggingProcessor, _: [*c]const u8) u64 {
        return 0;
    }

    /// Returns the constant priority 10.
    pub fn priority(_: *LoggingProcessor) i32 {
        return 10;
    }
};
|
||||
|
||||
/// Register a `LoggingProcessor` under the name "logging-processor", with
/// initialise/shutdown hooks that log when they are invoked.
///
/// Returns `error.RegistrationFailed` when the registration call reports a
/// non-zero status, after logging the message written to `out_error`.
pub fn main() !void {
    // NOTE(review): `instance` lives on main's stack and its address is handed
    // to the registry — presumably it must outlive the registration; confirm.
    var instance = LoggingProcessor{};
    var vtable = kreuzberg.make_post_processor_vtable(LoggingProcessor, &instance);

    vtable.name_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_name) |ptr| ptr.* = @constCast("logging-processor");
        }
    }.thunk;
    vtable.version_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_version) |ptr| ptr.* = @constCast("1.0.0");
        }
    }.thunk;
    vtable.initialize_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_error: ?*?[*c]u8) callconv(.C) i32 {
            _ = user_data;
            _ = out_error;
            std.log.info("logging-processor initialised", .{});
            return 0;
        }
    }.thunk;
    vtable.shutdown_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_error: ?*?[*c]u8) callconv(.C) i32 {
            _ = user_data;
            _ = out_error;
            std.log.info("logging-processor shut down", .{});
            return 0;
        }
    }.thunk;

    var out_error: ?[*c]u8 = null;
    // NOTE(review): assumes the same "0 means success" convention that the
    // validator registration call uses elsewhere in these snippets — confirm.
    const rc = kreuzberg.register_post_processor("logging-processor", vtable, &instance, &out_error);
    if (rc != 0) {
        if (out_error) |msg| {
            std.log.err("registering logging-processor failed: {s}", .{std.mem.sliceTo(msg, 0)});
        } else {
            std.log.err("registering logging-processor failed (no error message)", .{});
        }
        return error.RegistrationFailed;
    }
}
|
||||
```
|
||||
52
docs/snippets/zig/plugins/plugin_testing.md
Normal file
52
docs/snippets/zig/plugins/plugin_testing.md
Normal file
@@ -0,0 +1,52 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const testing = std.testing;
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Validator that accepts everything; used to exercise the registry.
const NoopValidator = struct {
    /// Never fails and ignores both arguments.
    pub fn validate(_: *NoopValidator, _: [*c]const u8, _: [*c]const u8) !void {}

    /// Unconditionally returns 1.
    pub fn should_validate(_: *NoopValidator, _: [*c]const u8, _: [*c]const u8) i32 {
        return 1;
    }

    /// Returns the constant priority 50.
    pub fn priority(_: *NoopValidator) i32 {
        return 50;
    }
};
|
||||
|
||||
// Round trip: register a validator, confirm it shows up in the listing, then
// unregister it again. Every step is expected to report status 0.
test "register and unregister validator" {
    var instance = NoopValidator{};
    var vtable = kreuzberg.make_validator_vtable(NoopValidator, &instance);

    vtable.name_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_name) |ptr| ptr.* = @constCast("noop-validator");
        }
    }.thunk;
    vtable.version_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_version) |ptr| ptr.* = @constCast("1.0.0");
        }
    }.thunk;

    var out_error: ?[*c]u8 = null;
    try testing.expectEqual(@as(i32, 0), kreuzberg.register_validator("noop-validator", vtable, &instance, &out_error));

    // `instance` is a local of this test. If a later expectation fails, make a
    // best-effort attempt to drop the registration so the registry is not left
    // holding a pointer into a finished stack frame.
    errdefer {
        var cleanup_error: ?[*c]u8 = null;
        _ = kreuzberg.unregister_validator("noop-validator", &cleanup_error);
    }

    const validators = try kreuzberg.list_validators();
    defer std.heap.c_allocator.free(validators);
    try testing.expect(std.mem.indexOf(u8, validators, "noop-validator") != null);

    try testing.expectEqual(@as(i32, 0), kreuzberg.unregister_validator("noop-validator", &out_error));
}
|
||||
```
|
||||
46
docs/snippets/zig/plugins/plugin_validator.md
Normal file
46
docs/snippets/zig/plugins/plugin_validator.md
Normal file
@@ -0,0 +1,46 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Validator that rejects an empty serialized result.
const ContentValidator = struct {
    /// Returns `error.Validation` when the NUL-terminated `result` has length
    /// zero. `config` is not consulted.
    pub fn validate(_: *ContentValidator, result: [*c]const u8, _: [*c]const u8) !void {
        const payload_len = std.mem.len(result);
        if (payload_len != 0) return;
        return error.Validation;
    }

    /// Unconditionally returns 1.
    pub fn should_validate(_: *ContentValidator, _: [*c]const u8, _: [*c]const u8) i32 {
        return 1;
    }

    /// Returns the constant priority 50.
    pub fn priority(_: *ContentValidator) i32 {
        return 50;
    }
};
|
||||
|
||||
/// Register a `ContentValidator` under the name "content-validator".
///
/// Returns `error.RegistrationFailed` when the registration call reports a
/// non-zero status, after logging the message written to `out_error`.
pub fn main() !void {
    // NOTE(review): `instance` lives on main's stack and its address is handed
    // to the registry — presumably it must outlive the registration; confirm.
    var instance = ContentValidator{};
    var vtable = kreuzberg.make_validator_vtable(ContentValidator, &instance);

    vtable.name_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_name) |ptr| ptr.* = @constCast("content-validator");
        }
    }.thunk;
    vtable.version_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_version) |ptr| ptr.* = @constCast("1.0.0");
        }
    }.thunk;

    var out_error: ?[*c]u8 = null;
    const rc = kreuzberg.register_validator("content-validator", vtable, &instance, &out_error);
    if (rc != 0) {
        if (out_error) |msg| {
            std.log.err("registering content-validator failed: {s}", .{std.mem.sliceTo(msg, 0)});
        } else {
            std.log.err("registering content-validator failed (no error message)", .{});
        }
        return error.RegistrationFailed;
    }
}
|
||||
```
|
||||
49
docs/snippets/zig/plugins/quality_score_validator.md
Normal file
49
docs/snippets/zig/plugins/quality_score_validator.md
Normal file
@@ -0,0 +1,49 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Validator skeleton intended to gate results on a quality score.
const QualityScoreValidator = struct {
    /// Cut-off for the planned score check; not read by any method yet.
    threshold: f64,

    /// Placeholder that currently accepts every result.
    pub fn validate(_: *QualityScoreValidator, _: [*c]const u8, _: [*c]const u8) !void {
        // Parse `result` JSON, look up `metadata.additional.quality_score`,
        // and return error.Validation if it falls below `self.threshold`.
    }

    /// Unconditionally returns 1.
    pub fn should_validate(_: *QualityScoreValidator, _: [*c]const u8, _: [*c]const u8) i32 {
        return 1;
    }

    /// Returns the constant priority 75.
    pub fn priority(_: *QualityScoreValidator) i32 {
        return 75;
    }
};
|
||||
|
||||
/// Register a `QualityScoreValidator` (threshold 0.5) under the name
/// "quality-score-validator".
///
/// Returns `error.RegistrationFailed` when the registration call reports a
/// non-zero status, after logging the message written to `out_error`.
pub fn main() !void {
    // NOTE(review): `instance` lives on main's stack and its address is handed
    // to the registry — presumably it must outlive the registration; confirm.
    var instance = QualityScoreValidator{ .threshold = 0.5 };
    var vtable = kreuzberg.make_validator_vtable(QualityScoreValidator, &instance);

    vtable.name_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_name) |ptr| ptr.* = @constCast("quality-score-validator");
        }
    }.thunk;
    vtable.version_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_version) |ptr| ptr.* = @constCast("1.0.0");
        }
    }.thunk;

    var out_error: ?[*c]u8 = null;
    const rc = kreuzberg.register_validator("quality-score-validator", vtable, &instance, &out_error);
    if (rc != 0) {
        if (out_error) |msg| {
            std.log.err("registering quality-score-validator failed: {s}", .{std.mem.sliceTo(msg, 0)});
        } else {
            std.log.err("registering quality-score-validator failed (no error message)", .{});
        }
        return error.RegistrationFailed;
    }
}
|
||||
```
|
||||
67
docs/snippets/zig/plugins/stateful_plugin.md
Normal file
67
docs/snippets/zig/plugins/stateful_plugin.md
Normal file
@@ -0,0 +1,67 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Post-processor that keeps an atomic tally of how often `process` ran.
const StatefulProcessor = struct {
    /// Invocation counter, starting at zero.
    call_count: std.atomic.Value(usize) = std.atomic.Value(usize).init(0),

    /// Adds one to `call_count`; the payload and config are not inspected.
    pub fn process(self: *StatefulProcessor, _: [*c]const u8, _: [*c]const u8) !void {
        _ = self.call_count.fetchAdd(1, .acq_rel);
    }

    /// Returns the stage label "Middle".
    pub fn processing_stage(_: *StatefulProcessor) [*c]const u8 {
        return "Middle";
    }

    /// Unconditionally returns 1.
    pub fn should_process(_: *StatefulProcessor, _: [*c]const u8, _: [*c]const u8) i32 {
        return 1;
    }

    /// Returns the constant estimate 1.
    pub fn estimated_duration_ms(_: *StatefulProcessor, _: [*c]const u8) u64 {
        return 1;
    }

    /// Returns the constant priority 50.
    pub fn priority(_: *StatefulProcessor) i32 {
        return 50;
    }
};
|
||||
|
||||
/// Register a `StatefulProcessor` under the name "stateful-processor", with a
/// shutdown hook that logs the final invocation count.
///
/// Returns `error.RegistrationFailed` when the registration call reports a
/// non-zero status, after logging the message written to `out_error`.
pub fn main() !void {
    // NOTE(review): `instance` lives on main's stack and the shutdown hook
    // dereferences it through `user_data` — presumably it must outlive the
    // registration; confirm.
    var instance = StatefulProcessor{};
    var vtable = kreuzberg.make_post_processor_vtable(StatefulProcessor, &instance);

    vtable.name_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_name) |ptr| ptr.* = @constCast("stateful-processor");
        }
    }.thunk;
    vtable.version_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_version) |ptr| ptr.* = @constCast("1.0.0");
        }
    }.thunk;
    vtable.shutdown_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_error: ?*?[*c]u8) callconv(.C) i32 {
            _ = out_error;
            // Recover the processor from the opaque pointer to read its tally.
            const self: *StatefulProcessor = @ptrCast(@alignCast(user_data));
            const count = self.call_count.load(.acquire);
            std.log.info("stateful-processor invoked {d} times", .{count});
            return 0;
        }
    }.thunk;

    var out_error: ?[*c]u8 = null;
    // NOTE(review): assumes the same "0 means success" convention that the
    // validator registration call uses elsewhere in these snippets — confirm.
    const rc = kreuzberg.register_post_processor("stateful-processor", vtable, &instance, &out_error);
    if (rc != 0) {
        if (out_error) |msg| {
            std.log.err("registering stateful-processor failed: {s}", .{std.mem.sliceTo(msg, 0)});
        } else {
            std.log.err("registering stateful-processor failed (no error message)", .{});
        }
        return error.RegistrationFailed;
    }
}
|
||||
```
|
||||
13
docs/snippets/zig/plugins/unregister_plugins.md
Normal file
13
docs/snippets/zig/plugins/unregister_plugins.md
Normal file
@@ -0,0 +1,13 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Unregister four plugins by name, one of each kind.
///
/// Every call is attempted regardless of earlier outcomes. A non-zero status
/// is logged as a warning and does not abort the program, so removing a plugin
/// that was never registered stays harmless.
pub fn main() !void {
    const report = struct {
        // Log one failed unregistration, including the library's message
        // when one was provided.
        fn failure(kind: []const u8, name: []const u8, message: ?[*c]u8) void {
            if (message) |msg| {
                std.log.warn("could not unregister {s} '{s}': {s}", .{ kind, name, std.mem.sliceTo(msg, 0) });
            } else {
                std.log.warn("could not unregister {s} '{s}' (no error message)", .{ kind, name });
            }
        }
    };

    // NOTE(review): assumes every unregister call returns 0 on success, as the
    // validator variant does elsewhere in these snippets — confirm.
    // `out_error` is reset before each call so a stale message from an earlier
    // failure is never attributed to a later one.
    var out_error: ?[*c]u8 = null;
    if (kreuzberg.unregister_post_processor("word-count", &out_error) != 0) {
        report.failure("post-processor", "word-count", out_error);
    }

    out_error = null;
    if (kreuzberg.unregister_validator("min-length-validator", &out_error) != 0) {
        report.failure("validator", "min-length-validator", out_error);
    }

    out_error = null;
    if (kreuzberg.unregister_ocr_backend("custom-ocr", &out_error) != 0) {
        report.failure("OCR backend", "custom-ocr", out_error);
    }

    out_error = null;
    if (kreuzberg.unregister_embedding_backend("my-embedder", &out_error) != 0) {
        report.failure("embedding backend", "my-embedder", out_error);
    }
}
|
||||
```
|
||||
58
docs/snippets/zig/plugins/word_count_processor.md
Normal file
58
docs/snippets/zig/plugins/word_count_processor.md
Normal file
@@ -0,0 +1,58 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Post-processor skeleton registered under the name "word-count".
const WordCountProcessor = struct {
    /// Placeholder: no transformation is performed yet.
    pub fn process(_: *WordCountProcessor, _: [*c]const u8, _: [*c]const u8) !void {
        // The serialized result/config arrive as JSON strings; modify and emit
        // an updated payload through your own pipeline as needed.
    }

    /// Returns the stage label "Early".
    pub fn processing_stage(_: *WordCountProcessor) [*c]const u8 {
        return "Early";
    }

    /// Unconditionally returns 1.
    pub fn should_process(_: *WordCountProcessor, _: [*c]const u8, _: [*c]const u8) i32 {
        return 1;
    }

    /// Returns the constant estimate 1.
    pub fn estimated_duration_ms(_: *WordCountProcessor, _: [*c]const u8) u64 {
        return 1;
    }

    /// Returns the constant priority 50.
    pub fn priority(_: *WordCountProcessor) i32 {
        return 50;
    }
};
|
||||
|
||||
/// Register a `WordCountProcessor` under the name "word-count".
///
/// Returns `error.RegistrationFailed` when the registration call reports a
/// non-zero status, after logging the message written to `out_error`.
pub fn main() !void {
    // NOTE(review): `instance` lives on main's stack and its address is handed
    // to the registry — presumably it must outlive the registration; confirm.
    var instance = WordCountProcessor{};
    var vtable = kreuzberg.make_post_processor_vtable(WordCountProcessor, &instance);

    vtable.name_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_name) |ptr| ptr.* = @constCast("word-count");
        }
    }.thunk;
    vtable.version_fn = struct {
        fn thunk(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
            _ = user_data;
            if (out_version) |ptr| ptr.* = @constCast("1.0.0");
        }
    }.thunk;

    var out_error: ?[*c]u8 = null;
    // NOTE(review): assumes the same "0 means success" convention that the
    // validator registration call uses elsewhere in these snippets — confirm.
    const rc = kreuzberg.register_post_processor("word-count", vtable, &instance, &out_error);
    if (rc != 0) {
        if (out_error) |msg| {
            std.log.err("registering word-count failed: {s}", .{std.mem.sliceTo(msg, 0)});
        } else {
            std.log.err("registering word-count failed (no error message)", .{});
        }
        return error.RegistrationFailed;
    }
}
|
||||
```
|
||||
21
docs/snippets/zig/utils/chunking.md
Normal file
21
docs/snippets/zig/utils/chunking.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extract "document.pdf" with a chunking configuration (1500 characters per
/// chunk, 200 overlap) and print the returned JSON to stdout.
pub fn main() !void {
    const chunking_config =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1500,
        \\ "overlap": 200
        \\ }
        \\}
    ;

    // The returned buffer is released with the C allocator on exit.
    const extraction_json = try kreuzberg.extract_file_sync("document.pdf", null, chunking_config);
    defer std.heap.c_allocator.free(extraction_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{extraction_json});
}
|
||||
```
|
||||
27
docs/snippets/zig/utils/chunking_rag.md
Normal file
27
docs/snippets/zig/utils/chunking_rag.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// Chunking + embeddings produces RAG-ready output. Each chunk in the
|
||||
// returned JSON carries `content`, position metadata, and (when an
|
||||
// embedding preset is configured) an `embedding` vector.
|
||||
/// Extract "research_paper.pdf" with chunking (500 characters, 50 overlap)
/// plus the "balanced" embedding preset, and print the returned JSON.
pub fn main() !void {
    const rag_config =
        \\{
        \\ "chunking": {
        \\ "max_characters": 500,
        \\ "overlap": 50,
        \\ "embedding": {
        \\ "preset": "balanced"
        \\ }
        \\ }
        \\}
    ;

    // The returned buffer is released with the C allocator on exit.
    const extraction_json = try kreuzberg.extract_file_sync("research_paper.pdf", null, rag_config);
    defer std.heap.c_allocator.free(extraction_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{extraction_json});
}
|
||||
```
|
||||
24
docs/snippets/zig/utils/embedding_with_chunking.md
Normal file
24
docs/snippets/zig/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extract "document.pdf" with chunking (1024 characters, 100 overlap) and
/// the "balanced" embedding preset, then print the returned JSON to stdout.
pub fn main() !void {
    const embedding_config =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1024,
        \\ "overlap": 100,
        \\ "embedding": {
        \\ "preset": "balanced"
        \\ }
        \\ }
        \\}
    ;

    // The returned buffer is released with the C allocator on exit.
    const extraction_json = try kreuzberg.extract_file_sync("document.pdf", null, embedding_config);
    defer std.heap.c_allocator.free(extraction_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{extraction_json});
}
|
||||
```
|
||||
22
docs/snippets/zig/utils/keyword_extraction_example.md
Normal file
22
docs/snippets/zig/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extract "research_paper.pdf" with a keyword configuration (algorithm
/// "yake", at most 10 keywords, minimum score 0.3) and print the JSON result.
pub fn main() !void {
    const keyword_config =
        \\{
        \\ "keywords": {
        \\ "algorithm": "yake",
        \\ "max_keywords": 10,
        \\ "min_score": 0.3
        \\ }
        \\}
    ;

    // The returned buffer is released with the C allocator on exit.
    const extraction_json = try kreuzberg.extract_file_sync("research_paper.pdf", null, keyword_config);
    defer std.heap.c_allocator.free(extraction_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{extraction_json});
}
|
||||
```
|
||||
18
docs/snippets/zig/utils/quality_processing_example.md
Normal file
18
docs/snippets/zig/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extract "scanned_document.pdf" with `enable_quality_processing` set to
/// true and print the returned JSON to stdout.
pub fn main() !void {
    const quality_config =
        \\{
        \\ "enable_quality_processing": true
        \\}
    ;

    // The returned buffer is released with the C allocator on exit.
    const extraction_json = try kreuzberg.extract_file_sync("scanned_document.pdf", null, quality_config);
    defer std.heap.c_allocator.free(extraction_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{extraction_json});
}
|
||||
```
|
||||
26
docs/snippets/zig/utils/standalone_embed.md
Normal file
26
docs/snippets/zig/utils/standalone_embed.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// `embed_texts` takes JSON-encoded inputs across the FFI boundary:
|
||||
// - `texts`: a JSON array of strings
|
||||
// - `config`: a JSON-encoded `EmbeddingConfig`
|
||||
// It returns a JSON-encoded 2D float array (one row per input text).
|
||||
/// Embed two sample sentences with the "balanced" preset (normalisation on)
/// and print the JSON returned by `embed_texts` to stdout.
pub fn main() !void {
    const input_texts =
        \\["Hello, world!", "Kreuzberg is fast"]
    ;
    const embedding_config =
        \\{
        \\ "model": {"type": "preset", "name": "balanced"},
        \\ "normalize": true
        \\}
    ;

    // The returned buffer is released with the C allocator on exit.
    const vectors_json = try kreuzberg.embed_texts(input_texts, embedding_config);
    defer std.heap.c_allocator.free(vectors_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{vectors_json});
}
|
||||
```
|
||||
21
docs/snippets/zig/utils/token_reduction.md
Normal file
21
docs/snippets/zig/utils/token_reduction.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extract "document.pdf" with token reduction in "moderate" mode (important
/// words preserved) and print the returned JSON to stdout.
pub fn main() !void {
    const reduction_config =
        \\{
        \\ "token_reduction": {
        \\ "mode": "moderate",
        \\ "preserve_important_words": true
        \\ }
        \\}
    ;

    // The returned buffer is released with the C allocator on exit.
    const extraction_json = try kreuzberg.extract_file_sync("document.pdf", null, reduction_config);
    defer std.heap.c_allocator.free(extraction_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{extraction_json});
}
|
||||
```
|
||||
21
docs/snippets/zig/utils/token_reduction_example.md
Normal file
21
docs/snippets/zig/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Extract "verbose_document.pdf" with token reduction in "moderate" mode
/// (important words preserved) and print the returned JSON to stdout.
pub fn main() !void {
    const reduction_config =
        \\{
        \\ "token_reduction": {
        \\ "mode": "moderate",
        \\ "preserve_important_words": true
        \\ }
        \\}
    ;

    // The returned buffer is released with the C allocator on exit.
    const extraction_json = try kreuzberg.extract_file_sync("verbose_document.pdf", null, reduction_config);
    defer std.heap.c_allocator.free(extraction_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{extraction_json});
}
|
||||
```
|
||||
28
docs/snippets/zig/utils/vector_database_integration.md
Normal file
28
docs/snippets/zig/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// Configure chunking with embeddings — the resulting JSON has a `chunks`
|
||||
// array where each entry carries `content` and `embedding`. Insert those
|
||||
// into your vector store (Qdrant, pgvector, Pinecone, etc.) directly from
|
||||
// the parsed JSON.
|
||||
/// Extract "document.pdf" with chunking (512 characters, 50 overlap) and the
/// "balanced" embedding preset, then print the returned JSON to stdout.
pub fn main() !void {
    const vector_store_config =
        \\{
        \\ "chunking": {
        \\ "max_characters": 512,
        \\ "overlap": 50,
        \\ "embedding": {
        \\ "preset": "balanced"
        \\ }
        \\ }
        \\}
    ;

    // The returned buffer is released with the C allocator on exit.
    const extraction_json = try kreuzberg.extract_file_sync("document.pdf", null, vector_store_config);
    defer std.heap.c_allocator.free(extraction_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{extraction_json});
}
|
||||
```
|
||||
Reference in New Issue
Block a user