Nomad changes

2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions
--- a/docs/snippets/zig/utils/chunking.md
+++ b/docs/snippets/zig/utils/chunking.md
@@ -0,0 +1,21 @@
+```zig title="Zig"
+const std = @import("std");
+const kreuzberg = @import("kreuzberg");
+
+pub fn main() !void {
+    // JSON config with the `chunking` limits; passed as the third argument below.
+    const chunking_config =
+        \\{
+        \\  "chunking": {
+        \\    "max_characters": 1500,
+        \\    "overlap": 200
+        \\  }
+        \\}
+    ;
+
+    const extraction_json = try kreuzberg.extract_file_sync("document.pdf", null, chunking_config);
+    defer std.heap.c_allocator.free(extraction_json); // released when main returns
+
+    try std.io.getStdOut().writer().print("{s}\n", .{extraction_json});
+}
+```
--- a/docs/snippets/zig/utils/chunking_rag.md
+++ b/docs/snippets/zig/utils/chunking_rag.md
@@ -0,0 +1,27 @@
+```zig title="Zig"
+const std = @import("std");
+const kreuzberg = @import("kreuzberg");
+
+// Chunking + embeddings produces RAG-ready output. Each chunk in the
+// returned JSON carries `content`, position metadata, and (when an
+// embedding preset is configured) an `embedding` vector.
+pub fn main() !void {
+    // JSON config: `chunking` limits plus a nested `embedding` preset.
+    const rag_config =
+        \\{
+        \\  "chunking": {
+        \\    "max_characters": 500,
+        \\    "overlap": 50,
+        \\    "embedding": {
+        \\      "preset": "balanced"
+        \\    }
+        \\  }
+        \\}
+    ;
+
+    const extraction_json = try kreuzberg.extract_file_sync("research_paper.pdf", null, rag_config);
+    defer std.heap.c_allocator.free(extraction_json); // released when main returns
+
+    try std.io.getStdOut().writer().print("{s}\n", .{extraction_json});
+}
+```
--- a/docs/snippets/zig/utils/embedding_with_chunking.md
+++ b/docs/snippets/zig/utils/embedding_with_chunking.md
@@ -0,0 +1,24 @@
+```zig title="Zig"
+const std = @import("std");
+const kreuzberg = @import("kreuzberg");
+
+pub fn main() !void {
+    // JSON config: `chunking` limits with an `embedding` preset nested inside.
+    const embedding_config =
+        \\{
+        \\  "chunking": {
+        \\    "max_characters": 1024,
+        \\    "overlap": 100,
+        \\    "embedding": {
+        \\      "preset": "balanced"
+        \\    }
+        \\  }
+        \\}
+    ;
+
+    const extraction_json = try kreuzberg.extract_file_sync("document.pdf", null, embedding_config);
+    defer std.heap.c_allocator.free(extraction_json); // released when main returns
+
+    try std.io.getStdOut().writer().print("{s}\n", .{extraction_json});
+}
+```
--- a/docs/snippets/zig/utils/keyword_extraction_example.md
+++ b/docs/snippets/zig/utils/keyword_extraction_example.md
@@ -0,0 +1,22 @@
+```zig title="Zig"
+const std = @import("std");
+const kreuzberg = @import("kreuzberg");
+
+pub fn main() !void {
+    // JSON config with the `keywords` settings; passed as the third argument below.
+    const keyword_config =
+        \\{
+        \\  "keywords": {
+        \\    "algorithm": "yake",
+        \\    "max_keywords": 10,
+        \\    "min_score": 0.3
+        \\  }
+        \\}
+    ;
+
+    const extraction_json = try kreuzberg.extract_file_sync("research_paper.pdf", null, keyword_config);
+    defer std.heap.c_allocator.free(extraction_json); // released when main returns
+
+    try std.io.getStdOut().writer().print("{s}\n", .{extraction_json});
+}
+```
--- a/docs/snippets/zig/utils/quality_processing_example.md
+++ b/docs/snippets/zig/utils/quality_processing_example.md
@@ -0,0 +1,18 @@
+```zig title="Zig"
+const std = @import("std");
+const kreuzberg = @import("kreuzberg");
+
+pub fn main() !void {
+    // JSON config that sets `enable_quality_processing` to true.
+    const quality_config =
+        \\{
+        \\  "enable_quality_processing": true
+        \\}
+    ;
+
+    const extraction_json = try kreuzberg.extract_file_sync("scanned_document.pdf", null, quality_config);
+    defer std.heap.c_allocator.free(extraction_json); // released when main returns
+
+    try std.io.getStdOut().writer().print("{s}\n", .{extraction_json});
+}
+```
--- a/docs/snippets/zig/utils/standalone_embed.md
+++ b/docs/snippets/zig/utils/standalone_embed.md
@@ -0,0 +1,26 @@
+```zig title="Zig"
+const std = @import("std");
+const kreuzberg = @import("kreuzberg");
+
+// `embed_texts` takes JSON-encoded inputs across the FFI boundary:
+// - `texts`: a JSON array of strings
+// - `config`: a JSON-encoded `EmbeddingConfig`
+// It returns a JSON-encoded 2D float array (one row per input text).
+pub fn main() !void {
+    // Both arguments to `embed_texts` are JSON strings: the config and the texts.
+    const embedding_config =
+        \\{
+        \\  "model": {"type": "preset", "name": "balanced"},
+        \\  "normalize": true
+        \\}
+    ;
+    const inputs_json =
+        \\["Hello, world!", "Kreuzberg is fast"]
+    ;
+
+    const vectors_json = try kreuzberg.embed_texts(inputs_json, embedding_config);
+    defer std.heap.c_allocator.free(vectors_json); // released when main returns
+
+    try std.io.getStdOut().writer().print("{s}\n", .{vectors_json});
+}
+```
--- a/docs/snippets/zig/utils/token_reduction.md
+++ b/docs/snippets/zig/utils/token_reduction.md
@@ -0,0 +1,21 @@
+```zig title="Zig"
+const std = @import("std");
+const kreuzberg = @import("kreuzberg");
+
+pub fn main() !void {
+    // JSON config with the `token_reduction` settings; passed as the third argument below.
+    const reduction_config =
+        \\{
+        \\  "token_reduction": {
+        \\    "mode": "moderate",
+        \\    "preserve_important_words": true
+        \\  }
+        \\}
+    ;
+
+    const extraction_json = try kreuzberg.extract_file_sync("document.pdf", null, reduction_config);
+    defer std.heap.c_allocator.free(extraction_json); // released when main returns
+
+    try std.io.getStdOut().writer().print("{s}\n", .{extraction_json});
+}
+```
--- a/docs/snippets/zig/utils/token_reduction_example.md
+++ b/docs/snippets/zig/utils/token_reduction_example.md
@@ -0,0 +1,21 @@
+```zig title="Zig"
+const std = @import("std");
+const kreuzberg = @import("kreuzberg");
+
+pub fn main() !void {
+    // JSON config selecting the `token_reduction` mode and word-preservation flag.
+    const reduction_config =
+        \\{
+        \\  "token_reduction": {
+        \\    "mode": "moderate",
+        \\    "preserve_important_words": true
+        \\  }
+        \\}
+    ;
+
+    const extraction_json = try kreuzberg.extract_file_sync("verbose_document.pdf", null, reduction_config);
+    defer std.heap.c_allocator.free(extraction_json); // released when main returns
+
+    try std.io.getStdOut().writer().print("{s}\n", .{extraction_json});
+}
+```
--- a/docs/snippets/zig/utils/vector_database_integration.md
+++ b/docs/snippets/zig/utils/vector_database_integration.md
@@ -0,0 +1,28 @@
+```zig title="Zig"
+const std = @import("std");
+const kreuzberg = @import("kreuzberg");
+
+// Configure chunking with embeddings — the resulting JSON has a `chunks`
+// array where each entry carries `content` and `embedding`. Insert those
+// into your vector store (Qdrant, pgvector, Pinecone, etc.) directly from
+// the parsed JSON.
+pub fn main() !void {
+    // JSON config: `chunking` limits with a nested `embedding` preset.
+    const vector_store_config =
+        \\{
+        \\  "chunking": {
+        \\    "max_characters": 512,
+        \\    "overlap": 50,
+        \\    "embedding": {
+        \\      "preset": "balanced"
+        \\    }
+        \\  }
+        \\}
+    ;
+
+    const extraction_json = try kreuzberg.extract_file_sync("document.pdf", null, vector_store_config);
+    defer std.heap.c_allocator.free(extraction_json); // released when main returns
+
+    try std.io.getStdOut().writer().print("{s}\n", .{extraction_json});
+}
+```