This commit is contained in:
21
docs/snippets/zig/utils/chunking.md
Normal file
21
docs/snippets/zig/utils/chunking.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Calls `kreuzberg.extract_file_sync` on "document.pdf" with a chunking
/// configuration and writes the returned buffer to stdout.
pub fn main() !void {
    // Configuration is handed over as a JSON document in a multiline string.
    const settings_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1500,
        \\ "overlap": 200
        \\ }
        \\}
    ;

    // The second argument is left as `null`.
    const output_json = try kreuzberg.extract_file_sync("document.pdf", null, settings_json);
    // Release the returned buffer through the C allocator when `main` exits.
    defer std.heap.c_allocator.free(output_json);

    try std.io.getStdOut().writer().print("{s}\n", .{output_json});
}
|
||||
```
|
||||
27
docs/snippets/zig/utils/chunking_rag.md
Normal file
27
docs/snippets/zig/utils/chunking_rag.md
Normal file
@@ -0,0 +1,27 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// Chunking + embeddings produces RAG-ready output. Each chunk in the
|
||||
// returned JSON carries `content`, position metadata, and (when an
|
||||
// embedding preset is configured) an `embedding` vector.
|
||||
/// Calls `kreuzberg.extract_file_sync` on "research_paper.pdf" with a chunking
/// configuration that nests an embedding preset, then writes the returned
/// buffer to stdout.
pub fn main() !void {
    // Configuration is handed over as a JSON document in a multiline string.
    const settings_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 500,
        \\ "overlap": 50,
        \\ "embedding": {
        \\ "preset": "balanced"
        \\ }
        \\ }
        \\}
    ;

    // The second argument is left as `null`.
    const output_json = try kreuzberg.extract_file_sync("research_paper.pdf", null, settings_json);
    // Release the returned buffer through the C allocator when `main` exits.
    defer std.heap.c_allocator.free(output_json);

    try std.io.getStdOut().writer().print("{s}\n", .{output_json});
}
|
||||
```
|
||||
24
docs/snippets/zig/utils/embedding_with_chunking.md
Normal file
24
docs/snippets/zig/utils/embedding_with_chunking.md
Normal file
@@ -0,0 +1,24 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Calls `kreuzberg.extract_file_sync` on "document.pdf" with a chunking
/// configuration that nests an embedding preset, then writes the returned
/// buffer to stdout.
pub fn main() !void {
    // Configuration is handed over as a JSON document in a multiline string.
    const settings_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1024,
        \\ "overlap": 100,
        \\ "embedding": {
        \\ "preset": "balanced"
        \\ }
        \\ }
        \\}
    ;

    // The second argument is left as `null`.
    const output_json = try kreuzberg.extract_file_sync("document.pdf", null, settings_json);
    // Release the returned buffer through the C allocator when `main` exits.
    defer std.heap.c_allocator.free(output_json);

    try std.io.getStdOut().writer().print("{s}\n", .{output_json});
}
|
||||
```
|
||||
22
docs/snippets/zig/utils/keyword_extraction_example.md
Normal file
22
docs/snippets/zig/utils/keyword_extraction_example.md
Normal file
@@ -0,0 +1,22 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Calls `kreuzberg.extract_file_sync` on "research_paper.pdf" with a
/// `keywords` configuration and writes the returned buffer to stdout.
pub fn main() !void {
    // Configuration is handed over as a JSON document in a multiline string.
    const settings_json =
        \\{
        \\ "keywords": {
        \\ "algorithm": "yake",
        \\ "max_keywords": 10,
        \\ "min_score": 0.3
        \\ }
        \\}
    ;

    // The second argument is left as `null`.
    const output_json = try kreuzberg.extract_file_sync("research_paper.pdf", null, settings_json);
    // Release the returned buffer through the C allocator when `main` exits.
    defer std.heap.c_allocator.free(output_json);

    try std.io.getStdOut().writer().print("{s}\n", .{output_json});
}
|
||||
```
|
||||
18
docs/snippets/zig/utils/quality_processing_example.md
Normal file
18
docs/snippets/zig/utils/quality_processing_example.md
Normal file
@@ -0,0 +1,18 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Calls `kreuzberg.extract_file_sync` on "scanned_document.pdf" with
/// `enable_quality_processing` set to true and writes the returned buffer to
/// stdout.
pub fn main() !void {
    // Configuration is handed over as a JSON document in a multiline string.
    const settings_json =
        \\{
        \\ "enable_quality_processing": true
        \\}
    ;

    // The second argument is left as `null`.
    const output_json = try kreuzberg.extract_file_sync("scanned_document.pdf", null, settings_json);
    // Release the returned buffer through the C allocator when `main` exits.
    defer std.heap.c_allocator.free(output_json);

    try std.io.getStdOut().writer().print("{s}\n", .{output_json});
}
|
||||
```
|
||||
26
docs/snippets/zig/utils/standalone_embed.md
Normal file
26
docs/snippets/zig/utils/standalone_embed.md
Normal file
@@ -0,0 +1,26 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// `embed_texts` takes JSON-encoded inputs across the FFI boundary:
|
||||
// - `texts`: a JSON array of strings
|
||||
// - `config`: a JSON-encoded `EmbeddingConfig`
|
||||
// It returns a JSON-encoded 2D float array (one row per input text).
|
||||
/// Calls `kreuzberg.embed_texts` with two JSON strings (the input texts and
/// the embedding configuration) and writes the returned buffer to stdout.
pub fn main() !void {
    // Embedding configuration, written as a JSON object.
    const settings_json =
        \\{
        \\ "model": {"type": "preset", "name": "balanced"},
        \\ "normalize": true
        \\}
    ;
    // Input texts, written as a JSON array of strings.
    const inputs_json =
        \\["Hello, world!", "Kreuzberg is fast"]
    ;

    const vectors_json = try kreuzberg.embed_texts(inputs_json, settings_json);
    // Release the returned buffer through the C allocator when `main` exits.
    defer std.heap.c_allocator.free(vectors_json);

    try std.io.getStdOut().writer().print("{s}\n", .{vectors_json});
}
|
||||
```
|
||||
21
docs/snippets/zig/utils/token_reduction.md
Normal file
21
docs/snippets/zig/utils/token_reduction.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Calls `kreuzberg.extract_file_sync` on "document.pdf" with a
/// `token_reduction` configuration and writes the returned buffer to stdout.
pub fn main() !void {
    // Configuration is handed over as a JSON document in a multiline string.
    const settings_json =
        \\{
        \\ "token_reduction": {
        \\ "mode": "moderate",
        \\ "preserve_important_words": true
        \\ }
        \\}
    ;

    // The second argument is left as `null`.
    const output_json = try kreuzberg.extract_file_sync("document.pdf", null, settings_json);
    // Release the returned buffer through the C allocator when `main` exits.
    defer std.heap.c_allocator.free(output_json);

    try std.io.getStdOut().writer().print("{s}\n", .{output_json});
}
|
||||
```
|
||||
21
docs/snippets/zig/utils/token_reduction_example.md
Normal file
21
docs/snippets/zig/utils/token_reduction_example.md
Normal file
@@ -0,0 +1,21 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
/// Calls `kreuzberg.extract_file_sync` on "verbose_document.pdf" with a
/// `token_reduction` configuration and writes the returned buffer to stdout.
pub fn main() !void {
    // Configuration is handed over as a JSON document in a multiline string.
    const settings_json =
        \\{
        \\ "token_reduction": {
        \\ "mode": "moderate",
        \\ "preserve_important_words": true
        \\ }
        \\}
    ;

    // The second argument is left as `null`.
    const output_json = try kreuzberg.extract_file_sync("verbose_document.pdf", null, settings_json);
    // Release the returned buffer through the C allocator when `main` exits.
    defer std.heap.c_allocator.free(output_json);

    try std.io.getStdOut().writer().print("{s}\n", .{output_json});
}
|
||||
```
|
||||
28
docs/snippets/zig/utils/vector_database_integration.md
Normal file
28
docs/snippets/zig/utils/vector_database_integration.md
Normal file
@@ -0,0 +1,28 @@
|
||||
```zig title="Zig"
|
||||
const std = @import("std");
|
||||
const kreuzberg = @import("kreuzberg");
|
||||
|
||||
// Configure chunking with embeddings — the resulting JSON has a `chunks`
|
||||
// array where each entry carries `content` and `embedding`. Insert those
|
||||
// into your vector store (Qdrant, pgvector, Pinecone, etc.) directly from
|
||||
// the parsed JSON.
|
||||
/// Calls `kreuzberg.extract_file_sync` on "document.pdf" with a chunking
/// configuration that nests an embedding preset, then writes the returned
/// buffer to stdout.
pub fn main() !void {
    // Configuration is handed over as a JSON document in a multiline string.
    const settings_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 512,
        \\ "overlap": 50,
        \\ "embedding": {
        \\ "preset": "balanced"
        \\ }
        \\ }
        \\}
    ;

    // The second argument is left as `null`.
    const output_json = try kreuzberg.extract_file_sync("document.pdf", null, settings_json);
    // Release the returned buffer through the C allocator when `main` exits.
    defer std.heap.c_allocator.free(output_json);

    try std.io.getStdOut().writer().print("{s}\n", .{output_json});
}
|
||||
```
|
||||
Reference in New Issue
Block a user