Nomad changes
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s

This commit is contained in:
Henrik Jess Nielsen
2026-06-01 23:40:55 +02:00
parent 72b1a0a6ed
commit b4c07d3693
5723 changed files with 1130655 additions and 0 deletions

View File

@@ -0,0 +1,58 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Runs a synchronous extraction of `document.pdf` and prints the result.
///
/// The JSON config sets `use_cache` and `enable_quality_processing` and
/// includes the `ocr`, `chunking` (with a nested `embedding` object),
/// `language_detection`, `keywords`, `token_reduction` and `postprocessor`
/// sections.
///
/// The returned buffer is printed directly instead of being duplicated into a
/// second allocator first: it stays valid until the deferred free at the end
/// of `main`, so an extra copy only adds an allocation and a failure path.
pub fn main() !void {
    const config_json =
        \\{
        \\ "use_cache": true,
        \\ "enable_quality_processing": true,
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng"
        \\ },
        \\ "chunking": {
        \\ "max_characters": 1000,
        \\ "overlap": 200,
        \\ "embedding": {
        \\ "model": {"type": "preset", "name": "balanced"},
        \\ "batch_size": 32,
        \\ "normalize": true
        \\ }
        \\ },
        \\ "language_detection": {
        \\ "enabled": true,
        \\ "min_confidence": 0.8,
        \\ "detect_multiple": false
        \\ },
        \\ "keywords": {
        \\ "algorithm": "yake",
        \\ "max_keywords": 10,
        \\ "min_score": 0.1,
        \\ "ngram_range": [1, 3],
        \\ "language": "en"
        \\ },
        \\ "token_reduction": {
        \\ "mode": "moderate",
        \\ "preserve_important_words": true
        \\ },
        \\ "postprocessor": {
        \\ "enabled": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,30 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with a `chunking` config that selects the
/// `markdown` chunker type and sets `prepend_heading_context`, then prints
/// the result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1000,
        \\ "overlap": 200,
        \\ "chunker_type": "markdown",
        \\ "prepend_heading_context": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,26 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with `use_cache` and `enable_quality_processing`
/// both set to true, then prints the result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "use_cache": true,
        \\ "enable_quality_processing": true
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,25 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Loads `kreuzberg.json` from the current working directory (falling back to
/// an empty JSON object when the file does not exist), passes its contents as
/// the config for extracting `document.pdf`, and prints the result to stdout.
///
/// Any read error other than `error.FileNotFound` is returned to the caller.
pub fn main() !void {
    var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa_state.deinit();
    const alloc = gpa_state.allocator();

    // The Zig binding accepts JSON config strings. To use a discovered config
    // file, load it from disk into a string and pass it through unchanged.
    const config_json = std.fs.cwd().readFileAlloc(alloc, "kreuzberg.json", 1 << 20) catch |err| fallback: {
        // Only a missing file is tolerated; everything else propagates.
        if (err != error.FileNotFound) return err;
        // Heap-allocate the default so the deferred free below is valid on
        // both paths.
        break :fallback try alloc.dupe(u8, "{}");
    };
    defer alloc.free(config_json);

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,28 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `scanned.pdf` with an `ocr` config that names the `tesseract`
/// backend and the `eng` language, then prints the result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng"
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("scanned.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,39 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Builds the extraction config as a `std.json.Value` tree, serializes it to a
/// JSON string, extracts `document.pdf` with it, and prints the result to
/// standard output.
///
/// All JSON nodes and the serialized buffer live in an arena that is torn down
/// when `main` returns.
pub fn main() !void {
    var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa_state.deinit();

    // Build the config JSON programmatically with std.json.
    var arena_state = std.heap.ArenaAllocator.init(gpa_state.allocator());
    defer arena_state.deinit();
    const arena = arena_state.allocator();

    // Nested sections are assembled first; they are copied into the root
    // object only once they are complete.
    var ocr_section = std.json.ObjectMap.init(arena);
    try ocr_section.put("backend", .{ .string = "tesseract" });
    try ocr_section.put("language", .{ .string = "eng+deu" });

    var chunking_section = std.json.ObjectMap.init(arena);
    try chunking_section.put("max_characters", .{ .integer = 1000 });
    try chunking_section.put("overlap", .{ .integer = 200 });

    // Root keys are inserted in the same order as before so the serialized
    // JSON is unchanged.
    var config = std.json.ObjectMap.init(arena);
    try config.put("use_cache", .{ .bool = true });
    try config.put("enable_quality_processing", .{ .bool = true });
    try config.put("ocr", .{ .object = ocr_section });
    try config.put("chunking", .{ .object = chunking_section });

    var serialized = std.ArrayList(u8).init(arena);
    try std.json.stringify(std.json.Value{ .object = config }, .{}, serialized.writer());

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, serialized.items);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,25 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with `include_document_structure` set to true and
/// prints the result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "include_document_structure": true
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,25 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with `result_format` set to `element_based` and
/// prints the result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "result_format": "element_based"
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,34 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with a `chunking` config whose nested `embedding`
/// object selects the `balanced` preset model and sets `batch_size`,
/// `normalize` and `show_download_progress`, then prints the result to
/// standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "chunking": {
        \\ "max_characters": 1000,
        \\ "overlap": 200,
        \\ "embedding": {
        \\ "model": {"type": "preset", "name": "balanced"},
        \\ "batch_size": 16,
        \\ "normalize": true,
        \\ "show_download_progress": true
        \\ }
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,29 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with `output_format` set to `html` and an
/// `html_output` section (`theme`, `embed_css`), then prints the result to
/// standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "output_format": "html",
        \\ "html_output": {
        \\ "theme": "github",
        \\ "embed_css": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,31 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with a `keywords` config (`yake` algorithm,
/// `max_keywords`, `min_score`, `ngram_range`, `language`) and prints the
/// result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "keywords": {
        \\ "algorithm": "yake",
        \\ "max_keywords": 10,
        \\ "min_score": 0.1,
        \\ "ngram_range": [1, 3],
        \\ "language": "en"
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,29 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with a `language_detection` config (`enabled`,
/// `min_confidence`, `detect_multiple`) and prints the result to standard
/// output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "language_detection": {
        \\ "enabled": true,
        \\ "min_confidence": 0.8,
        \\ "detect_multiple": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,32 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with an `images` config (`extract_images`,
/// `target_dpi`, `max_image_dimension`, `auto_adjust_dpi`, `min_dpi`,
/// `max_dpi`) and prints the result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "images": {
        \\ "extract_images": true,
        \\ "target_dpi": 300,
        \\ "max_image_dimension": 4096,
        \\ "auto_adjust_dpi": true,
        \\ "min_dpi": 150,
        \\ "max_dpi": 600
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,30 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `encrypted.pdf` with a `pdf_options` config that supplies a
/// `passwords` list and sets `extract_images`, `extract_metadata` and
/// `extract_annotations`, then prints the result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "pdf_options": {
        \\ "extract_images": true,
        \\ "passwords": ["password123"],
        \\ "extract_metadata": true,
        \\ "extract_annotations": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("encrypted.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,32 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with a `pdf_options.hierarchy` config (`enabled`,
/// `k_clusters`, `include_bbox`, `ocr_coverage_threshold`) and prints the
/// result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "pdf_options": {
        \\ "hierarchy": {
        \\ "enabled": true,
        \\ "k_clusters": 6,
        \\ "include_bbox": true,
        \\ "ocr_coverage_threshold": 0.5
        \\ }
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,28 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with a `postprocessor` config that is enabled and
/// lists `whitespace_normalizer` and `unicode_normalizer` under
/// `enabled_processors`, then prints the result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "postprocessor": {
        \\ "enabled": true,
        \\ "enabled_processors": ["whitespace_normalizer", "unicode_normalizer"]
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,26 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with `enable_quality_processing` and `use_cache`
/// both set to true, then prints the result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "enable_quality_processing": true,
        \\ "use_cache": true
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,33 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `scanned.pdf` with an `ocr` config that names the `tesseract`
/// backend, the `eng+deu` language string, and a nested `tesseract_config`
/// (`language`, `psm`, `oem`), then prints the result to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng+deu",
        \\ "tesseract_config": {
        \\ "language": "eng+deu",
        \\ "psm": 6,
        \\ "oem": 3
        \\ }
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("scanned.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```

View File

@@ -0,0 +1,28 @@
```zig title="Zig"
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts `document.pdf` with a `token_reduction` config (`mode` set to
/// `moderate`, `preserve_important_words` set to true) and prints the result
/// to standard output.
///
/// The returned buffer is printed directly; it remains valid until the
/// deferred free at scope exit, so no intermediate copy or extra allocator is
/// needed.
pub fn main() !void {
    const config_json =
        \\{
        \\ "token_reduction": {
        \\ "mode": "moderate",
        \\ "preserve_important_words": true
        \\ }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The returned buffer is released with the C allocator once printing is done.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
```