198 lines
6.5 KiB
Markdown
198 lines
6.5 KiB
Markdown
```zig title="Zig"
|
|
const std = @import("std");
|
|
const kreuzberg = @import("kreuzberg");
|
|
|
|
// Mirrors KreuzbergDocumentExtractorVTable from the C FFI.
|
|
const DocumentExtractorVTable = extern struct {
|
|
name_fn: ?*const fn (user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void,
|
|
version_fn: ?*const fn (user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void,
|
|
initialize_fn: ?*const fn (user_data: ?*anyopaque, out_error: ?*?[*c]u8) callconv(.C) i32,
|
|
shutdown_fn: ?*const fn (user_data: ?*anyopaque, out_error: ?*?[*c]u8) callconv(.C) i32,
|
|
extract_bytes: ?*const fn (
|
|
user_data: ?*anyopaque,
|
|
content: [*c]const u8,
|
|
content_len: usize,
|
|
mime_type: [*c]const u8,
|
|
config: [*c]const u8,
|
|
out_result: ?*?[*c]u8,
|
|
out_error: ?*?[*c]u8,
|
|
) callconv(.C) i32,
|
|
extract_file: ?*const fn (
|
|
user_data: ?*anyopaque,
|
|
path: [*c]const u8,
|
|
mime_type: [*c]const u8,
|
|
config: [*c]const u8,
|
|
out_result: ?*?[*c]u8,
|
|
out_error: ?*?[*c]u8,
|
|
) callconv(.C) i32,
|
|
supported_mime_types: ?*const fn (user_data: ?*anyopaque, out_result: ?*?[*c]u8) callconv(.C) i32,
|
|
priority: ?*const fn (user_data: ?*anyopaque) callconv(.C) i32,
|
|
can_handle: ?*const fn (
|
|
user_data: ?*anyopaque,
|
|
path: [*c]const u8,
|
|
mime_type: [*c]const u8,
|
|
) callconv(.C) i32,
|
|
as_sync_extractor: ?*const fn (user_data: ?*anyopaque) callconv(.C) i32,
|
|
free_user_data: ?*const fn (user_data: ?*anyopaque) callconv(.C) void,
|
|
};
|
|
|
|
// Simple state struct for the extractor instance.
|
|
const SimpleExtractorState = struct {
|
|
source_format: [:0]const u8,
|
|
supported_mimes: [:0]const u8,
|
|
};
|
|
|
|
extern "kreuzberg_ffi" fn kreuzberg_register_document_extractor(
|
|
name: [*c]const u8,
|
|
vtable: DocumentExtractorVTable,
|
|
user_data: ?*anyopaque,
|
|
out_error: ?*?[*c]u8,
|
|
) i32;
|
|
|
|
extern "kreuzberg_ffi" fn kreuzberg_free_string(ptr: [*c]u8) void;
|
|
|
|
// Callbacks for the custom extractor.
|
|
fn extract_bytes_impl(
|
|
user_data: ?*anyopaque,
|
|
content: [*c]const u8,
|
|
content_len: usize,
|
|
_: [*c]const u8,
|
|
_: [*c]const u8,
|
|
out_result: ?*?[*c]u8,
|
|
out_error: ?*?[*c]u8,
|
|
) callconv(.C) i32 {
|
|
const state: *SimpleExtractorState = @ptrCast(@alignCast(user_data));
|
|
_ = state;
|
|
|
|
// Minimal extraction: wrap content in JSON.
|
|
var arena = std.heap.ArenaAllocator.init(std.heap.c_allocator);
|
|
defer arena.deinit();
|
|
const allocator = arena.allocator();
|
|
|
|
const content_slice = content[0..content_len];
|
|
const result = std.fmt.allocPrint(
|
|
allocator,
|
|
"{{\"content\": \"{s}\", \"mime_type\": \"application/octet-stream\"}}",
|
|
.{content_slice},
|
|
) catch {
|
|
if (out_error) |ptr| {
|
|
const err = "OOM during extraction";
|
|
if (std.heap.c_allocator.allocSentinel(u8, err.len, 0)) |cstr| {
|
|
@memcpy(cstr[0..err.len], err);
|
|
ptr.* = cstr.ptr;
|
|
}
|
|
}
|
|
return 1;
|
|
};
|
|
|
|
const result_cstr = std.heap.c_allocator.allocSentinel(u8, result.len, 0) catch {
|
|
if (out_error) |ptr| {
|
|
const err = "OOM allocating result";
|
|
if (std.heap.c_allocator.allocSentinel(u8, err.len, 0)) |cstr| {
|
|
@memcpy(cstr[0..err.len], err);
|
|
ptr.* = cstr.ptr;
|
|
}
|
|
}
|
|
return 1;
|
|
};
|
|
@memcpy(result_cstr[0..result.len], result);
|
|
|
|
if (out_result) |ptr| ptr.* = result_cstr.ptr;
|
|
return 0;
|
|
}
|
|
|
|
fn supported_mimes_impl(user_data: ?*anyopaque, out_result: ?*?[*c]u8) callconv(.C) i32 {
|
|
const state: *SimpleExtractorState = @ptrCast(@alignCast(user_data));
|
|
const mimes = state.supported_mimes;
|
|
const mimes_cstr = std.heap.c_allocator.allocSentinel(u8, mimes.len, 0) catch return 1;
|
|
@memcpy(mimes_cstr[0..mimes.len], mimes);
|
|
if (out_result) |ptr| ptr.* = mimes_cstr.ptr;
|
|
return 0;
|
|
}
|
|
|
|
fn name_impl(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
|
|
_ = user_data;
|
|
const name = "zig-simple-extractor";
|
|
if (std.heap.c_allocator.allocSentinel(u8, name.len, 0)) |cstr| {
|
|
@memcpy(cstr[0..name.len], name);
|
|
if (out_name) |ptr| ptr.* = cstr.ptr;
|
|
}
|
|
}
|
|
|
|
fn version_impl(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
|
|
_ = user_data;
|
|
const version = "0.1.0";
|
|
if (std.heap.c_allocator.allocSentinel(u8, version.len, 0)) |cstr| {
|
|
@memcpy(cstr[0..version.len], version);
|
|
if (out_version) |ptr| ptr.* = cstr.ptr;
|
|
}
|
|
}
|
|
|
|
fn init_impl(user_data: ?*anyopaque, out_error: ?*?[*c]u8) callconv(.C) i32 {
|
|
_ = user_data;
|
|
_ = out_error;
|
|
return 0;
|
|
}
|
|
|
|
fn shutdown_impl(user_data: ?*anyopaque, out_error: ?*?[*c]u8) callconv(.C) i32 {
|
|
_ = user_data;
|
|
_ = out_error;
|
|
return 0;
|
|
}
|
|
|
|
fn priority_impl(user_data: ?*anyopaque) callconv(.C) i32 {
|
|
_ = user_data;
|
|
return 60; // Higher than default
|
|
}
|
|
|
|
fn cleanup_state(user_data: ?*anyopaque) callconv(.C) void {
|
|
const state: *SimpleExtractorState = @ptrCast(@alignCast(user_data));
|
|
std.heap.c_allocator.free(state.supported_mimes);
|
|
std.heap.c_allocator.destroy(state);
|
|
}
|
|
|
|
pub fn main() !void {
|
|
// Create extractor state on the heap.
|
|
const state = try std.heap.c_allocator.create(SimpleExtractorState);
|
|
state.source_format = try std.heap.c_allocator.dupeZ(u8, "custom");
|
|
state.supported_mimes = try std.heap.c_allocator.dupeZ(u8, "[\"application/octet-stream\"]");
|
|
|
|
var out_error: ?[*c]u8 = null;
|
|
defer if (out_error) |ptr| kreuzberg_free_string(ptr);
|
|
|
|
// Build and register the vtable.
|
|
const vtable = DocumentExtractorVTable{
|
|
.name_fn = name_impl,
|
|
.version_fn = version_impl,
|
|
.initialize_fn = init_impl,
|
|
.shutdown_fn = shutdown_impl,
|
|
.extract_bytes = extract_bytes_impl,
|
|
.extract_file = null,
|
|
.supported_mime_types = supported_mimes_impl,
|
|
.priority = priority_impl,
|
|
.can_handle = null,
|
|
.as_sync_extractor = null,
|
|
.free_user_data = cleanup_state,
|
|
};
|
|
|
|
const rc = kreuzberg_register_document_extractor(
|
|
"zig-simple-extractor",
|
|
vtable,
|
|
state,
|
|
&out_error,
|
|
);
|
|
|
|
const stdout = std.io.getStdOut().writer();
|
|
if (rc == 0) {
|
|
try stdout.print("Registered zig-simple-extractor with custom state\n", .{});
|
|
} else {
|
|
if (out_error) |err_ptr| {
|
|
const err_msg = std.mem.sliceTo(err_ptr, 0);
|
|
try stdout.print("Registration failed: {s}\n", .{err_msg});
|
|
} else {
|
|
try stdout.print("Registration failed (no error message)\n", .{});
|
|
}
|
|
}
|
|
}
|
|
```
|