Files
fil/docs/snippets/zig/plugins/plugin_extractor.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

6.5 KiB

const std = @import("std");
const kreuzberg = @import("kreuzberg");

// Zig-side mirror of KreuzbergDocumentExtractorVTable from the C FFI.
//
// The struct is `extern`, so the fields are laid out in declaration order.
// Callback signatures that occur more than once are spelled out a single
// time as aliases below the field list; the aliases are declarations, not
// fields, so they do not take part in the layout.
//
// NOTE(review): every out-parameter here is `?*?[*c]u8`. In Zig, `?[*c]u8`
// is an optional wrapped around a C pointer and is not guaranteed to be
// pointer-sized; if the C header declares these parameters as `char **`,
// the pointee type should probably be a plain `[*c]u8` — confirm against
// the header before relying on this mirror.
const DocumentExtractorVTable = extern struct {
    name_fn: StringGetterFn,
    version_fn: StringGetterFn,
    initialize_fn: LifecycleFn,
    shutdown_fn: LifecycleFn,
    extract_bytes: ExtractBytesFn,
    extract_file: ExtractFileFn,
    supported_mime_types: MimeListFn,
    priority: IntQueryFn,
    can_handle: CanHandleFn,
    as_sync_extractor: IntQueryFn,
    free_user_data: FreeUserDataFn,

    // Out-parameter through which a callback hands back a C string.
    const OutString = ?*?[*c]u8;

    // name_fn / version_fn: produce a string, no status code.
    const StringGetterFn = ?*const fn (user_data: ?*anyopaque, out_value: OutString) callconv(.C) void;

    // initialize_fn / shutdown_fn: status code plus optional error text.
    const LifecycleFn = ?*const fn (user_data: ?*anyopaque, out_error: OutString) callconv(.C) i32;

    // extract_bytes: extraction from an in-memory buffer.
    const ExtractBytesFn = ?*const fn (
        user_data: ?*anyopaque,
        content: [*c]const u8,
        content_len: usize,
        mime_type: [*c]const u8,
        config: [*c]const u8,
        out_result: OutString,
        out_error: OutString,
    ) callconv(.C) i32;

    // extract_file: extraction from a path.
    const ExtractFileFn = ?*const fn (
        user_data: ?*anyopaque,
        path: [*c]const u8,
        mime_type: [*c]const u8,
        config: [*c]const u8,
        out_result: OutString,
        out_error: OutString,
    ) callconv(.C) i32;

    // supported_mime_types: status code plus a string result.
    const MimeListFn = ?*const fn (user_data: ?*anyopaque, out_result: OutString) callconv(.C) i32;

    // priority / as_sync_extractor: integer answer derived from user_data only.
    const IntQueryFn = ?*const fn (user_data: ?*anyopaque) callconv(.C) i32;

    // can_handle: integer answer for a path / MIME type pair.
    const CanHandleFn = ?*const fn (
        user_data: ?*anyopaque,
        path: [*c]const u8,
        mime_type: [*c]const u8,
    ) callconv(.C) i32;

    // free_user_data: releases whatever user_data points at.
    const FreeUserDataFn = ?*const fn (user_data: ?*anyopaque) callconv(.C) void;
};

// Per-instance state for the example extractor. `main` heap-allocates one
// instance and passes it to the registration call as `user_data`; the
// callbacks recover it from that opaque pointer.
const SimpleExtractorState = struct {
    // NUL-terminated label; `main` sets it to "custom". None of the callbacks
    // in this file read it.
    source_format: [:0]const u8,
    // NUL-terminated JSON array of MIME types; `supported_mimes_impl` hands a
    // copy of it back to the caller.
    supported_mimes: [:0]const u8,
};

// Registers a document extractor with the kreuzberg FFI library.
//
//   name      - NUL-terminated extractor name.
//   vtable    - callback table describing the extractor.
//   user_data - opaque pointer handed back to every callback.
//   out_error - receives an error string; `main` reads it when the return
//               value is non-zero and releases it with kreuzberg_free_string.
//
// `main` treats a return value of 0 as success and anything else as failure.
//
// NOTE(review): the vtable is passed by value here, and `out_error` points at
// a Zig optional (`?[*c]u8`) rather than a bare C pointer — confirm both
// against the C header.
extern "kreuzberg_ffi" fn kreuzberg_register_document_extractor(
    name: [*c]const u8,
    vtable: DocumentExtractorVTable,
    user_data: ?*anyopaque,
    out_error: ?*?[*c]u8,
) i32;

// Releases a string obtained from the kreuzberg FFI library; `main` uses it
// for the error message produced by the registration call.
extern "kreuzberg_ffi" fn kreuzberg_free_string(ptr: [*c]u8) void;

// Callbacks for the custom extractor.

// Returns the JSON string-literal encoding of a single byte.
//
// `"` and `\` are backslash-escaped, newline / carriage return / tab use
// their short escapes, every other control byte below 0x20 becomes \u00XX,
// and all remaining bytes are passed through unchanged. `scratch` backs the
// returned slice whenever the encoding is not a string literal, so the result
// is only valid until `scratch` is reused.
fn json_escape_byte(byte: u8, scratch: *[6]u8) []const u8 {
    const hex_digits = "0123456789abcdef";
    switch (byte) {
        '"' => return "\\\"",
        '\\' => return "\\\\",
        '\n' => return "\\n",
        '\r' => return "\\r",
        '\t' => return "\\t",
        0x00...0x08, 0x0b, 0x0c, 0x0e...0x1f => {
            scratch.* = .{ '\\', 'u', '0', '0', hex_digits[byte >> 4], hex_digits[byte & 0x0f] };
            return scratch;
        },
        else => {
            scratch[0] = byte;
            return scratch[0..1];
        },
    }
}

// Best-effort error reporting for extract_bytes_impl: stores a C-allocated,
// NUL-terminated copy of `message` in the caller's error slot. Does nothing
// when no slot was supplied or when the copy itself cannot be allocated; the
// numeric return code of the callback still signals the failure.
fn report_extract_error(out_error: ?*?[*c]u8, message: []const u8) void {
    const raw_slot = out_error orelse return;
    const copy = std.heap.c_allocator.allocSentinel(u8, message.len, 0) catch return;
    @memcpy(copy[0..message.len], message);
    // The caller is C code, so the slot is a bare `char *`. Store exactly one
    // pointer; assigning through `*?[*c]u8` would also write the optional's
    // flag byte past the end of that slot.
    const slot: *[*c]u8 = @ptrCast(raw_slot);
    slot.* = copy.ptr;
}

// extract_bytes callback: wraps the input bytes in a small JSON document
//
//     {"content": "<escaped bytes>", "mime_type": "application/octet-stream"}
//
// and returns it as a NUL-terminated string allocated with the C allocator.
//
//   user_data   - unused; this callback needs no extractor state.
//   content     - input bytes; a null pointer is treated as empty input.
//   content_len - number of bytes readable at `content`.
//   (4th, 5th)  - MIME type and config strings, ignored.
//   out_result  - receives the JSON string on success.
//   out_error   - receives an error string on failure (best effort).
//
// Returns 0 on success and 1 when the result could not be allocated. When
// `out_result` is null there is nowhere to deliver a result, so the callback
// returns 0 without allocating anything.
//
// The content is escaped so that quotes, backslashes and control bytes cannot
// break out of the JSON string. Bytes >= 0x80 are copied verbatim, so the
// output is valid JSON only if the input is valid UTF-8.
// NOTE(review): the returned strings come from malloc; presumably the host
// releases them with free() — confirm.
fn extract_bytes_impl(
    user_data: ?*anyopaque,
    content: [*c]const u8,
    content_len: usize,
    _: [*c]const u8,
    _: [*c]const u8,
    out_result: ?*?[*c]u8,
    out_error: ?*?[*c]u8,
) callconv(.C) i32 {
    _ = user_data;

    const raw_result_slot = out_result orelse return 0;

    const payload: []const u8 = if (content != null) content[0..content_len] else "";

    const prefix = "{\"content\": \"";
    const suffix = "\", \"mime_type\": \"application/octet-stream\"}";

    // First pass: size the output so it can be allocated exactly once.
    var scratch: [6]u8 = undefined;
    var total: usize = prefix.len + suffix.len;
    for (payload) |byte| total += json_escape_byte(byte, &scratch).len;

    const buffer = std.heap.c_allocator.allocSentinel(u8, total, 0) catch {
        report_extract_error(out_error, "OOM allocating result");
        return 1;
    };

    // Second pass: prefix, escaped payload, suffix.
    @memcpy(buffer[0..prefix.len], prefix);
    var cursor: usize = prefix.len;
    for (payload) |byte| {
        const piece = json_escape_byte(byte, &scratch);
        @memcpy(buffer[cursor..][0..piece.len], piece);
        cursor += piece.len;
    }
    @memcpy(buffer[cursor..][0..suffix.len], suffix);

    // Pointer-sized store, for the same reason as in report_extract_error.
    const result_slot: *[*c]u8 = @ptrCast(raw_result_slot);
    result_slot.* = buffer.ptr;
    return 0;
}

// supported_mime_types callback: returns a C-allocated, NUL-terminated copy
// of the state's `supported_mimes` string through `out_result`.
//
//   user_data  - the SimpleExtractorState registered alongside the vtable.
//   out_result - receives the copy; when null, nothing is allocated.
//
// Returns 0 on success, 1 when `user_data` is null or the copy cannot be
// allocated.
// NOTE(review): the copy comes from malloc; presumably the host releases it
// with free() — confirm.
fn supported_mimes_impl(user_data: ?*anyopaque, out_result: ?*?[*c]u8) callconv(.C) i32 {
    const raw_state = user_data orelse return 1;
    const state: *SimpleExtractorState = @ptrCast(@alignCast(raw_state));

    // Without a destination the copy could never be freed, so skip it.
    const raw_slot = out_result orelse return 0;

    const copy = std.heap.c_allocator.dupeZ(u8, state.supported_mimes) catch return 1;

    // The caller is C code, so the slot is a bare `char *`. Store exactly one
    // pointer; assigning through `*?[*c]u8` would also write the optional's
    // flag byte past the end of that slot.
    const slot: *[*c]u8 = @ptrCast(raw_slot);
    slot.* = copy.ptr;
    return 0;
}

// name_fn callback: returns the extractor name "zig-simple-extractor" as a
// C-allocated, NUL-terminated string through `out_name`.
//
// The callback has no return value, so failures cannot be reported: when
// `out_name` is null or the allocation fails, the function returns without
// touching the slot.
// NOTE(review): the string comes from malloc; presumably the host releases it
// with free() — confirm.
fn name_impl(user_data: ?*anyopaque, out_name: ?*?[*c]u8) callconv(.C) void {
    _ = user_data;

    // Check the destination first so a missing slot does not leak the copy.
    const raw_slot = out_name orelse return;
    const copy = std.heap.c_allocator.dupeZ(u8, "zig-simple-extractor") catch return;

    // The caller is C code, so the slot is a bare `char *`. Store exactly one
    // pointer; assigning through `*?[*c]u8` would also write the optional's
    // flag byte past the end of that slot.
    const slot: *[*c]u8 = @ptrCast(raw_slot);
    slot.* = copy.ptr;
}

// version_fn callback: returns the extractor version "0.1.0" as a
// C-allocated, NUL-terminated string through `out_version`.
//
// The callback has no return value, so failures cannot be reported: when
// `out_version` is null or the allocation fails, the function returns without
// touching the slot.
// NOTE(review): the string comes from malloc; presumably the host releases it
// with free() — confirm.
fn version_impl(user_data: ?*anyopaque, out_version: ?*?[*c]u8) callconv(.C) void {
    _ = user_data;

    // Check the destination first so a missing slot does not leak the copy.
    const raw_slot = out_version orelse return;
    const copy = std.heap.c_allocator.dupeZ(u8, "0.1.0") catch return;

    // The caller is C code, so the slot is a bare `char *`. Store exactly one
    // pointer; assigning through `*?[*c]u8` would also write the optional's
    // flag byte past the end of that slot.
    const slot: *[*c]u8 = @ptrCast(raw_slot);
    slot.* = copy.ptr;
}

// initialize_fn callback: this extractor needs no setup, so both arguments
// are ignored and 0 is returned unconditionally.
fn init_impl(_: ?*anyopaque, _: ?*?[*c]u8) callconv(.C) i32 {
    return 0;
}

// shutdown_fn callback: there is nothing to tear down here, so both arguments
// are ignored and 0 is returned unconditionally.
fn shutdown_impl(_: ?*anyopaque, _: ?*?[*c]u8) callconv(.C) i32 {
    return 0;
}

// priority callback: reports a fixed priority of 60 and ignores its argument.
// The original note describes 60 as higher than the default; the default
// itself is not visible in this file.
fn priority_impl(_: ?*anyopaque) callconv(.C) i32 {
    const fixed_priority: i32 = 60;
    return fixed_priority;
}

// free_user_data callback: releases the SimpleExtractorState that `main`
// allocated, i.e. both of its C-allocated strings and then the struct itself.
// A null `user_data` is ignored.
fn cleanup_state(user_data: ?*anyopaque) callconv(.C) void {
    const raw_state = user_data orelse return;
    const state: *SimpleExtractorState = @ptrCast(@alignCast(raw_state));

    // Both strings are created with dupeZ in `main`; freeing only one of them
    // would leak the other.
    std.heap.c_allocator.free(state.source_format);
    std.heap.c_allocator.free(state.supported_mimes);
    std.heap.c_allocator.destroy(state);
}

// Entry point: builds the extractor state, registers the extractor with the
// kreuzberg FFI library and prints the outcome to stdout.
//
// Errors: returns an allocation error if the state cannot be built, or a
// write error if printing to stdout fails. A failed registration is reported
// on stdout and is not turned into an error.
pub fn main() !void {
    const allocator = std.heap.c_allocator;

    // Build the state inside its own block so the errdefers only guard the
    // allocation phase. Once the pointer has been handed to the registration
    // call, a later failure (e.g. writing to stdout) must not free it here.
    const state = blk: {
        const new_state = try allocator.create(SimpleExtractorState);
        errdefer allocator.destroy(new_state);

        const source_format = try allocator.dupeZ(u8, "custom");
        errdefer allocator.free(source_format);

        const supported_mimes = try allocator.dupeZ(u8, "[\"application/octet-stream\"]");

        new_state.* = .{
            .source_format = source_format,
            .supported_mimes = supported_mimes,
        };
        break :blk new_state;
    };

    // The library is C code and stores a bare `char *` through `out_error`.
    // Keeping the slot as a plain C pointer (instead of `?[*c]u8`, whose
    // null flag the library never sets) lets the message be seen and freed.
    var out_error: [*c]u8 = null;
    defer if (out_error != null) kreuzberg_free_string(out_error);

    // Build and register the vtable. Callbacks this example does not provide
    // are left null.
    const vtable = DocumentExtractorVTable{
        .name_fn = name_impl,
        .version_fn = version_impl,
        .initialize_fn = init_impl,
        .shutdown_fn = shutdown_impl,
        .extract_bytes = extract_bytes_impl,
        .extract_file = null,
        .supported_mime_types = supported_mimes_impl,
        .priority = priority_impl,
        .can_handle = null,
        .as_sync_extractor = null,
        .free_user_data = cleanup_state,
    };

    // NOTE(review): `state` is presumably owned by the library from here on
    // and released through `free_user_data`; whether that also holds when
    // registration fails is not visible in this file — confirm.
    const rc = kreuzberg_register_document_extractor(
        "zig-simple-extractor",
        vtable,
        state,
        @ptrCast(&out_error),
    );

    const stdout = std.io.getStdOut().writer();
    if (rc == 0) {
        try stdout.print("Registered zig-simple-extractor with custom state\n", .{});
        return;
    }

    if (out_error != null) {
        const err_msg = std.mem.sliceTo(out_error, 0);
        try stdout.print("Registration failed: {s}\n", .{err_msg});
    } else {
        try stdout.print("Registration failed (no error message)\n", .{});
    }
}