Files
fil/docs/snippets/go/plugins/pdf_only_processor.md
Henrik Jess Nielsen b4c07d3693
All checks were successful
Deploy fil (kreuzberg) / deploy (push) Successful in 49s
Nomad changes
2026-06-01 23:40:55 +02:00

3.3 KiB

package main

import (
	"encoding/json"
	"log"
	"unsafe"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)

/*
#cgo CFLAGS: -I${SRCDIR}/../../../crates/kreuzberg-ffi
#cgo LDFLAGS: -L${SRCDIR}/../../../target/release -L${SRCDIR}/../../../target/debug -lkreuzberg_ffi
#include "../../../crates/kreuzberg-ffi/kreuzberg.h"
#include <stdlib.h>
*/
import "C"

// pdfOnlyProcessor applies PDF-specific processing logic only to PDF documents
//export pdfOnlyProcessor
func pdfOnlyProcessor(resultJSON *C.char) *C.char {
	jsonStr := C.GoString(resultJSON)
	var result map[string]interface{}

	if err := json.Unmarshal([]byte(jsonStr), &result); err != nil {
		return C.CString("{\"error\":\"Failed to parse result JSON\"}")
	}

	// Check MIME type - only process PDFs
	mimeType, ok := result["mime_type"].(string)
	if !ok || mimeType != "application/pdf" {
		// Return unchanged for non-PDF documents
		outputJSON, err := json.Marshal(result)
		if err != nil {
			return C.CString("{\"error\":\"Failed to serialize result\"}")
		}
		return C.CString(string(outputJSON))
	}

	// Perform PDF-specific processing
	metadata, ok := result["metadata"].(map[string]interface{})
	if !ok {
		metadata = make(map[string]interface{})
	}

	// Example PDF-specific processing:
	// - Extract tables as structured data
	// - Handle PDF-specific formatting
	// - Preserve document hierarchy

	metadata["pdf_specific_processing"] = true
	metadata["processor_type"] = "pdf_only"

	// Check for tables in PDF
	if tablesJSON, ok := result["tables_json"].(string); ok && tablesJSON != "" {
		var tables []interface{}
		if err := json.Unmarshal([]byte(tablesJSON), &tables); err == nil {
			metadata["table_count"] = len(tables)
		}
	}

	result["metadata"] = metadata

	// Serialize back to JSON
	outputJSON, err := json.Marshal(result)
	if err != nil {
		return C.CString("{\"error\":\"Failed to serialize result\"}")
	}

	return C.CString(string(outputJSON))
}

// main registers pdfOnlyProcessor as a post-processor, extracts a fixed list
// of sample files, and logs for each one whether the PDF-specific processing
// was applied.
func main() {
	// Register the post-processor with priority 70
	if err := kreuzberg.RegisterPostProcessor("pdf_only_processor", 70,
		(C.PostProcessorCallback)(C.pdfOnlyProcessor)); err != nil {
		log.Fatalf("failed to register post-processor: %v", err)
	}
	// Unregister when main returns; a failure here only warrants a warning.
	defer func() {
		if err := kreuzberg.UnregisterPostProcessor("pdf_only_processor"); err != nil {
			log.Printf("warning: failed to unregister post-processor: %v", err)
		}
	}()

	// Process multiple documents - processor will only affect PDFs
	files := []string{
		"document.pdf",
		"image.jpg",
		"spreadsheet.xlsx",
	}

	for _, file := range files {
		result, err := kreuzberg.ExtractFileSync(file, nil)
		if err != nil {
			log.Printf("Warning: extraction failed for %s: %v", file, err)
			continue
		}

		// Parse metadata to check if PDF processing occurred. Every failure
		// path logs something so no file is dropped from the output silently.
		metaJSON, ok := result.MetadataJSON.(string)
		if !ok {
			log.Printf("Warning: no metadata JSON available for %s", file)
			continue
		}

		var metadata map[string]interface{}
		if err := json.Unmarshal([]byte(metaJSON), &metadata); err != nil {
			log.Printf("Warning: failed to parse metadata for %s: %v", file, err)
			continue
		}

		// A missing or non-boolean marker is treated the same as false.
		if applied, _ := metadata["pdf_specific_processing"].(bool); !applied {
			log.Printf("Skipped PDF processor for: %s (MIME: %s)", file, result.MimeType)
			continue
		}

		log.Printf("PDF-specific processing applied to: %s", file)
		// Numbers decoded into interface{} arrive as float64, hence the
		// assertion type and the %.0f verb.
		if tableCount, ok := metadata["table_count"].(float64); ok {
			log.Printf("  Tables found: %.0f", tableCount)
		}
	}
}