211 lines
7.2 KiB
C#
211 lines
7.2 KiB
C#
|
|
// This file is auto-generated by alef — DO NOT EDIT.
|
||
|
|
// alef:hash:4e15143f4af1ae8bafbdb1506ef057da924484c66a19483966333558ad437e75
|
||
|
|
// To regenerate: alef generate
|
||
|
|
// To verify freshness: alef verify --exit-code
|
||
|
|
// Issues & docs: https://github.com/kreuzberg-dev/alef
|
||
|
|
#nullable enable
|
||
|
|
|
||
|
|
using System;
|
||
|
|
using System.Collections.Generic;
|
||
|
|
using System.Text.Json;
|
||
|
|
using System.Text.Json.Serialization;
|
||
|
|
|
||
|
|
namespace Kreuzberg;
|
||
|
|
|
||
|
|
/// <summary>
/// Per-file extraction configuration overrides for batch processing.
/// </summary>
/// <remarks>
/// <para>
/// Every property is nullable; a <c>null</c> value means "use the batch-level default."
/// This type is used with <c>batch_extract_files</c> and <c>batch_extract_bytes</c> to allow
/// heterogeneous extraction settings within a single batch.
/// </para>
/// <para>
/// The following <c>ExtractionConfig</c> fields are batch-level only and cannot be
/// overridden per file:
/// </para>
/// <list type="bullet">
/// <item><description><c>max_concurrent_extractions</c> — controls batch parallelism</description></item>
/// <item><description><c>use_cache</c> — global caching policy</description></item>
/// <item><description><c>acceleration</c> — shared ONNX execution provider</description></item>
/// <item><description><c>security_limits</c> — global archive security policy</description></item>
/// </list>
/// </remarks>
public sealed record FileExtractionConfig
{
    /// <summary>
    /// Override quality post-processing for this file.
    /// </summary>
    [JsonPropertyName("enable_quality_processing")]
    public bool? EnableQualityProcessing { get; init; }

    /// <summary>
    /// Override OCR configuration for this file (<c>null</c> = use batch default).
    /// </summary>
    [JsonPropertyName("ocr")]
    public OcrConfig? Ocr { get; init; }

    /// <summary>
    /// Override force OCR for this file.
    /// </summary>
    [JsonPropertyName("force_ocr")]
    public bool? ForceOcr { get; init; }

    /// <summary>
    /// Override force OCR pages for this file (1-indexed page numbers).
    /// </summary>
    [JsonPropertyName("force_ocr_pages")]
    public List<uint>? ForceOcrPages { get; init; }

    /// <summary>
    /// Override disable OCR for this file.
    /// </summary>
    [JsonPropertyName("disable_ocr")]
    public bool? DisableOcr { get; init; }

    /// <summary>
    /// Override chunking configuration for this file.
    /// </summary>
    [JsonPropertyName("chunking")]
    public ChunkingConfig? Chunking { get; init; }

    /// <summary>
    /// Override content filtering configuration for this file.
    /// </summary>
    [JsonPropertyName("content_filter")]
    public ContentFilterConfig? ContentFilter { get; init; }

    /// <summary>
    /// Override image extraction configuration for this file.
    /// </summary>
    [JsonPropertyName("images")]
    public ImageExtractionConfig? Images { get; init; }

    /// <summary>
    /// Override PDF options for this file.
    /// </summary>
    [JsonPropertyName("pdf_options")]
    public PdfConfig? PdfOptions { get; init; }

    /// <summary>
    /// Override token reduction for this file.
    /// </summary>
    [JsonPropertyName("token_reduction")]
    public TokenReductionOptions? TokenReduction { get; init; }

    /// <summary>
    /// Override language detection for this file.
    /// </summary>
    [JsonPropertyName("language_detection")]
    public LanguageDetectionConfig? LanguageDetection { get; init; }

    /// <summary>
    /// Override page extraction for this file.
    /// </summary>
    [JsonPropertyName("pages")]
    public PageConfig? Pages { get; init; }

    /// <summary>
    /// Override keyword extraction for this file.
    /// </summary>
    [JsonPropertyName("keywords")]
    public KeywordConfig? Keywords { get; init; }

    /// <summary>
    /// Override post-processor for this file.
    /// </summary>
    [JsonPropertyName("postprocessor")]
    public PostProcessorConfig? Postprocessor { get; init; }

    /// <summary>
    /// Override HTML conversion options for this file.
    /// </summary>
    [JsonPropertyName("html_options")]
    public string? HtmlOptions { get; init; }

    /// <summary>
    /// Override result format for this file.
    /// </summary>
    [JsonPropertyName("result_format")]
    public ResultFormat? ResultFormat { get; init; }

    /// <summary>
    /// Override output content format for this file.
    /// </summary>
    [JsonPropertyName("output_format")]
    public OutputFormat? OutputFormat { get; init; }

    /// <summary>
    /// Override document structure output for this file.
    /// </summary>
    [JsonPropertyName("include_document_structure")]
    public bool? IncludeDocumentStructure { get; init; }

    /// <summary>
    /// Override layout detection for this file.
    /// </summary>
    [JsonPropertyName("layout")]
    public LayoutDetectionConfig? Layout { get; init; }

    /// <summary>
    /// Override per-file extraction timeout in seconds.
    /// </summary>
    /// <remarks>
    /// When set, the extraction for this file will be canceled after the
    /// specified duration. A timed-out file produces an error result without
    /// affecting other files in the batch.
    /// </remarks>
    [JsonPropertyName("timeout_secs")]
    public ulong? TimeoutSecs { get; init; }

    /// <summary>
    /// Override tree-sitter configuration for this file.
    /// </summary>
    [JsonPropertyName("tree_sitter")]
    public TreeSitterConfig? TreeSitter { get; init; }

    /// <summary>
    /// Override structured extraction configuration for this file.
    /// </summary>
    /// <remarks>
    /// When set, enables LLM-based structured extraction with a JSON schema
    /// for this specific file. The extracted content is sent to a VLM/LLM
    /// and the response is parsed according to the provided schema.
    /// </remarks>
    [JsonPropertyName("structured_extraction")]
    public StructuredExtractionConfig? StructuredExtraction { get; init; }

    /// <summary>
    /// Parse a <see cref="FileExtractionConfig"/> from a JSON string.
    /// </summary>
    /// <param name="json">JSON text whose keys match the <c>JsonPropertyName</c> values declared on this record.</param>
    /// <returns>The deserialised configuration; never <c>null</c>.</returns>
    /// <exception cref="KreuzbergException">
    /// When the JSON cannot be deserialised, or when the deserialiser yields <c>null</c>.
    /// Any other exception raised during deserialisation is wrapped and attached as the inner exception.
    /// </exception>
    public static FileExtractionConfig FromJson(string json)
    {
        try
        {
            return JsonSerializer.Deserialize<FileExtractionConfig>(json, JsonOptions)
                ?? throw new KreuzbergException("Failed to parse FileExtractionConfig from JSON: deserializer returned null");
        }
        // The filter lets a KreuzbergException (e.g. the null-result case above) propagate
        // untouched; only foreign exceptions are wrapped.
        catch (Exception e) when (e is not KreuzbergException)
        {
            throw new KreuzbergException($"Failed to parse FileExtractionConfig from JSON: {e.Message}", e);
        }
    }

    /// <summary>
    /// Serializer options used by <see cref="FromJson"/>. Enum values are converted
    /// to and from snake_case strings.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };

    /// <summary>
    /// Options for serializing config/input objects to FFI. Strips nulls
    /// (nullable C# fields default to null and would override required Rust fields with
    /// non-deserialisable nulls) but preserves explicit false/0 so caller intent is kept.
    /// </summary>
    /// <remarks>
    /// NOTE(review): this field is not referenced by any member of this record in the
    /// visible source — confirm whether the generator still needs to emit it.
    /// </remarks>
    private static readonly JsonSerializerOptions JsonSerializationOptions = new()
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
        Converters = { new JsonStringEnumConverter(JsonNamingPolicy.SnakeCaseLower) },
    };
}
|